Exemplo n.º 1
0
def get_server_state(server, host, pingtime=3, verbose=False):
    """Return the state of the server.
    
    This method returns one of the following states based on the
    criteria shown.
    
      UP   - server is connected
      WARN - server is not connected but can be pinged
      DOWN - server cannot be pinged nor is connected

    server[in]     Server class instance
    host[in]       host name to ping if server is not connected
    pingtime[in]   timeout in seconds for ping operation
                   Default = 3 seconds
    verbose[in]    if True, show ping status messages
                   Default = False

    Returns string - state
    """
    from mysql.utilities.common.tools import ping_host

    if verbose:
        print "# Attempting to contact %s ..." % host,
    if server is not None and server.is_alive():
        if verbose:
            print "Success"
        return "UP"
    elif ping_host(host, pingtime):
        if verbose:
            print "Server is reachable"
        return "WARN"
    if verbose:
        print "FAIL"
    return "DOWN"
Exemplo n.º 2
0
    def run_auto_failover(self, console, interval):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]    instance of the failover console class
        interval[in]   time in seconds to wait to check status of servers

        Returns bool - True = success, raises exception on error
        """
        import time
        from mysql.utilities.common.tools import ping_host
        from mysql.utilities.common.tools import execute_script

        failover_mode = self.options.get("failover_mode", "auto")
        pingtime = self.options.get("pingtime", 3)
        timeout = int(self.options.get("timeout", 300))
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], 'failover'),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print "# WARNING: %s" % _HOST_IP_WARNING
            self._report(_HOST_IP_WARNING, logging.WARN, False)
            print "#\n# Failover console will start in 10 seconds."
            time.sleep(10)

        # Test failover script. If it doesn't exist, fail.
        no_exec_fail_msg = "Failover check script cannot be found. Please " + \
                           "check the path and filename for accuracy and " + \
                           "restart the failover console."
        if exec_fail is not None and not os.path.exists(exec_fail):
            self._report(no_exec_fail_msg, logging.CRITICAL, False)
            raise UtilRplError(no_exec_fail_msg)

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                pass
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.exists(exec_fail):
                    self._report(no_exec_fail_msg, logging.CRITICAL, False)
                    raise UtilRplError(no_exec_fail_msg)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None, [old_host, old_port],
                                         self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report(
                            "# Failover check script failed. "
                            "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        self._report("Cannot reconnect to master.",
                                     logging.INFO, False)

                # Check the master again. If no connection or lost connection,
                # try ping and if still not alive, failover. This performs the
                # timeout threshold for detecting a down master.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % \
                          "Master has failed and automatic failover is not enabled. "
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % "An error was encountered " + \
                          "during failover. "
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(
                    post_fail, False,
                    [old_host, old_port, self.master.host, self.master.port])

            # discover slaves if option was specified at startup
            elif self.options.get("discover", None) is not None and \
                (not first_pass or self.options.get("rediscover", False)):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            res = console.display_console()
            if res is not None:  # None = normal timeout, keep going
                if not res:
                    return False  # Errors detected
                done = True  # User has quit
            first_pass = False

        return True
Exemplo n.º 3
0
    def run(self):
        """Run automatic failover.

        This method implements the automatic failover facility. It uses the
        existing failover() method of the RplCommands class (via self.rpl) to
        conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        Instance attributes used: self.mode (failover mode), self.rpl
        (RplCommands instance), self.options, self.interval (seconds to wait
        between status checks), self.pingtime, self.master, and
        self.report_values.

        Returns bool - True = success, raises exception on error
        """
        failover_mode = self.mode
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get("pedantic", False)

        # Only works for GTID_MODE=ON
        if not self.rpl.topology.gtid_enabled():
            msg = ("Topology must support global transaction ids and have "
                   "GTID_MODE=ON.")
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.rpl.topology.check_master_info_type("TABLE"):
            msg = ("Failover requires --master-info-repository=TABLE for "
                   "all slaves.")
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self.rpl.check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover daemon will start in 10 seconds.")
            time.sleep(10)

        # Test failover script. If it doesn't exist, fail.
        no_exec_fail_msg = ("Failover check script cannot be found. Please "
                            "check the path and filename for accuracy and "
                            "restart the failover daemon.")
        if exec_fail is not None and not os.path.exists(exec_fail):
            self._report(no_exec_fail_msg, logging.CRITICAL, False)
            raise UtilRplError(no_exec_fail_msg)

        # Check existence of errant transactions on slaves
        errant_tnx = self.rpl.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                msg = ("{0} Note: If you want to ignore this issue, please do "
                       "not use the --pedantic option."
                       "".format(_ERRANT_TNX_ERROR))
                self._report(msg, logging.CRITICAL)
                raise UtilRplError(msg)

        self._report("Failover daemon started.", logging.INFO, False)
        self._report("Failover mode = {0}.".format(failover_mode),
                     logging.INFO, False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False

        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.rpl.master.host
                old_port = self.rpl.master.port
            except:
                # Master reference is unusable; report placeholders instead.
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.exists(exec_fail):
                    self._report(no_exec_fail_msg, logging.CRITICAL, False)
                    raise UtilRplError(no_exec_fail_msg)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    # Exit status 0 means "master OK"; any other value
                    # triggers failover.
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port],
                                         self.rpl.verbose)
                    if res == 0:
                        self._report("# Failover check script completed "
                                     "Ok. Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.rpl.topology.master is not None and \
                   not self.rpl.topology.master.is_alive():
                    msg = ("Master may be down. Waiting for {0} seconds."
                           "".format(pingtime))
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.rpl.topology.master.connect()
                    except:
                        # Best-effort reconnect; the decisive check follows.
                        pass

                # Check the master again. If no connection or lost connection,
                # try ping. This performs the timeout threshold for detecting
                # a down master. If still not alive, try to reconnect and if
                # connection fails after 3 attempts, failover.
                if self.rpl.topology.master is None or \
                   not ping_host(self.rpl.topology.master.host, pingtime) or \
                   not self.rpl.topology.master.is_alive():
                    failover = True
                    # NOTE(review): uses self.pingtime here although a local
                    # 'pingtime' (from options) is in scope above -- confirm
                    # the two are intended to be the same value.
                    if self._reconnect_master(self.pingtime):
                        failover = False  # Master is now connected again
                    if failover:
                        self._report("Failed to reconnect to the master after "
                                     "3 attempts.", logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or "
                             "unreachable.", logging.CRITICAL, False)
                try:
                    self.rpl.topology.master.disconnect()
                except:
                    # Best-effort disconnect of a dead master; ignore errors.
                    pass

                if failover_mode == "auto":
                    # Try candidates first, then any slave.
                    self._report("Failover starting in 'auto' mode...")
                    res = self.rpl.topology.failover(self.rpl.candidates,
                                                     False)
                elif failover_mode == "elect":
                    # Candidates only; no fallback to other slaves.
                    self._report("Failover starting in 'elect' mode...")
                    res = self.rpl.topology.failover(self.rpl.candidates, True)
                else:
                    # Mode 'fail': report, run post-fail script, and abort.
                    msg = _FAILOVER_ERROR.format("Master has failed and "
                                                 "automatic failover is "
                                                 "not enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    try:
                        self.rpl.topology.run_script(post_fail, False,
                                                     [old_host, old_port])
                    except Exception as err:  # pylint: disable=W0703
                        self._report("# Post fail script failed! {0}"
                                     "".format(err), level=logging.ERROR)
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR.format("An error was encountered "
                                                 "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    try:
                        self.rpl.topology.run_script(post_fail, False,
                                                     [old_host, old_port])
                    except Exception as err:  # pylint: disable=W0703
                        self._report("# Post fail script failed! {0}"
                                     "".format(err), level=logging.ERROR)
                    raise UtilRplError(msg)
                # Adopt the newly-elected master and rebuild the slave list.
                self.rpl.master = self.rpl.topology.master
                self.master = self.rpl.master
                self.rpl.topology.remove_discovered_slaves()
                self.rpl.topology.discover_slaves()
                self.list_data = None
                print("\nFailover daemon will restart in 5 seconds.")
                time.sleep(5)
                failover = False
                # Execute post failover script
                try:
                    self.rpl.topology.run_script(post_fail, False,
                                                 [old_host, old_port,
                                                  self.rpl.master.host,
                                                  self.rpl.master.port])
                except Exception as err:  # pylint: disable=W0703
                    self._report("# Post fail script failed! {0}"
                                 "".format(err), level=logging.ERROR)

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                self.unregister_slaves(self.rpl.topology)

                # Register instance on the new master
                msg = ("Registering instance on new master "
                       "{0}:{1}.").format(self.master.host, self.master.port)
                self._report(msg, logging.INFO, False)

                failover_mode = self.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None and
                  not first_pass):
                # Force refresh of health list if new slaves found
                if self.rpl.topology.discover_slaves():
                    self.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.rpl.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)

                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    # Non-pedantic: record a warning and keep running.
                    if self.rpl.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    self.add_warning("errant_tnx", warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                self.del_warning("errant_tnx")

            if self.master and self.master.is_alive():
                # Log status
                self._print_warnings()
                self._log_master_status()

                self.list_data = []
                if "health" in self.report_values:
                    (health_labels, health_data) = self._format_health_data()
                    if health_data:
                        self._log_data("Health Status:", health_labels,
                                       health_data)
                if "gtid" in self.report_values:
                    (gtid_labels, gtid_data) = self._format_gtid_data()
                    for i, v in enumerate(gtid_data):
                        if v:
                            self._log_data("GTID Status - {0}"
                                           "".format(_GTID_LISTS[i]),
                                           gtid_labels, v)
                if "uuid" in self.report_values:
                    (uuid_labels, uuid_data) = self._format_uuid_data()
                    if uuid_data:
                        self._log_data("UUID Status:", uuid_labels, uuid_data)

            # Disconnect the master while waiting for the interval to expire
            # NOTE(review): this call is not guarded by the 'if self.master'
            # check above -- if self.master is ever None here it would raise
            # AttributeError; confirm self.master is always set by callers.
            self.master.disconnect()

            # Wait for the interval to expire
            time.sleep(self.interval)

            # Reconnect to the master
            self._reconnect_master(self.pingtime)

            first_pass = False

        return True
Exemplo n.º 4
0
    def run_auto_failover(self, console, failover_mode="auto"):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]    instance of the failover console class.

        Returns bool - True = success, raises exception on error
        """
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get('pedantic', False)
        fail_retry = self.options.get('fail_retry', None)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover console will start in {0} seconds.".format(
                WARNING_SLEEP_TIME))
            time.sleep(WARNING_SLEEP_TIME)

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the --pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.isfile(exec_fail):
                    message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format(
                        path=exec_fail)
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                elif not os.access(exec_fail, os.X_OK):
                    message = INSUFFICIENT_FILE_PERMISSIONS.format(
                        path=exec_fail, permissions='execute')
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port], self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # If user specified a master fail retry, wait for the
                # predetermined time and attempt to check the master again.
                if fail_retry is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master is still not reachable. Waiting for %s " \
                          "seconds to retry detection." % fail_retry
                    self._report(msg, logging.INFO, False)
                    time.sleep(fail_retry)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost connection,
                # try ping. This performs the timeout threshold for detecting
                # a down master. If still not alive, try to reconnect and if
                # connection fails after 3 attempts, failover.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
                    i = 0
                    while i < 3:
                        try:
                            self.topology.master.connect()
                            failover = False  # Master is now connected again
                            break
                        except:
                            pass
                        time.sleep(pingtime)
                        i += 1

                    if failover:
                        self._report("Failed to reconnect to the master after "
                                     "3 attemps.", logging.INFO)
                    else:
                        self._report("Master is Ok. Resuming watch.",
                                     logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and automatic "
                                             "failover is not enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.master.host, self.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                console.unregister_slaves(self.topology)

                # Register instance on the new master
                self._report("Registering instance on master.", logging.INFO,
                             False)
                failover_mode = console.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                  and not first_pass):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)

                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    if self.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    console.add_warning('errant_tnx', warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                console.del_warning('errant_tnx')

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        return True
Exemplo n.º 5
0
    def auto_failover(self, interval):
        """Automatic failover
        
        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.
        
        When the master goes down, the method can perform one of three actions:
        
        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail
            
        interval[in]   time in seconds to wait to check status of servers
        
        Returns bool - True = success, raises exception on error
        """
        import time
        from mysql.utilities.command.failover_console import FailoverConsole
        from mysql.utilities.common.tools import ping_host
        from mysql.utilities.common.tools import execute_script
        
        failover_mode = self.options.get("failover_mode", "auto")
        pingtime = self.options.get("pingtime", 3)
        timeout = self.options.get("timeout", 3)
        exec_fail = self.options.get("exec_fail", None)
        force = self.options.get("force", False)
        post_fail = self.options.get("post_fail", None)
                
        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], command),
                                    logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        # Test failover script. If it doesn't exist, fail.
        no_exec_fail_msg = "Failover check script cannot be found. Please " + \
                           "check the path and filename for accuracy and " + \
                           "restart the failover console."
        if exec_fail is not None and not os.path.exists(fail_check):
            self._report(no_exec_fail_msg, logging.CRITICAL, False)
            raise UtilRplError(no_exec_fail_msg)
               
        # Initialize a console
        console = FailoverConsole(self.topology.master, self.topology.get_health,
                                  self.topology.get_gtid_data,
                                  self.topology.get_server_uuids,
                                  self.options)
        
        # Register instance
        self._report("Registering instance on master.", logging.INFO, False)
        old_mode = failover_mode
        failover_mode = console.register_instance(force)
        if failover_mode != old_mode:
            self._report("Multiple instances of failover console found for "
                         "master %s:%s." % (self.topology.master.host,
                                            self.topology.master.port),
                         logging.WARN)
            print "Failover mode changed to 'FAIL'. Console will start in 5 seconds."
            time.sleep(5)
        
        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO, False)
        
        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.exists(exec_fail):
                    self._report(no_exec_fail_msg, logging.CRITICAL, False)
                    raise UtilRplError(no_exec_fail_msg)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(script)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for timeout seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          timeout
                    self._report(msg, logging.INFO, False)
                    time.sleep(timeout)
                    try:
                        self.topology.master.connect()
                    except:
                        self._report("Cannot reconnect to master.",
                                     logging.INFO, False)
                        
                # Check the master again. If no connection or lost connection, 
                # try ping and if still not alive, failover. This performs the
                # timeout threshold for detecting a down master.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
            
            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % \
                          "Master has failed and automatic failover is not enabled. "
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False)
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % "An error was encountered " + \
                          "during failover. "
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False)
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False)

            # discover slaves if option was specified at startup
            elif self.options.get("discover", None) is not None \
                and not first_pass:
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        # Unregister instance
        self._report("Unregistering instance on master.", logging.INFO, False)
        console.register_instance(False, False)
        self._report("Failover console stopped.", logging.INFO, False)

        return True
    def run(self):
        """Run automatic failover as a daemon.

        This method implements the automatic failover facility using the
        existing failover() method of the RplCommands class (via self.rpl)
        to conduct failover.

        When the master goes down, the method can perform one of three
        actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        Configuration is read from self.options; self.rpl holds the
        RplCommands instance, self.interval/self.pingtime are presumably set
        by the constructor (defined elsewhere -- confirm).

        Returns bool - True = success, raises exception on error
        """
        failover_mode = self.mode
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get("pedantic", False)

        # Only works for GTID_MODE=ON
        if not self.rpl.topology.gtid_enabled():
            msg = ("Topology must support global transaction ids and have "
                   "GTID_MODE=ON.")
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.rpl.topology.check_master_info_type("TABLE"):
            msg = ("Failover requires --master-info-repository=TABLE for "
                   "all slaves.")
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self.rpl.check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover daemon will start in 10 seconds.")
            time.sleep(10)

        # Test failover script. If it doesn't exist, fail.
        no_exec_fail_msg = ("Failover check script cannot be found. Please "
                            "check the path and filename for accuracy and "
                            "restart the failover daemon.")
        if exec_fail is not None and not os.path.exists(exec_fail):
            self._report(no_exec_fail_msg, logging.CRITICAL, False)
            raise UtilRplError(no_exec_fail_msg)

        # Check existence of errant transactions on slaves
        errant_tnx = self.rpl.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                msg = ("{0} Note: If you want to ignore this issue, please do "
                       "not use the --pedantic option."
                       "".format(_ERRANT_TNX_ERROR))
                self._report(msg, logging.CRITICAL)
                raise UtilRplError(msg)

        self._report("Failover daemon started.", logging.INFO, False)
        self._report("Failover mode = {0}.".format(failover_mode),
                     logging.INFO, False)

        # Main loop - loop and fire on interval.
        # NOTE(review): 'done' is never set inside this loop, so the daemon
        # runs until an exception is raised -- confirm that is intentional.
        done = False
        first_pass = True
        failover = False

        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.rpl.master.host
                old_port = self.rpl.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.exists(exec_fail):
                    self._report(no_exec_fail_msg, logging.CRITICAL, False)
                    raise UtilRplError(no_exec_fail_msg)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    # Non-zero exit status from the check script triggers
                    # failover.
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port],
                                         self.rpl.verbose)
                    if res == 0:
                        self._report("# Failover check script completed "
                                     "Ok. Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.rpl.topology.master is not None and \
                   not self.rpl.topology.master.is_alive():
                    msg = ("Master may be down. Waiting for {0} seconds."
                           "".format(pingtime))
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.rpl.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost connection,
                # try ping. This performs the timeout threshold for detecting
                # a down master. If still not alive, try to reconnect and if
                # connection fails after 3 attempts, failover.
                if self.rpl.topology.master is None or \
                   not ping_host(self.rpl.topology.master.host, pingtime) or \
                   not self.rpl.topology.master.is_alive():
                    failover = True
                    # NOTE(review): uses self.pingtime here rather than the
                    # local 'pingtime' option read above -- confirm intended.
                    if self._reconnect_master(self.pingtime):
                        failover = False  # Master is now connected again
                    if failover:
                        self._report("Failed to reconnect to the master after "
                                     "3 attemps.", logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or "
                             "unreachable.", logging.CRITICAL, False)
                try:
                    self.rpl.topology.master.disconnect()
                except:
                    pass

                if failover_mode == "auto":
                    # 'auto': try candidates first, then any slave
                    self._report("Failover starting in 'auto' mode...")
                    res = self.rpl.topology.failover(self.rpl.candidates,
                                                     False)
                elif failover_mode == "elect":
                    # 'elect': only the listed candidates may be promoted
                    self._report("Failover starting in 'elect' mode...")
                    res = self.rpl.topology.failover(self.rpl.candidates, True)
                else:
                    msg = _FAILOVER_ERROR.format("Master has failed and "
                                                 "automatic failover is "
                                                 "not enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.rpl.topology.run_script(post_fail, False,
                                                 [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR.format("An error was encountered "
                                                 "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.rpl.topology.run_script(post_fail, False,
                                                 [old_host, old_port])
                    raise UtilRplError(msg)
                # Failover succeeded: adopt the new master and rediscover the
                # slave topology; clearing list_data forces a health refresh.
                self.rpl.master = self.rpl.topology.master
                self.master = self.rpl.master
                self.rpl.topology.remove_discovered_slaves()
                self.rpl.topology.discover_slaves()
                self.list_data = None
                print("\nFailover daemon will restart in 5 seconds.")
                time.sleep(5)
                failover = False
                # Execute post failover script (old and new master endpoints)
                self.rpl.topology.run_script(post_fail, False,
                                             [old_host, old_port,
                                              self.rpl.master.host,
                                              self.rpl.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                self.unregister_slaves(self.rpl.topology)

                # Register instance on the new master
                msg = ("Registering instance on new master "
                       "{0}:{1}.").format(self.master.host, self.master.port)
                self._report(msg, logging.INFO, False)

                # Registration may demote this instance's failover mode if
                # another daemon already owns the new master.
                failover_mode = self.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                  and not first_pass):
                # Force refresh of health list if new slaves found
                if self.rpl.topology.discover_slaves():
                    self.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.rpl.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    # Pedantic mode: report each errant transaction and stop.
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)

                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    # Non-pedantic: record a warning and keep running.
                    if self.rpl.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    self.add_warning("errant_tnx", warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                self.del_warning("errant_tnx")

            if self.master and self.master.is_alive():
                # Log status
                self._print_warnings()
                self._log_master_status()

                # self.report_values presumably selects which status sections
                # to log -- set by the constructor (defined elsewhere).
                self.list_data = []
                if "health" in self.report_values:
                    (health_labels, health_data) = self._format_health_data()
                    if health_data:
                        self._log_data("Health Status:", health_labels,
                                       health_data)
                if "gtid" in self.report_values:
                    (gtid_labels, gtid_data) = self._format_gtid_data()
                    for i, v in enumerate(gtid_data):
                        if v:
                            self._log_data("GTID Status - {0}"
                                           "".format(_GTID_LISTS[i]),
                                           gtid_labels, v)
                if "uuid" in self.report_values:
                    (uuid_labels, uuid_data) = self._format_uuid_data()
                    if uuid_data:
                        self._log_data("UUID Status:", uuid_labels, uuid_data)

            # Disconnect the master while waiting for the interval to expire
            self.master.disconnect()

            # Wait for the interval to expire
            time.sleep(self.interval)

            # Reconnect to the master
            self._reconnect_master(self.pingtime)

            first_pass = False

        return True
Exemplo n.º 7
0
    def run_auto_failover(self, console, failover_mode="auto"):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]    instance of the failover console class.

        Returns bool - True = success, raises exception on error
        """
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get('pedantic', False)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover console will start in {0} seconds.".format(
                WARNING_SLEEP_TIME))
            time.sleep(WARNING_SLEEP_TIME)

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the --pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.isfile(exec_fail):
                    message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format(
                        path=exec_fail)
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                elif not os.access(exec_fail, os.X_OK):
                    message = INSUFFICIENT_FILE_PERMISSIONS.format(
                        path=exec_fail, permissions='execute')
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port], self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost connection,
                # try ping. This performs the timeout threshold for detecting
                # a down master. If still not alive, try to reconnect and if
                # connection fails after 3 attempts, failover.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
                    i = 0
                    while i < 3:
                        try:
                            self.topology.master.connect()
                            failover = False  # Master is now connected again
                            break
                        except:
                            pass
                        time.sleep(pingtime)
                        i += 1

                    if failover:
                        self._report("Failed to reconnect to the master after "
                                     "3 attemps.", logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and automatic "
                                             "failover is not enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.master.host, self.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                console.unregister_slaves(self.topology)

                # Register instance on the new master
                self._report("Registering instance on master.", logging.INFO,
                             False)
                failover_mode = console.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                  and not first_pass):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)

                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    if self.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    console.add_warning('errant_tnx', warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                console.del_warning('errant_tnx')

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        return True