def _check_candidate_switch(group_id, slave_id):
    """Check if the candidate has all the features to become the new
    master.

    Runs a sequence of validations and raises on the first failure:
    the group must have a master, the candidate must not already be
    the master, it must be able to act as a master and be a healthy
    slave, it must replicate from the group's current master, and its
    status must be SECONDARY or SPARE. On success, triggers the
    BLOCK_WRITE_SWITCH event within the current procedure.

    :param group_id: Group's id.
    :param slave_id: Candidate slave's id.
    :raises GroupError: If the group has no master or the candidate
                        replicates from a different master.
    :raises ServerError: If the candidate fails any server-level check.
    """
    # Only healthy secondaries and spares may be promoted.
    allowed_status = (_server.MySQLServer.SECONDARY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)
    if not group.master:
        raise _errors.GroupError(
            "Group (%s) does not contain a valid "
            "master. Please, run a promote or failover." % (group_id, )
        )
    slave = _retrieve_server(slave_id, group_id)
    slave.connect()
    # Switching the master onto itself makes no sense.
    if group.master == slave.uuid:
        raise _errors.ServerError(
            "Candidate slave (%s) is already master." % (slave_id, )
        )
    # NOTE(review): check_master_issues() is used here as a single truthy
    # value — confirm the installed _replication API does not return a
    # (flag, reasons) pair as other variants of this function assume.
    master_issues = _replication.check_master_issues(slave)
    if master_issues:
        raise _errors.ServerError(
            "Server (%s) is not a valid candidate slave "
            "due to the following reason(s): (%s)." %
            (slave.uuid, master_issues)
        )
    slave_issues = _replication.check_slave_issues(slave)
    if slave_issues:
        raise _errors.ServerError(
            "Server (%s) is not a valid candidate slave "
            "due to the following reason: (%s)." %
            (slave.uuid, slave_issues)
        )
    # The candidate must be replicating from the group's current master.
    master_uuid = _replication.slave_has_master(slave)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        raise _errors.GroupError(
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid)
        )
    if slave.status not in allowed_status:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))
    _events.trigger_within_procedure(
        BLOCK_WRITE_SWITCH, group_id, master_uuid, str(slave.uuid)
    )
def _check_candidate_switch(group_id, slave_id):
    """Check if the candidate has all the features to become the new
    master.

    Validates the candidate step by step and raises an error describing
    the first check that fails; on success triggers the
    BLOCK_WRITE_SWITCH event within the current procedure.
    """
    group = _server.Group.fetch(group_id)

    # A switchover needs a current master to hand over from.
    if not group.master:
        raise _errors.GroupError("Group (%s) does not contain a valid "
            "master. Please, run a promote or failover." % (group_id, ))

    candidate = _retrieve_server(slave_id, group_id)
    candidate.connect()

    # Switching the master onto itself makes no sense.
    if group.master == candidate.uuid:
        raise _errors.ServerError("Candidate slave (%s) is already master." %
                                  (slave_id, ))

    # The candidate must fulfil the prerequisites for acting as a master.
    has_master_issues, master_reasons = \
        _replication.check_master_issues(candidate)
    if has_master_issues:
        raise _errors.ServerError("Server (%s) is not a valid candidate slave "
                                  "due to the following reason(s): (%s)." %
                                  (candidate.uuid, master_reasons))

    # Its replication threads must be healthy as well.
    has_slave_issues, slave_reasons = \
        _replication.check_slave_issues(candidate)
    if has_slave_issues:
        raise _errors.ServerError("Server (%s) is not a valid candidate slave "
                                  "due to the following reason: (%s)." %
                                  (candidate.uuid, slave_reasons))

    # The candidate must be replicating from the group's current master.
    master_uuid = _replication.slave_has_master(candidate)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        raise _errors.GroupError(
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid))

    # Only healthy secondaries and spares may be promoted.
    if candidate.status not in (_server.MySQLServer.SECONDARY,
                                _server.MySQLServer.SPARE):
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(BLOCK_WRITE_SWITCH, group_id,
                                     master_uuid, str(candidate.uuid))
def _do_wait_slaves_catch(group_id, master, skip_servers=None):
    """Synchronize slaves with master.

    Each slave in the group that is not listed in `skip_servers` and
    that replicates from `master` is synchronized with it. Slaves that
    cannot be reached or that replicate from a different master are
    skipped with a debug log entry.

    :param group_id: Group's id.
    :param master: Master server the slaves must catch up with.
    :param skip_servers: Optional list of server uuids (as strings)
                         that must not be synchronized.
    """
    # Work on a copy: the previous code appended the master's uuid to
    # the caller's list object, silently mutating the argument.
    skip_servers = list(skip_servers) if skip_servers else []
    skip_servers.append(str(master.uuid))
    group = _server.Group.fetch(group_id)
    for server in group.servers():
        if str(server.uuid) not in skip_servers:
            try:
                server.connect()
                # Only synchronize slaves that actually replicate from
                # the given master.
                used_master_uuid = _replication.slave_has_master(server)
                if str(master.uuid) == used_master_uuid:
                    _utils.synchronize(server, master)
                else:
                    _LOGGER.debug(
                        "Slave (%s) has a different master "
                        "from group (%s).", server.uuid, group_id)
            except _errors.DatabaseError as error:
                # Best-effort: an unreachable slave is skipped, not fatal.
                _LOGGER.debug("Error synchronizing slave (%s): %s.",
                              server.uuid, error)
def _do_wait_slaves_catch(group_id, master, skip_servers=None):
    """Synchronize slaves with master.

    Each slave in the group that is not listed in `skip_servers` and
    that replicates from `master` is synchronized with it. Slaves that
    cannot be reached or that replicate from a different master are
    skipped with a debug log entry.

    :param group_id: Group's id.
    :param master: Master server the slaves must catch up with.
    :param skip_servers: Optional list of server uuids (as strings)
                         that must not be synchronized.
    """
    # NOTE(review): when a list is passed in, the master's uuid is
    # appended to the caller's list object — confirm callers do not
    # reuse it afterwards.
    skip_servers = skip_servers or []
    skip_servers.append(str(master.uuid))
    group = _server.Group.fetch(group_id)
    for server in group.servers():
        if str(server.uuid) not in skip_servers:
            try:
                server.connect()
                # Only synchronize slaves that actually replicate from
                # the given master.
                used_master_uuid = _replication.slave_has_master(server)
                if str(master.uuid) == used_master_uuid:
                    _utils.synchronize(server, master)
                else:
                    _LOGGER.debug("Slave (%s) has a different master "
                                  "from group (%s).", server.uuid, group_id)
            except _errors.DatabaseError as error:
                # Best-effort: an unreachable slave is skipped; the
                # traceback is attached via exc_info.
                _LOGGER.debug(
                    "Error synchronizing slave (%s).", server.uuid,
                    exc_info=error
                )
def _health(group_id):
    """Check which servers in a group are up and down.

    :param group_id: Group's id.
    :return: Dict mapping each server uuid (string) to a dict with keys
             "is_alive", "status" and "threads".
    :raises GroupError: If the group does not exist.
    """
    availability = {}
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))
    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        # NOTE(review): "threads" ends up as either an empty dict, a
        # message string, or the slave_issues value — consumers must
        # handle mixed types; consider normalizing.
        thread_issues = {}
        status = server.status
        try:
            server.connect()
            alive = True
            if not is_master:
                slave_issues = _replication.check_slave_issues(server)
                str_master_uuid = _replication.slave_has_master(server)
                # Report a mismatch only when replication itself looks
                # healthy; otherwise report the replication issues.
                if (group.master is None or str(group.master) != \
                    str_master_uuid) and not slave_issues:
                    thread_issues = \
                        "Group has master (%s) but server is connected " \
                        "to master (%s)." % \
                        (group.master, str_master_uuid)
                elif slave_issues:
                    thread_issues = slave_issues
        except _errors.DatabaseError:
            # Unreachable servers are reported as faulty.
            status = _server.MySQLServer.FAULTY
        availability[str(server.uuid)] = {
            "is_alive" : alive,
            "status" : status,
            "threads" : thread_issues
        }
    return availability
def execute(self, group_id):
    """Check if any server within a group has failed.

    :param group_id: Group's id.
    :return: CommandResult with two result sets: per-server health
             information (including executed GTIDs) and a list of
             group-level issues.
    :raises GroupError: If the group does not exist.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))
    info = ResultSet(
        names=[
            'uuid', 'is_alive', 'status',
            'is_not_running', 'is_not_configured', 'io_not_running',
            'sql_not_running', 'io_error', 'sql_error', 'gtid_executed'
        ],
        types=[str, bool, str] + [bool] * 4 + [str, str, str]
    )
    issues = ResultSet(names=['issue'], types=[str])
    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        status = server.status
        # Defaults reported when the server is not contactable.
        # (The previous version assigned an empty dict first and then
        # immediately overwrote it — the dead store is removed.)
        why_slave_issues = {
            'is_not_running': False,
            'is_not_configured': False,
            'io_not_running': False,
            'sql_not_running': False,
            'io_error': False,
            'sql_error': False,
        }
        try:
            # TODO: CHECK WHETHER WE SHOULD USE IS_ALIVE OR NOT.
            if server.is_alive:
                server.connect()
                alive = True
                if not is_master:
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(server)
                    str_master_uuid = _replication.slave_has_master(server)
                    # Flag a master mismatch only when replication itself
                    # looks healthy.
                    if (group.master is None or str(group.master) != \
                        str_master_uuid) and not slave_issues:
                        issues.append_row([
                            "Group has master (%s) but server is connected " \
                            "to master (%s)." % \
                            (group.master, str_master_uuid)
                        ])
                gtid_executed = server.get_gtid_status()[0].GTID_EXECUTED
            else:
                status = _server.MySQLServer.FAULTY
                gtid_executed = "UNKNOWN"
        except _errors.DatabaseError:
            status = _server.MySQLServer.FAULTY
            gtid_executed = "UNKNOWN"
        info.append_row([
            server.uuid, alive, status,
            why_slave_issues['is_not_running'],
            why_slave_issues['is_not_configured'],
            why_slave_issues['io_not_running'],
            why_slave_issues['sql_not_running'],
            why_slave_issues['io_error'],
            why_slave_issues['sql_error'],
            # Collapse the (possibly multi-line) GTID set onto one line.
            ' '.join(gtid_executed.splitlines()),
        ])
    return CommandResult(None, results=[info, issues])
def _do_find_candidate(group_id, event):
    """Find the best candidate in a group that may be used to replace the
    current master if there is any.

    It chooses the slave that has processed more transactions and may
    become a master, e.g. has the binary log enabled. This function does
    not consider purged transactions and delays in the slave while
    picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :return: Return the uuid of the best candidate to become a master
             in the group.
    :raises GroupError: If no valid candidate can be found.
    """
    # Servers in these states are never considered for promotion.
    forbidden_status = (_server.MySQLServer.FAULTY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)
    master_uuid = None
    if group.master:
        master_uuid = str(group.master)
    chosen_uuid = None
    chosen_gtid_status = None
    for candidate in group.servers():
        if master_uuid != str(candidate.uuid) and \
            candidate.status not in forbidden_status:
            try:
                candidate.connect()
                gtid_status = candidate.get_gtid_status()
                master_issues, why_master_issues = \
                    _replication.check_master_issues(candidate)
                # Slave-side checks only matter for a planned switchover;
                # during a failover the old master may already be gone.
                slave_issues = False
                why_slave_issues = {}
                if event == FIND_CANDIDATE_SWITCH:
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(candidate)
                # The candidate must replicate from the current master,
                # unless there is no master at all.
                has_valid_master = (master_uuid is None or \
                    _replication.slave_has_master(candidate) == master_uuid)
                can_become_master = False
                if chosen_gtid_status:
                    # Keep the candidate only if it is not behind the best
                    # candidate found so far.
                    n_trans = 0
                    try:
                        n_trans = _replication.get_slave_num_gtid_behind(
                            candidate, chosen_gtid_status)
                    except _errors.InvalidGtidError:
                        pass
                    if n_trans == 0 and not master_issues and \
                        has_valid_master and not slave_issues:
                        chosen_gtid_status = gtid_status
                        chosen_uuid = str(candidate.uuid)
                        can_become_master = True
                elif not master_issues and has_valid_master and \
                    not slave_issues:
                    # First eligible candidate becomes the baseline.
                    chosen_gtid_status = gtid_status
                    chosen_uuid = str(candidate.uuid)
                    can_become_master = True
                if not can_become_master:
                    _LOGGER.warning(
                        "Candidate (%s) cannot become a master due to the "
                        "following reasons: issues to become a "
                        "master (%s), prerequistes as a slave (%s), valid "
                        "master (%s).",
                        candidate.uuid, why_master_issues, why_slave_issues,
                        has_valid_master)
            except _errors.DatabaseError as error:
                # Unreachable candidates are skipped, not fatal.
                _LOGGER.warning("Error accessing candidate (%s): %s.",
                                candidate.uuid, error)
    if not chosen_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, ))
    return chosen_uuid
def test_switchover_with_no_master(self):
    """Ensure that a switchover/failover happens when masters in the
    shard and global groups are dead.
    """
    # Check that a shard group has its master pointing to the master
    # in the global group.
    global_group = Group.fetch("GROUPID1")
    shard_group = Group.fetch("GROUPID2")
    other_shard_group = Group.fetch("GROUPID3")
    global_master = fetch_test_server(global_group.master)
    global_master.connect()
    shard_master = fetch_test_server(shard_group.master)
    shard_master.connect()
    other_shard_master = fetch_test_server(other_shard_group.master)
    other_shard_master.connect()
    self.assertEqual(_replication.slave_has_master(shard_master),
                     str(global_group.master))
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     str(global_group.master))

    # Demote the master in the global group and check that a
    # shard group points to None.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.proxy.group.demote("GROUPID1")
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     None)

    # Demote the master in a shard group and promote the master
    # in the global group.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)
    self.proxy.group.demote("GROUPID2")
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, None)
    self.proxy.group.promote("GROUPID1", str(global_master.uuid))
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     str(global_group.master))

    # Promote the master in the previous shard group and check that
    # everything is back to normal.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, None)
    self.proxy.group.promote("GROUPID2", str(shard_master.uuid))
    self.assertEqual(_replication.slave_has_master(shard_master),
                     str(global_group.master))
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     str(global_group.master))
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)

    # Demote the master in the global group, check that a shard group
    # points to None, promote it again and check that everything is back
    # to normal.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)
    self.proxy.group.demote("GROUPID1")
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.proxy.group.promote("GROUPID1", str(global_master.uuid))
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(_replication.slave_has_master(shard_master),
                     str(global_group.master))
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     str(global_group.master))
def test_switchover_with_no_master(self):
    """Ensure that a switchover/failover happens when masters in the
    shard and global groups are dead.
    """
    # Check that a shard group has its master pointing to the master
    # in the global group.
    global_group = Group.fetch("GROUPID1")
    shard_group = Group.fetch("GROUPID2")
    other_shard_group = Group.fetch("GROUPID3")
    global_master = MySQLServer.fetch(global_group.master)
    global_master.connect()
    shard_master = MySQLServer.fetch(shard_group.master)
    shard_master.connect()
    other_shard_master = MySQLServer.fetch(other_shard_group.master)
    other_shard_master.connect()
    self.assertEqual(
        _replication.slave_has_master(shard_master),
        str(global_group.master)
    )
    self.assertEqual(
        _replication.slave_has_master(other_shard_master),
        str(global_group.master)
    )

    # Demote the master in the global group and check that a
    # shard group points to None.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.proxy.group.demote("GROUPID1")
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.assertEqual(
        _replication.slave_has_master(other_shard_master), None
    )

    # Demote the master in a shard group and promote the master
    # in the global group.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)
    self.proxy.group.demote("GROUPID2")
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, None)
    self.proxy.group.promote("GROUPID1", str(global_master.uuid))
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.assertEqual(
        _replication.slave_has_master(other_shard_master),
        str(global_group.master)
    )

    # Promote the master in the previous shard group and check that
    # everything is back to normal.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, None)
    self.proxy.group.promote("GROUPID2", str(shard_master.uuid))
    self.assertEqual(
        _replication.slave_has_master(shard_master),
        str(global_group.master)
    )
    self.assertEqual(
        _replication.slave_has_master(other_shard_master),
        str(global_group.master)
    )
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)

    # Demote the master in the global group, check that a shard group
    # points to None, promote it again and check that everything is back
    # to normal.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)
    self.proxy.group.demote("GROUPID1")
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.proxy.group.promote("GROUPID1", str(global_master.uuid))
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(
        _replication.slave_has_master(shard_master),
        str(global_group.master)
    )
    self.assertEqual(
        _replication.slave_has_master(other_shard_master),
        str(global_group.master)
    )
def _do_find_candidate(group_id, event):
    """Find the best candidate in a group that may be used to replace the
    current master if there is any.

    It chooses the slave that has processed more transactions and may
    become a master, e.g. has the binary log enabled. This function does
    not consider purged transactions and delays in the slave while
    picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :return: Return the uuid of the best candidate to become a master
             in the group.
    :raises GroupError: If no valid candidate can be found.
    """
    # Servers in these states are never considered for promotion.
    forbidden_status = (_server.MySQLServer.FAULTY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)
    master_uuid = None
    if group.master:
        master_uuid = str(group.master)
    chosen_uuid = None
    chosen_gtid_status = None
    for candidate in group.servers():
        if master_uuid != str(candidate.uuid) and \
            candidate.status not in forbidden_status:
            try:
                candidate.connect()
                gtid_status = candidate.get_gtid_status()
                # NOTE(review): check_master_issues()/check_slave_issues()
                # are used here as single truthy values — confirm the
                # installed _replication API does not return a
                # (flag, reasons) pair as other variants assume.
                master_issues = \
                    _replication.check_master_issues(candidate)
                # Slave-side checks only matter for a planned switchover;
                # during a failover the old master may already be gone.
                if event == FIND_CANDIDATE_SWITCH:
                    slave_issues = \
                        _replication.check_slave_issues(candidate)
                else:
                    slave_issues = {}
                # The candidate must replicate from the current master,
                # unless there is no master at all.
                has_valid_master = (master_uuid is None or \
                    _replication.slave_has_master(candidate) == master_uuid)
                can_become_master = False
                if chosen_gtid_status:
                    # Keep the candidate only if it is not behind the
                    # best candidate found so far.
                    n_trans = 0
                    try:
                        n_trans = _replication.get_slave_num_gtid_behind(
                            candidate, chosen_gtid_status
                        )
                    except _errors.InvalidGtidError:
                        pass
                    if n_trans == 0 and not master_issues and \
                        has_valid_master and not slave_issues:
                        chosen_gtid_status = gtid_status
                        chosen_uuid = str(candidate.uuid)
                        can_become_master = True
                elif not master_issues and has_valid_master and \
                    not slave_issues:
                    # First eligible candidate becomes the baseline.
                    chosen_gtid_status = gtid_status
                    chosen_uuid = str(candidate.uuid)
                    can_become_master = True
                if not can_become_master:
                    _LOGGER.warning(
                        "Candidate (%s) cannot become a master due to the "
                        "following reasons: issues to become a "
                        "master (%s), prerequistes as a slave (%s), valid "
                        "master (%s).",
                        candidate.uuid, master_issues, slave_issues,
                        has_valid_master
                    )
            except _errors.DatabaseError as error:
                # Unreachable candidates are skipped; the traceback is
                # attached via exc_info.
                _LOGGER.warning(
                    "Error accessing candidate (%s).", candidate.uuid,
                    exc_info=error
                )
    if not chosen_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, )
        )
    return chosen_uuid
def execute(self, group_id, timeout=None):
    """Check if any server within a group has failed.

    :param group_id: Group's id.
    :param timeout: Timeout value after which a server is considered
                    unreachable. If None is provided, it assumes the
                    default value in the configuration file.
    :return: CommandResult with two result sets: per-server health
             information and a list of group-level issues.
    :raises GroupError: If the group does not exist.
    """
    # (The previous docstring documented this parameter under the wrong
    # name ":param group_id:" — fixed to ":param timeout:".)
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))
    info = ResultSet(
        names=[
            'uuid', 'is_alive', 'status',
            'is_not_running', 'is_not_configured', 'io_not_running',
            'sql_not_running', 'io_error', 'sql_error'
        ],
        types=[str, bool, str] + [bool] * 4 + [str, str]
    )
    issues = ResultSet(names=['issue'], types=[str])
    # Best-effort conversion; a non-numeric timeout falls back to the
    # configured default below.
    try:
        timeout = float(timeout)
    except (TypeError, ValueError):
        pass
    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        status = server.status
        # Defaults reported when the server is not contactable.
        # (The previous version assigned an empty dict first and then
        # immediately overwrote it — the dead store is removed.)
        why_slave_issues = {
            'is_not_running': False,
            'is_not_configured': False,
            'io_not_running': False,
            'sql_not_running': False,
            'io_error': False,
            'sql_error': False,
        }
        try:
            # NOTE(review): "timeout or DEFAULT..." treats an explicit
            # timeout of 0 as "use the default" — confirm intended.
            alive = server.is_alive(timeout or DEFAULT_UNREACHABLE_TIMEOUT)
            if alive and not is_master:
                server.connect()
                slave_issues, why_slave_issues = \
                    _replication.check_slave_issues(server)
                str_master_uuid = _replication.slave_has_master(server)
                # Flag a master mismatch only when replication itself
                # looks healthy.
                if (group.master is None or str(group.master) != \
                    str_master_uuid) and not slave_issues:
                    issues.append_row([
                        "Group has master (%s) but server is connected " \
                        "to master (%s)." % \
                        (group.master, str_master_uuid)
                    ])
        except _errors.DatabaseError:
            alive = False
        info.append_row([
            server.uuid, alive, status,
            why_slave_issues['is_not_running'],
            why_slave_issues['is_not_configured'],
            why_slave_issues['io_not_running'],
            why_slave_issues['sql_not_running'],
            why_slave_issues['io_error'],
            why_slave_issues['sql_error'],
        ])
    return CommandResult(None, results=[info, issues])