Exemplo n.º 1
0
def _check_candidate_switch(group_id, slave_id):
    """Check if the candidate has all the features to become the new
    master.
    """
    allowed_status = (_server.MySQLServer.SECONDARY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)

    if not group.master:
        raise _errors.GroupError(
            "Group (%s) does not contain a valid "
            "master. Please, run a promote or failover." % (group_id, )
        )

    slave = _retrieve_server(slave_id, group_id)
    slave.connect()

    if group.master == slave.uuid:
        raise _errors.ServerError(
            "Candidate slave (%s) is already master." % (slave_id, )
            )

    master_issues = _replication.check_master_issues(slave)
    if master_issues:
        raise _errors.ServerError(
            "Server (%s) is not a valid candidate slave "
            "due to the following reason(s): (%s)." %
            (slave.uuid, master_issues)
            )

    slave_issues = _replication.check_slave_issues(slave)
    if slave_issues:
        raise _errors.ServerError(
            "Server (%s) is not a valid candidate slave "
            "due to the following reason: (%s)." %
            (slave.uuid, slave_issues)
            )

    master_uuid = _replication.slave_has_master(slave)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        raise _errors.GroupError(
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid)
            )

    if slave.status not in allowed_status:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(
        BLOCK_WRITE_SWITCH, group_id, master_uuid, str(slave.uuid)
        )
Exemplo n.º 2
0
def _check_candidate_switch(group_id, slave_id):
    """Check if the candidate has all the features to become the new
    master.
    """
    allowed_status = (_server.MySQLServer.SECONDARY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)

    if not group.master:
        raise _errors.GroupError("Group (%s) does not contain a valid "
                                 "master. Please, run a promote or failover." %
                                 (group_id, ))

    slave = _retrieve_server(slave_id, group_id)
    slave.connect()

    if group.master == slave.uuid:
        raise _errors.ServerError("Candidate slave (%s) is already master." %
                                  (slave_id, ))

    master_issues, why_master_issues = _replication.check_master_issues(slave)
    if master_issues:
        raise _errors.ServerError("Server (%s) is not a valid candidate slave "
                                  "due to the following reason(s): (%s)." %
                                  (slave.uuid, why_master_issues))

    slave_issues, why_slave_issues = _replication.check_slave_issues(slave)
    if slave_issues:
        raise _errors.ServerError("Server (%s) is not a valid candidate slave "
                                  "due to the following reason: (%s)." %
                                  (slave.uuid, why_slave_issues))

    master_uuid = _replication.slave_has_master(slave)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        raise _errors.GroupError(
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid))

    if slave.status not in allowed_status:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(BLOCK_WRITE_SWITCH, group_id, master_uuid,
                                     str(slave.uuid))
Exemplo n.º 3
0
def _do_wait_slaves_catch(group_id, master, skip_servers=None):
    """Synchronize slaves with master.
    """
    skip_servers = skip_servers or []
    skip_servers.append(str(master.uuid))

    group = _server.Group.fetch(group_id)
    for server in group.servers():
        if str(server.uuid) not in skip_servers:
            try:
                server.connect()
                used_master_uuid = _replication.slave_has_master(server)
                if str(master.uuid) == used_master_uuid:
                    _utils.synchronize(server, master)
                else:
                    _LOGGER.debug(
                        "Slave (%s) has a different master "
                        "from group (%s).", server.uuid, group_id)
            except _errors.DatabaseError as error:
                _LOGGER.debug("Error synchronizing slave (%s): %s.",
                              server.uuid, error)
Exemplo n.º 4
0
def _do_wait_slaves_catch(group_id, master, skip_servers=None):
    """Synchronize slaves with master.
    """
    skip_servers = skip_servers or []
    skip_servers.append(str(master.uuid))

    group = _server.Group.fetch(group_id)
    for server in group.servers():
        if str(server.uuid) not in skip_servers:
            try:
                server.connect()
                used_master_uuid = _replication.slave_has_master(server)
                if  str(master.uuid) == used_master_uuid:
                    _utils.synchronize(server, master)
                else:
                    _LOGGER.debug("Slave (%s) has a different master "
                        "from group (%s).", server.uuid, group_id)
            except _errors.DatabaseError as error:
                _LOGGER.debug(
                    "Error synchronizing slave (%s).", server.uuid,
                    exc_info=error
                )
Exemplo n.º 5
0
def _health(group_id):
    """Check which servers in a group are up and down.
    """
    availability = {}

    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        thread_issues = {}
        status = server.status
        try:
            server.connect()
            alive = True
            if not is_master:
                slave_issues = _replication.check_slave_issues(server)
                str_master_uuid = _replication.slave_has_master(server)
                if (group.master is None or str(group.master) != \
                    str_master_uuid) and not slave_issues:
                    thread_issues = \
                        "Group has master (%s) but server is connected " \
                        "to master (%s)." % \
                        (group.master, str_master_uuid)
                elif slave_issues:
                    thread_issues = slave_issues
        except _errors.DatabaseError:
            status = _server.MySQLServer.FAULTY
        availability[str(server.uuid)] = {
            "is_alive" : alive,
            "status" : status,
            "threads" : thread_issues
            }

    return availability
Exemplo n.º 6
0
    def execute(self, group_id):
        """Check if any server within a group has failed.

        :param group_id: Group's id.
        """

        group = _server.Group.fetch(group_id)
        if not group:
            raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

        info = ResultSet(
            names=[
                'uuid', 'is_alive', 'status',
                'is_not_running', 'is_not_configured', 'io_not_running',
                'sql_not_running', 'io_error', 'sql_error', 'gtid_executed'
            ],
            types=[str, bool, str] + [bool] * 4 + [str, str, str]
        )
        issues = ResultSet(names=['issue'], types=[str])

        for server in group.servers():
            alive = False
            is_master = (group.master == server.uuid)
            status = server.status
            why_slave_issues = {}
            # These are used when server is not contactable.
            why_slave_issues = {
                'is_not_running': False,
                'is_not_configured': False,
                'io_not_running': False,
                'sql_not_running': False,
                'io_error': False,
                'sql_error': False,
            }
            try:
                # TODO: CHECK WHETHER WE SHOULD USE IS_ALIVE OR NOT.
                if server.is_alive:
                  server.connect()
                  alive = True
                  if not is_master:
                      slave_issues, why_slave_issues = \
                          _replication.check_slave_issues(server)
                      str_master_uuid = _replication.slave_has_master(server)
                      if (group.master is None or str(group.master) != \
                          str_master_uuid) and not slave_issues:
                          issues.append_row([
                              "Group has master (%s) but server is connected " \
                              "to master (%s)." % \
                              (group.master, str_master_uuid)
                          ])
                  gtid_executed= server.get_gtid_status()[0].GTID_EXECUTED
                else:
                  status = _server.MySQLServer.FAULTY
                  gtid_executed= "UNKNOWN"
                  
            except _errors.DatabaseError:
                status = _server.MySQLServer.FAULTY
                gtid_executed= "UNKNOWN"

            info.append_row([
                server.uuid,
                alive,
                status,
                why_slave_issues['is_not_running'],
                why_slave_issues['is_not_configured'],
                why_slave_issues['io_not_running'],
                why_slave_issues['sql_not_running'],
                why_slave_issues['io_error'],
                why_slave_issues['sql_error'],
                ' '.join(gtid_executed.splitlines()),
            ])

        return CommandResult(None, results=[info, issues])
Exemplo n.º 7
0
def _do_find_candidate(group_id, event):
    """Find the best candidate in a group that may be used to replace the
    current master if there is any.

    It chooses the slave that has processed more transactions and may become a
    master, e.g. has the binary log enabled. This function does not consider
    purged transactions and delays in the slave while picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :return: Return the uuid of the best candidate to become a master in the
             group.
    """
    forbidden_status = (_server.MySQLServer.FAULTY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)

    master_uuid = None
    if group.master:
        master_uuid = str(group.master)

    chosen_uuid = None
    chosen_gtid_status = None
    for candidate in group.servers():
        if master_uuid != str(candidate.uuid) and \
            candidate.status not in forbidden_status:
            try:
                candidate.connect()
                gtid_status = candidate.get_gtid_status()
                master_issues, why_master_issues = \
                    _replication.check_master_issues(candidate)
                slave_issues = False
                why_slave_issues = {}
                if event == FIND_CANDIDATE_SWITCH:
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(candidate)
                has_valid_master = (master_uuid is None or \
                    _replication.slave_has_master(candidate) == master_uuid)
                can_become_master = False
                if chosen_gtid_status:
                    n_trans = 0
                    try:
                        n_trans = _replication.get_slave_num_gtid_behind(
                            candidate, chosen_gtid_status)
                    except _errors.InvalidGtidError:
                        pass
                    if n_trans == 0 and not master_issues and \
                        has_valid_master and not slave_issues:
                        chosen_gtid_status = gtid_status
                        chosen_uuid = str(candidate.uuid)
                        can_become_master = True
                elif not master_issues and has_valid_master and \
                    not slave_issues:
                    chosen_gtid_status = gtid_status
                    chosen_uuid = str(candidate.uuid)
                    can_become_master = True
                if not can_become_master:
                    _LOGGER.warning(
                        "Candidate (%s) cannot become a master due to the "
                        "following reasons: issues to become a "
                        "master (%s), prerequistes as a slave (%s), valid "
                        "master (%s).", candidate.uuid, why_master_issues,
                        why_slave_issues, has_valid_master)
            except _errors.DatabaseError as error:
                _LOGGER.warning("Error accessing candidate (%s): %s.",
                                candidate.uuid, error)

    if not chosen_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, ))
    return chosen_uuid
Exemplo n.º 8
0
    def test_switchover_with_no_master(self):
        """Ensure that a switchover/failover happens when masters in the
        shard and global groups are dead.
        """
        # Check that a shard group has it master pointing to a the master
        # in the global group.
        global_group = Group.fetch("GROUPID1")
        shard_group = Group.fetch("GROUPID2")
        other_shard_group = Group.fetch("GROUPID3")
        global_master = fetch_test_server(global_group.master)
        global_master.connect()
        shard_master = fetch_test_server(shard_group.master)
        shard_master.connect()
        other_shard_master = fetch_test_server(other_shard_group.master)
        other_shard_master.connect()
        self.assertEqual(_replication.slave_has_master(shard_master),
                         str(global_group.master))
        self.assertEqual(_replication.slave_has_master(other_shard_master),
                         str(global_group.master))

        # Demote the master in the global group and check that a
        # shard group points to None.
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        self.proxy.group.demote("GROUPID1")
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, None)
        self.assertEqual(_replication.slave_has_master(shard_master), None)
        self.assertEqual(_replication.slave_has_master(other_shard_master),
                         None)

        # Demote the master in a shard group and promote the master
        # in the global group.
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, None)
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, shard_master.uuid)
        self.proxy.group.demote("GROUPID2")
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, None)
        self.proxy.group.promote("GROUPID1", str(global_master.uuid))
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        self.assertEqual(_replication.slave_has_master(shard_master), None)
        self.assertEqual(_replication.slave_has_master(other_shard_master),
                         str(global_group.master))

        # Promote the master in the previous shard group and check that
        # everything is back to normal.
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        self.assertEqual(_replication.slave_has_master(shard_master), None)
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, None)
        self.proxy.group.promote("GROUPID2", str(shard_master.uuid))
        self.assertEqual(_replication.slave_has_master(shard_master),
                         str(global_group.master))
        self.assertEqual(_replication.slave_has_master(other_shard_master),
                         str(global_group.master))
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, shard_master.uuid)

        # Demote the master in the global group, check that a shard group
        # points to None, promot it again and check that everything is back
        # to normal
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, shard_master.uuid)
        self.proxy.group.demote("GROUPID1")
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, None)
        self.assertEqual(_replication.slave_has_master(shard_master), None)
        self.proxy.group.promote("GROUPID1", str(global_master.uuid))
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        self.assertEqual(_replication.slave_has_master(shard_master),
                         str(global_group.master))
        self.assertEqual(_replication.slave_has_master(other_shard_master),
                         str(global_group.master))
    def test_switchover_with_no_master(self):
        """Ensure that a switchover/failover happens when masters in the
        shard and global groups are dead.
        """
        # Check that a shard group has it master pointing to a the master
        # in the global group.
        global_group = Group.fetch("GROUPID1")
        shard_group = Group.fetch("GROUPID2")
        other_shard_group = Group.fetch("GROUPID3")
        global_master = MySQLServer.fetch(global_group.master)
        global_master.connect()
        shard_master = MySQLServer.fetch(shard_group.master)
        shard_master.connect()
        other_shard_master = MySQLServer.fetch(other_shard_group.master)
        other_shard_master.connect()
        self.assertEqual(
            _replication.slave_has_master(shard_master),
            str(global_group.master)
        )
        self.assertEqual(
            _replication.slave_has_master(other_shard_master),
            str(global_group.master)
        )

        # Demote the master in the global group and check that a
        # shard group points to None.
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        self.proxy.group.demote("GROUPID1")
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, None)
        self.assertEqual(_replication.slave_has_master(shard_master), None)
        self.assertEqual(
            _replication.slave_has_master(other_shard_master), None
        )

        # Demote the master in a shard group and promote the master
        # in the global group.
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, None)
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, shard_master.uuid)
        self.proxy.group.demote("GROUPID2")
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, None)
        self.proxy.group.promote("GROUPID1", str(global_master.uuid))
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        self.assertEqual(_replication.slave_has_master(shard_master), None)
        self.assertEqual(
            _replication.slave_has_master(other_shard_master),
            str(global_group.master)
        )

        # Promote the master in the previous shard group and check that
        # everything is back to normal.
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        self.assertEqual(_replication.slave_has_master(shard_master), None)
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, None)
        self.proxy.group.promote("GROUPID2", str(shard_master.uuid))
        self.assertEqual(
            _replication.slave_has_master(shard_master),
            str(global_group.master)
        )
        self.assertEqual(
            _replication.slave_has_master(other_shard_master),
            str(global_group.master)
        )
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, shard_master.uuid)

        # Demote the master in the global group, check that a shard group
        # points to None, promot it again and check that everything is back
        # to normal
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        shard_group = Group.fetch("GROUPID2")
        self.assertEqual(shard_group.master, shard_master.uuid)
        self.proxy.group.demote("GROUPID1")
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, None)
        self.assertEqual(_replication.slave_has_master(shard_master), None)
        self.proxy.group.promote("GROUPID1", str(global_master.uuid))
        global_group = Group.fetch("GROUPID1")
        self.assertEqual(global_group.master, global_master.uuid)
        self.assertEqual(
            _replication.slave_has_master(shard_master),
            str(global_group.master)
        )
        self.assertEqual(
            _replication.slave_has_master(other_shard_master),
            str(global_group.master)
        )
Exemplo n.º 10
0
def _do_find_candidate(group_id, event):
    """Find the best candidate in a group that may be used to replace the
    current master if there is any.

    It chooses the slave that has processed more transactions and may become a
    master, e.g. has the binary log enabled. This function does not consider
    purged transactions and delays in the slave while picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :return: Return the uuid of the best candidate to become a master in the
             group.
    """
    forbidden_status = (_server.MySQLServer.FAULTY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)

    master_uuid = None
    if group.master:
        master_uuid = str(group.master)

    chosen_uuid = None
    chosen_gtid_status = None
    for candidate in group.servers():
        if master_uuid != str(candidate.uuid) and \
            candidate.status not in forbidden_status:
            try:
                candidate.connect()
                gtid_status = candidate.get_gtid_status()
                master_issues = \
                    _replication.check_master_issues(candidate)
                if event == FIND_CANDIDATE_SWITCH:
                    slave_issues = \
                        _replication.check_slave_issues(candidate)
                else:
                    slave_issues = {}
                has_valid_master = (master_uuid is None or \
                    _replication.slave_has_master(candidate) == master_uuid)
                can_become_master = False
                if chosen_gtid_status:
                    n_trans = 0
                    try:
                        n_trans = _replication.get_slave_num_gtid_behind(
                            candidate, chosen_gtid_status
                            )
                    except _errors.InvalidGtidError:
                        pass
                    if n_trans == 0 and not master_issues and \
                        has_valid_master and not slave_issues:
                        chosen_gtid_status = gtid_status
                        chosen_uuid = str(candidate.uuid)
                        can_become_master = True
                elif not master_issues and has_valid_master and \
                    not slave_issues:
                    chosen_gtid_status = gtid_status
                    chosen_uuid = str(candidate.uuid)
                    can_become_master = True
                if not can_become_master:
                    _LOGGER.warning(
                        "Candidate (%s) cannot become a master due to the "
                        "following reasons: issues to become a "
                        "master (%s), prerequistes as a slave (%s), valid "
                        "master (%s).", candidate.uuid, master_issues,
                        slave_issues, has_valid_master
                        )
            except _errors.DatabaseError as error:
                _LOGGER.warning(
                    "Error accessing candidate (%s).", candidate.uuid,
                    exc_info=error
                )

    if not chosen_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, )
            )
    return chosen_uuid
Exemplo n.º 11
0
    def execute(self, group_id, timeout=None):
        """Check if any server within a group has failed.

        :param group_id: Group's id.
        :param group_id: Timeout value after which a server is considered
                         unreachable. If None is provided, it assumes the
                         default value in the configuration file.
        """

        group = _server.Group.fetch(group_id)
        if not group:
            raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

        info = ResultSet(
            names=[
                'uuid', 'is_alive', 'status',
                'is_not_running', 'is_not_configured', 'io_not_running',
                'sql_not_running', 'io_error', 'sql_error'
            ],
            types=[str, bool, str] + [bool] * 4 + [str, str]
        )
        issues = ResultSet(names=['issue'], types=[str])

        try:
            timeout = float(timeout)
        except (TypeError, ValueError):
            pass

        for server in group.servers():
            alive = False
            is_master = (group.master == server.uuid)
            status = server.status

            why_slave_issues = {}
            # These are used when server is not contactable.
            why_slave_issues = {
                'is_not_running': False,
                'is_not_configured': False,
                'io_not_running': False,
                'sql_not_running': False,
                'io_error': False,
                'sql_error': False,
            }

            try:
                alive = server.is_alive(timeout or DEFAULT_UNREACHABLE_TIMEOUT)
                if alive and not is_master:
                    server.connect()
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(server)
                    str_master_uuid = _replication.slave_has_master(server)
                    if (group.master is None or str(group.master) != \
                        str_master_uuid) and not slave_issues:
                        issues.append_row([
                            "Group has master (%s) but server is connected " \
                            "to master (%s)." % \
                            (group.master, str_master_uuid)
                        ])
            except _errors.DatabaseError:
                alive = False

            info.append_row([
                server.uuid,
                alive,
                status,
                why_slave_issues['is_not_running'],
                why_slave_issues['is_not_configured'],
                why_slave_issues['io_not_running'],
                why_slave_issues['sql_not_running'],
                why_slave_issues['io_error'],
                why_slave_issues['sql_error'],
            ])

        return CommandResult(None, results=[info, issues])