Пример #1
0
    def _execute(self, conn=None, discard_results=False):
        """
        Run the statements held in `self._query_buffer` and flush the buffer.

        `conn` defaults to `self.conn` when falsy. With `discard_results`
        false, the rows of the first statement iterated are returned right
        away: nothing is committed and any remaining statements are dropped
        when the buffer is flushed. With `discard_results` true, every
        statement is executed and committed in turn and None is returned.

        The buffer is always cleared, even on error, because exceptions are
        treated as unrecoverable. Errors raised while obtaining the
        connection or executing a statement propagate to the caller.
        """
        if not conn:
            # any error raised while obtaining the default connection is
            # unrecoverable, so it is simply left to propagate
            conn = self.conn

        # Bound before the `try` so the `finally` clause cannot trip over an
        # unbound name (and mask the real error) when cursor() itself fails.
        cur = None
        try:
            cur = conn.cursor(dictionary=True, buffered=True)
            for stmt, params in self._query_buffer.items():
                log.debug('%s %s', stmt, params)
                cur.execute(stmt, params=params)
                if not discard_results:
                    return cur.fetchall()

                # we discard results from writes
                conn.commit()
                try:
                    cur.fetchall()
                except MySQLError:
                    # Will get "InternalError: No result set to fetch from."
                    # for SET statements. We can safely let this slide if the
                    # `execute` call passes
                    pass
        finally:
            # exceptions are an unrecoverable situation
            self._query_buffer.clear()
            if cur is not None:
                cur.close()
Пример #2
0
    def _execute(self, conn=None, discard_results=False):
        """
        Run the statements held in `self._query_buffer` and flush the buffer.

        `conn` defaults to `self.conn` when falsy. With `discard_results`
        false, the rows of the first statement iterated are returned right
        away: nothing is committed and any remaining statements are dropped
        when the buffer is flushed. With `discard_results` true, every
        statement is executed and committed in turn and None is returned.

        The buffer is always cleared, even on error, because exceptions are
        treated as unrecoverable. Errors raised while obtaining the
        connection or executing a statement propagate to the caller.
        """
        if not conn:
            # any error raised while obtaining the default connection is
            # unrecoverable, so it is simply left to propagate
            conn = self.conn

        # Bound before the `try` so the `finally` clause cannot trip over an
        # unbound name (and mask the real error) when cursor() itself fails.
        cur = None
        try:
            cur = conn.cursor(dictionary=True, buffered=True)
            for stmt, params in self._query_buffer.items():
                log.debug('%s %s', stmt, params)
                cur.execute(stmt, params=params)
                if not discard_results:
                    return cur.fetchall()

                # we discard results from writes
                conn.commit()
                try:
                    cur.fetchall()
                except MySQLError:
                    # Will get "InternalError: No result set to fetch from."
                    # for SET statements. We can safely let this slide if the
                    # `execute` call passes
                    pass
        finally:
            # exceptions are an unrecoverable situation
            self._query_buffer.clear()
            if cur is not None:
                cur.close()
Пример #3
0
    def is_primary(self):
        """
        Report whether this node is the primary.

        A cached `self.cp.state` other than UNASSIGNED is trusted as-is.
        Otherwise the checks run in this order: the primary mysqld reports,
        the primary Consul reports, the holder of the Consul PRIMARY_KEY
        lock and finally the holder of the FAILOVER_KEY lock. The outcome
        is stored in `self.cp.state`, so reset that field to UNASSIGNED to
        force a fresh check.
        """
        def _settle(reported_ip):
            # record PRIMARY/REPLICA depending on whether the IP is ours
            is_me = reported_ip == self.ip
            self.cp.state = PRIMARY if is_me else REPLICA
            return bool(is_me)

        log.debug('state: %s' % self.cp.state)
        already_known = self.cp.state != UNASSIGNED
        if already_known:
            return self.cp.state == PRIMARY

        # 1. does mysqld already know who it replicates from?
        try:
            _name, mysql_ip = self.mysql.get_primary()
            if mysql_ip:
                return _settle(mysql_ip)
        except (MySQLError, WaitTimeoutError, UnknownPrimary) as ex:
            log.debug('could not determine primary via mysqld status: %s', ex)

        # 2. is a healthy primary registered in Consul?
        try:
            _name, consul_ip = self.consul.get_primary()
            if consul_ip:
                return _settle(consul_ip)
        except (UnknownPrimary, ValueError) as ex:
            log.debug('could not determine primary via Consul: %s', ex)

        # 3. is this node named under the Consul PRIMARY_KEY?
        _session, primary_name = self.consul.read_lock(PRIMARY_KEY)
        log.debug('primary_name: %s' % primary_name)
        if primary_name == self.name:
            self.cp.state = PRIMARY
            return True

        # 4. a failover is underway, this node is the only one left and it
        #    holds the failover lock
        _session, failover_name = self.consul.read_lock(FAILOVER_KEY)
        log.debug('failover_name: %s' % failover_name)
        if failover_name == self.name:
            self.cp.state = PRIMARY
            return True

        self.cp.state = UNASSIGNED
        return False
Пример #4
0
def on_change(node):
    """
    Top-level ContainerPilot onChange handler.

    Returns early when this node is already primary or Consul reports a
    healthy primary. Otherwise it either runs the failover itself (when it
    wins the failover lock) or waits for the node that did, then re-derives
    its own role. Exits the process with status 1 if the node ends up
    neither primary nor replica.
    """

    # A previously completed failover may already have made this node the
    # primary; in that case only the ContainerPilot config needs refreshing.
    if node.is_primary():
        log.debug('[on_change] this node is primary, no failover required.')
        config_changed = node.cp.update()
        if config_changed:
            # the lock is deliberately ignored here
            node.consul.put(PRIMARY_KEY, node.name)
            node.cp.reload()
        return

    # Another node may already be registered as a healthy primary, which
    # means no failover is needed. .is_replica() is not used for this check
    # because it trusts mysqld's view of the world.
    try:
        node.consul.get_primary(timeout=1)
    except (UnknownPrimary, WaitTimeoutError) as ex:
        log.debug('[on_change] no primary from consul: %s', ex)
    else:
        log.debug('[on_change] primary is already healthy, no failover required')
        return

    if not node.consul.lock_failover(node.name):
        log.info('[on_change] Failover in progress on another node, '
                 'waiting to complete.')
        node.consul.wait_for_failover_lock()
    else:
        try:
            healthy = node.consul.client.health.service(REPLICA, passing=True)[1]
            ips = [entry['Service']['Address'] for entry in healthy]
            log.info('[on_change] Executing failover with candidates: %s', ips)
            node.mysql.failover(ips)
        except Exception:
            # The error is re-raised so the onChange fails. Either another
            # instance that did not overlap in time completes the failover,
            # or the cluster is left without a primary and needs manual
            # intervention via `mysqlrpladmin failover`.
            node.consul.unlock_failover()
            raise

    # The replication status has to be determined afresh at this point, so
    # drop the cached state and re-read it from mysqld/Consul.
    node.cp.state = UNASSIGNED
    if node.is_primary():
        log.info('[on_change] node %s is primary after failover', node.name)
        config_changed = node.cp.update()
        if config_changed:
            # the advisory lock is deliberately ignored here
            ok = node.consul.put(PRIMARY_KEY, node.name)
            log.debug('[on_change] %s obtained lock: %s', node.name, ok)
            node.cp.reload()
        return

    if node.is_replica():
        log.info('[on_change] node %s is replica after failover', node.name)

    if node.cp.state == UNASSIGNED:
        log.error('[on_change] this node is neither primary or replica '
                  'after failover; check replication status on cluster.')
        sys.exit(1)
Пример #5
0
def on_change(node):
    """
    Top-level ContainerPilot onChange handler.

    Returns early when this node is already primary or Consul reports a
    healthy primary. Otherwise it either runs the failover itself (when it
    wins the failover lock) or waits for the node that did, then re-derives
    its own role. Exits the process with status 1 if the node ends up
    neither primary nor replica.
    """

    # A previously completed failover may already have made this node the
    # primary; in that case only the ContainerPilot config needs refreshing.
    if node.is_primary():
        log.debug('[on_change] this node is primary, no failover required.')
        config_changed = node.cp.update()
        if config_changed:
            # the lock is deliberately ignored here
            node.consul.put(PRIMARY_KEY, node.name)
            node.cp.reload()
        return

    # Another node may already be registered as a healthy primary, which
    # means no failover is needed. .is_replica() is not used for this check
    # because it trusts mysqld's view of the world.
    try:
        node.consul.get_primary(timeout=1)
    except (UnknownPrimary, WaitTimeoutError) as ex:
        log.debug('[on_change] no primary from consul: %s', ex)
    else:
        log.debug('[on_change] primary is already healthy, no failover required')
        return

    if not node.consul.lock_failover(node.name):
        log.info('[on_change] Failover in progress on another node, '
                 'waiting to complete.')
        node.consul.wait_for_failover_lock()
    else:
        try:
            healthy = node.consul.client.health.service(REPLICA, passing=True)[1]
            ips = [entry['Service']['Address'] for entry in healthy]
            log.info('[on_change] Executing failover with candidates: %s', ips)
            node.mysql.failover(ips)
        except Exception:
            # The error is re-raised so the onChange fails. Either another
            # instance that did not overlap in time completes the failover,
            # or the cluster is left without a primary and needs manual
            # intervention via `mysqlrpladmin failover`.
            node.consul.unlock_failover()
            raise

    # The replication status has to be determined afresh at this point, so
    # drop the cached state and re-read it from mysqld/Consul.
    node.cp.state = UNASSIGNED
    if node.is_primary():
        log.info('[on_change] node %s is primary after failover', node.name)
        config_changed = node.cp.update()
        if config_changed:
            # the advisory lock is deliberately ignored here
            ok = node.consul.put(PRIMARY_KEY, node.name)
            log.debug('[on_change] %s obtained lock: %s', node.name, ok)
            node.cp.reload()
        return

    if node.is_replica():
        log.info('[on_change] node %s is replica after failover', node.name)

    if node.cp.state == UNASSIGNED:
        log.error('[on_change] this node is neither primary or replica '
                  'after failover; check replication status on cluster.')
        sys.exit(1)
Пример #6
0
    def is_primary(self):
        """
        Report whether this node is the primary.

        A cached `self.cp.state` other than UNASSIGNED is trusted as-is.
        Otherwise the checks run in this order: the primary mysqld reports,
        the primary Consul reports and finally the holder of the Consul
        PRIMARY_KEY lock. The outcome is stored in `self.cp.state`, so
        reset that field to UNASSIGNED to force a fresh check.
        """
        def _settle(reported_ip):
            # record PRIMARY/REPLICA depending on whether the IP is ours
            is_me = reported_ip == self.ip
            self.cp.state = PRIMARY if is_me else REPLICA
            return bool(is_me)

        already_known = self.cp.state != UNASSIGNED
        if already_known:
            return self.cp.state == PRIMARY

        # 1. does mysqld already know who it replicates from?
        try:
            _name, mysql_ip = self.mysql.get_primary()
            if mysql_ip:
                return _settle(mysql_ip)
        except (MySQLError, WaitTimeoutError, UnknownPrimary) as ex:
            log.debug('could not determine primary via mysqld status: %s', ex)

        # 2. is a healthy primary registered in Consul?
        try:
            _name, consul_ip = self.consul.get_primary()
            if consul_ip:
                return _settle(consul_ip)
        except (UnknownPrimary, ValueError) as ex:
            log.debug('could not determine primary via Consul: %s', ex)

        # 3. is this node named under the Consul PRIMARY_KEY?
        _session, primary_name = self.consul.read_lock(PRIMARY_KEY)
        if primary_name == self.name:
            self.cp.state = PRIMARY
            return True

        self.cp.state = UNASSIGNED
        return False
Пример #7
0
 def unlock_failover(self):
     """
     Release the failover lock taken in an earlier pass.

     Reads the session id from FAILOVER_SESSION_FILE and, once
     `get_primary()` succeeds, unlocks FAILOVER_KEY for that session and
     deletes the session file. IOError/OSError (for example, no session
     file) is ignored; if the primary cannot be found yet the lock is
     kept and only a debug message is logged.
     """
     try:
         with open(FAILOVER_SESSION_FILE, 'r') as session_file:
             session_id = session_file.read()
             primary = self.get_primary()
             if primary:
                 self.unlock(FAILOVER_KEY, session_id)
                 os.remove(FAILOVER_SESSION_FILE)
     except (IOError, OSError):
         # no session file to work with, so there is nothing to unlock
         return
     except (UnknownPrimary, WaitTimeoutError):
         # the primary is not ready yet; unlocking is retried on the
         # next pass
         log.debug('failover session lock (%s) not removed because '
                   'primary has not reported as healthy', session_id)
Пример #8
0
 def get_primary(self, timeout=10):
     """
     Return the (ID, Address) pair of the single passing instance that
     Consul lists for the PRIMARY_KEY service.

     A Consul error is retried after a one-second sleep, for at most
     `timeout` attempts.

     Raises UnknownPrimary when no passing instance is listed or when
     more than one is, and WaitTimeoutError when every attempt failed
     with a Consul error (or `timeout` was not positive to begin with).
     """
     while timeout > 0:
         try:
             nodes = self.client.health.service(PRIMARY_KEY, passing=True)[1]
             log.debug(nodes)
             instances = [service['Service'] for service in nodes]
             if len(instances) > 1:
                 # Interpolate here: exception constructors do not apply
                 # %-formatting to their arguments the way logging does.
                 raise UnknownPrimary(
                     'Multiple primaries detected! %s' % (instances,))
             return instances[0]['ID'], instances[0]['Address']
         except pyconsul.ConsulException as ex:
             log.debug(ex)
             timeout = timeout - 1
             time.sleep(1)
         except (IndexError, KeyError):
             # an empty result list or a service entry without the
             # expected keys
             raise UnknownPrimary('No primary found')
     raise WaitTimeoutError('Could not find primary before timeout.')
Пример #9
0
 def unlock_failover(self):
     """
     Release the failover lock taken in an earlier pass.

     Reads the session id from FAILOVER_SESSION_FILE and, once
     `get_primary()` succeeds, unlocks FAILOVER_KEY for that session and
     deletes the session file. IOError/OSError (for example, no session
     file) is ignored; if the primary cannot be found yet the lock is
     kept and only a debug message is logged.
     """
     try:
         with open(FAILOVER_SESSION_FILE, 'r') as session_file:
             session_id = session_file.read()
             primary = self.get_primary()
             if primary:
                 self.unlock(FAILOVER_KEY, session_id)
                 os.remove(FAILOVER_SESSION_FILE)
     except (IOError, OSError):
         # no session file to work with, so there is nothing to unlock
         return
     except (UnknownPrimary, WaitTimeoutError):
         # the primary is not ready yet; unlocking is retried on the
         # next pass
         log.debug('failover session lock (%s) not removed because '
                   'primary has not reported as healthy', session_id)
Пример #10
0
 def get_primary(self, timeout=10):
     """
     Return the (ID, Address) pair of the single passing instance that
     Consul lists for the PRIMARY_KEY service.

     A Consul error is retried after a one-second sleep, for at most
     `timeout` attempts.

     Raises UnknownPrimary when no passing instance is listed or when
     more than one is, and WaitTimeoutError when every attempt failed
     with a Consul error (or `timeout` was not positive to begin with).
     """
     while timeout > 0:
         try:
             nodes = self.client.health.service(PRIMARY_KEY, passing=True)[1]
             log.debug(nodes)
             instances = [service['Service'] for service in nodes]
             if len(instances) > 1:
                 # Interpolate here: exception constructors do not apply
                 # %-formatting to their arguments the way logging does.
                 raise UnknownPrimary(
                     'Multiple primaries detected! %s' % (instances,))
             return instances[0]['ID'], instances[0]['Address']
         except pyconsul.ConsulException as ex:
             log.debug(ex)
             timeout = timeout - 1
             time.sleep(1)
         except (IndexError, KeyError):
             # an empty result list or a service entry without the
             # expected keys
             raise UnknownPrimary('No primary found')
     raise WaitTimeoutError('Could not find primary before timeout.')
Пример #11
0
    def is_primary(self):
        """
        Report whether this node is the primary.

        A cached `self.cp.state` other than UNASSIGNED is trusted as-is.
        Otherwise the primary reported by mysqld is checked first and the
        primary reported by Consul second. The outcome is stored in
        `self.cp.state`, so reset that field to UNASSIGNED to force a
        fresh check.
        """
        def _settle(reported_ip):
            # record PRIMARY/REPLICA depending on whether the IP is ours
            is_me = reported_ip == self.ip
            self.cp.state = PRIMARY if is_me else REPLICA
            return bool(is_me)

        already_known = self.cp.state != UNASSIGNED
        if already_known:
            return self.cp.state == PRIMARY

        # 1. does mysqld already know who it replicates from?
        try:
            _name, mysql_ip = self.mysql.get_primary()
            if mysql_ip:
                return _settle(mysql_ip)
        except (MySQLError, WaitTimeoutError, UnknownPrimary) as ex:
            log.debug('could not determine primary via mysqld status: %s', ex)

        # 2. is a healthy primary registered in Consul?
        try:
            _name, consul_ip = self.consul.get_primary()
            if consul_ip:
                return _settle(consul_ip)
        except (UnknownPrimary, ValueError) as ex:
            log.debug('could not determine primary via Consul: %s', ex)

        self.cp.state = UNASSIGNED
        return False
Пример #12
0
def run_as_primary(node):
    """
    Bring this node up as the primary.

    Returns False without doing anything else when the node cannot be
    marked as primary in Consul; otherwise sets the node state to
    PRIMARY, performs first-run setup (or stops replication), writes a
    snapshot and returns True.

    The overall workflow is ported and reworked from the Oracle-provided
    Docker image:
    https://github.com/mysql/mysql-docker/blob/mysql-server/5.7/docker-entrypoint.sh
    """
    marked = node.consul.mark_as_primary(node.name)
    if not marked:
        return False
    node.cp.state = PRIMARY

    try:
        conn = node.mysql.wait_for_connection()
    except WaitTimeoutError:
        # Access Denied is expected when loading from a backup with no
        # other DBs to sync with
        conn = None
        log.debug(
            "[run_as_primary] no insecure connection found, database already setup"
        )

    mysql = node.mysql
    if not conn:
        # this may be a newly-promoted primary
        mysql.execute('STOP SLAVE')
    else:
        # Being able to connect without a password means this is the
        # first pass. *Note: the conn is not the same as `node.conn`!*
        mysql.set_timezone_info()
        mysql.setup_root_user(conn)
        mysql.create_db(conn)
        mysql.create_default_user(conn)
        mysql.create_repl_user(conn)
        mysql.expire_root_password(conn)

    # Backups can run from any instance, but the primary has to be
    # snapshotted first so that replicas can be bootstrapped from it.
    write_snapshot(node)
    return True