def _execute(self, conn=None, discard_results=False): """ Execute and commit all composed statements and flushes the buffer """ try: if not conn: conn = self.conn except (WaitTimeoutError, MySQLError): raise # unrecoverable try: cur = conn.cursor(dictionary=True, buffered=True) for stmt, params in self._query_buffer.items(): log.debug('%s %s', stmt, params) cur.execute(stmt, params=params) if not discard_results: return cur.fetchall() # we discard results from writes conn.commit() try: cur.fetchall() except MySQLError: # Will get "InternalError: No result set to fetch from." # for SET statements. We can safely let this slide if the # `execute` call passes pass finally: # exceptions are an unrecoverable situation self._query_buffer.clear() cur.close()
def is_primary(self):
    """ Determine whether this node is the cluster primary.

    Consults, in order: the in-memory state cache, mysqld's replication
    status, Consul's healthy-primary record, and finally the Consul
    PRIMARY_KEY / FAILOVER_KEY lock holders. The answer is cached in
    `self.cp.state`; reset the state to UNASSIGNED to force a re-check
    of Consul, etc.
    """
    log.debug('state: %s' % self.cp.state)

    # the cached answer wins
    if self.cp.state != UNASSIGNED:
        return self.cp.state == PRIMARY

    # 1) does mysqld itself know who it replicates from?
    try:
        _, primary_ip = self.mysql.get_primary()
        if primary_ip:
            is_me = primary_ip == self.ip
            self.cp.state = PRIMARY if is_me else REPLICA
            return is_me
    except (MySQLError, WaitTimeoutError, UnknownPrimary) as err:
        log.debug('could not determine primary via mysqld status: %s', err)

    # 2) is a healthy primary registered with Consul?
    try:
        _, primary_ip = self.consul.get_primary()
        if primary_ip:
            is_me = primary_ip == self.ip
            self.cp.state = PRIMARY if is_me else REPLICA
            return is_me
    except (UnknownPrimary, ValueError) as err:
        log.debug('could not determine primary via Consul: %s', err)

    # 3) do I hold the Consul primary lock?
    _, primary_name = self.consul.read_lock(PRIMARY_KEY)
    log.debug('primary_name: %s' % primary_name)
    if primary_name == self.name:
        self.cp.state = PRIMARY
        return True

    # 4) a failover is in flight, I'm the only one left, and I hold the lock
    _, failover_name = self.consul.read_lock(FAILOVER_KEY)
    log.debug('failover_name: %s' % failover_name)
    if failover_name == self.name:
        self.cp.state = PRIMARY
        return True

    self.cp.state = UNASSIGNED
    return False
def on_change(node): """ The top-level ContainerPilot onChange handler """ # first check if this node has already been set primary by a completed # call to failover and update the ContainerPilot config as needed. if node.is_primary(): log.debug('[on_change] this node is primary, no failover required.') if node.cp.update(): # we're ignoring the lock here intentionally node.consul.put(PRIMARY_KEY, node.name) node.cp.reload() return # check if another node has been set primary already and is reporting # as healthy, in which case there's no failover required. Note that # we can't simply check if we're a replica via .is_replica() b/c that # trusts mysqld's view of the world. try: node.consul.get_primary(timeout=1) log.debug('[on_change] primary is already healthy, no failover required') return except (UnknownPrimary, WaitTimeoutError) as ex: log.debug('[on_change] no primary from consul: %s', ex) if node.consul.lock_failover(node.name): try: nodes = node.consul.client.health.service(REPLICA, passing=True)[1] ips = [instance['Service']['Address'] for instance in nodes] log.info('[on_change] Executing failover with candidates: %s', ips) node.mysql.failover(ips) except Exception: # On failure we bubble-up the exception and fail the onChange. 
# Either another instance that didn't overlap in time will # complete failover or we'll be left w/o a primary and require # manual intervention via `mysqlrpladmin failover` node.consul.unlock_failover() raise else: log.info('[on_change] Failover in progress on another node, ' 'waiting to complete.') node.consul.wait_for_failover_lock() # need to determine replicaton status at this point, so make # sure we refresh .state from mysqld/Consul node.cp.state = UNASSIGNED if node.is_primary(): log.info('[on_change] node %s is primary after failover', node.name) if node.cp.update(): # we're intentionally ignoring the advisory lock here ok = node.consul.put(PRIMARY_KEY, node.name) log.debug('[on_change] %s obtained lock: %s', node.name, ok) node.cp.reload() return elif node.is_replica(): log.info('[on_change] node %s is replica after failover', node.name) if node.cp.state == UNASSIGNED: log.error('[on_change] this node is neither primary or replica ' 'after failover; check replication status on cluster.') sys.exit(1)
def is_primary(self):
    """ Check if this node is the primary by checking in-memory cache,
    then MySQL replication status, then Consul's healthy-primary record,
    then the Consul PRIMARY_KEY lock holder. Caches its result in
    `self.cp.state`, so the node `state` field needs to be set to
    UNASSIGNED if you want to force a check of Consul, etc.
    """
    # the cached answer wins; UNASSIGNED means "unknown, re-derive"
    if self.cp.state != UNASSIGNED:
        return self.cp.state == PRIMARY
    try:
        # am I already replicating from somewhere else?
        _, primary_ip = self.mysql.get_primary()
        if not primary_ip:
            # mysqld has no opinion; fall through to the Consul checks
            pass
        elif primary_ip == self.ip:
            self.cp.state = PRIMARY
            return True
        else:
            self.cp.state = REPLICA
            return False
    except (MySQLError, WaitTimeoutError, UnknownPrimary) as ex:
        log.debug('could not determine primary via mysqld status: %s', ex)
    try:
        # am I already reporting I'm a healthy primary to Consul?
        _, primary_ip = self.consul.get_primary()
        if not primary_ip:
            pass
        elif primary_ip == self.ip:
            self.cp.state = PRIMARY
            return True
        else:
            self.cp.state = REPLICA
            return False
    except (UnknownPrimary, ValueError) as ex:
        log.debug('could not determine primary via Consul: %s', ex)
    # am I listed in the Consul PRIMARY_KEY??
    _, primary_name = self.consul.read_lock(PRIMARY_KEY)
    if primary_name == self.name:
        self.cp.state = PRIMARY
        return True
    # nothing claims this node is primary
    self.cp.state = UNASSIGNED
    return False
def unlock_failover(self):
    """ Release a previously-acquired failover session lock.

    Reads the session id written to disk when the failover lock was
    taken; once a primary reports as healthy, unlocks the Consul session
    and deletes the session file. A missing session file means there is
    nothing to release; an unhealthy primary defers cleanup to a later
    pass.
    """
    # no session file on disk means we never took the lock — move on
    try:
        with open(FAILOVER_SESSION_FILE, 'r') as f:
            session_id = f.read()
    except (IOError, OSError):
        return

    try:
        if self.get_primary():
            self.unlock(FAILOVER_KEY, session_id)
            os.remove(FAILOVER_SESSION_FILE)
    except (IOError, OSError):
        # best-effort cleanup; filesystem errors are ignored here too
        pass
    except (UnknownPrimary, WaitTimeoutError):
        # the primary isn't ready yet so we'll try
        # to unlock again on the next pass
        log.debug('failover session lock (%s) not removed because '
                  'primary has not reported as healthy', session_id)
def get_primary(self, timeout=10):
    """ Returns the (name, IP) tuple for the instance that Consul
    thinks is the healthy primary.

    Polls the Consul health endpoint roughly once per second until
    `timeout` attempts are exhausted.

    Raises:
        UnknownPrimary: more than one healthy primary is registered,
            or no healthy primary entry could be found/parsed.
        WaitTimeoutError: Consul could not be queried before timing out.
    """
    while timeout > 0:
        try:
            nodes = self.client.health.service(PRIMARY_KEY, passing=True)[1]
            log.debug(nodes)
            instances = [service['Service'] for service in nodes]
            if len(instances) > 1:
                # interpolate eagerly: exception constructors don't apply
                # logging-style lazy %-args, so build the full message here
                raise UnknownPrimary('Multiple primaries detected! %s'
                                     % instances)
            return instances[0]['ID'], instances[0]['Address']
        except pyconsul.ConsulException as ex:
            # transient Consul error; retry until the timeout lapses
            log.debug(ex)
            timeout = timeout - 1
            time.sleep(1)
        except (IndexError, KeyError):
            # empty result or unexpected payload shape
            raise UnknownPrimary('No primary found')
    raise WaitTimeoutError('Could not find primary before timeout.')
def is_primary(self):
    """ Check if this node is the primary by checking in-memory cache,
    then MySQL replication status, then Consul's healthy-primary record.
    Caches its result in `self.cp.state`, so the node `state` field
    needs to be set to UNASSIGNED if you want to force a check of
    Consul, etc.
    """
    # the cached answer wins; UNASSIGNED means "unknown, re-derive"
    if self.cp.state != UNASSIGNED:
        return self.cp.state == PRIMARY
    try:
        # am I already replicating from somewhere else?
        _, primary_ip = self.mysql.get_primary()
        if not primary_ip:
            # mysqld has no opinion; fall through to the Consul check
            pass
        elif primary_ip == self.ip:
            self.cp.state = PRIMARY
            return True
        else:
            self.cp.state = REPLICA
            return False
    except (MySQLError, WaitTimeoutError, UnknownPrimary) as ex:
        log.debug('could not determine primary via mysqld status: %s', ex)
    try:
        # am I already reporting I'm a healthy primary to Consul?
        _, primary_ip = self.consul.get_primary()
        if not primary_ip:
            pass
        elif primary_ip == self.ip:
            self.cp.state = PRIMARY
            return True
        else:
            self.cp.state = REPLICA
            return False
    except (UnknownPrimary, ValueError) as ex:
        log.debug('could not determine primary via Consul: %s', ex)
    # neither source claims this node is primary
    self.cp.state = UNASSIGNED
    return False
def run_as_primary(node):
    """ Configure and run this node as the cluster primary.

    The overall workflow here is ported and reworked from the
    Oracle-provided Docker image:
    https://github.com/mysql/mysql-docker/blob/mysql-server/5.7/docker-entrypoint.sh

    Returns False if we could not mark ourselves primary in Consul,
    True once setup and the bootstrap snapshot are complete.
    """
    if not node.consul.mark_as_primary(node.name):
        return False
    node.cp.state = PRIMARY

    # A password-less connection only succeeds on the very first boot;
    # once the database is set up, wait_for_connection times out and we
    # treat this as the promotion of an already-configured instance.
    try:
        conn = node.mysql.wait_for_connection()
    except WaitTimeoutError:
        conn = None
        log.debug(
            "[run_as_primary] no insecure connection found, database already setup"
        )

    my = node.mysql
    if not conn:
        # in case this is a newly-promoted primary
        my.execute('STOP SLAVE')
    else:
        # first pass: perform one-time setup over the insecure connection
        # (*Note: this conn is not the same as `node.conn`!*)
        my.set_timezone_info()
        my.setup_root_user(conn)
        my.create_db(conn)
        my.create_default_user(conn)
        my.create_repl_user(conn)
        my.expire_root_password(conn)

    # although backups will be run from any instance, we need to first
    # snapshot the primary so that we can bootstrap replicas.
    write_snapshot(node)
    return True