def main():
    """ Parse argument as command and execute that command with parameters
    containing the state of MySQL, ContainerPilot, etc. Default behavior
    is to run `pre_start` DB initialization.
    """
    if len(sys.argv) > 1:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)
    else:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start

    mysql_svc = MySQL()
    cp = ContainerPilot()
    cp.load()

    # what storage backend did we use? an unrecognized driver name
    # means no backup storage at all
    drivers = {'manta': Manta, 's3': S3, 'scp': SCP}
    driver = os.environ.get('BACKUP_DRIVER', 'manta').lower()
    storage_cls = drivers.get(driver)
    storage = storage_cls() if storage_cls is not None else None

    node = Node(mysql=mysql_svc, consul=consul, storage=storage, cp=cp)
    cmd(node)
def on_change(node): """ The top-level ContainerPilot onChange handler """ # first check if this node has already been set primary by a completed # call to failover and update the ContainerPilot config as needed. if node.is_primary(): log.debug('[on_change] this node is primary, no failover required.') if node.cp.update(): # we're ignoring the lock here intentionally node.consul.put(PRIMARY_KEY, node.name) node.cp.reload() return # check if another node has been set primary already and is reporting # as healthy, in which case there's no failover required. Note that # we can't simply check if we're a replica via .is_replica() b/c that # trusts mysqld's view of the world. try: node.consul.get_primary(timeout=1) log.debug('[on_change] primary is already healthy, no failover required') return except (UnknownPrimary, WaitTimeoutError) as ex: log.debug('[on_change] no primary from consul: %s', ex) if node.consul.lock_failover(node.name): try: nodes = node.consul.client.health.service(REPLICA, passing=True)[1] ips = [instance['Service']['Address'] for instance in nodes] log.info('[on_change] Executing failover with candidates: %s', ips) node.mysql.failover(ips) except Exception: # On failure we bubble-up the exception and fail the onChange. 
# Either another instance that didn't overlap in time will # complete failover or we'll be left w/o a primary and require # manual intervention via `mysqlrpladmin failover` node.consul.unlock_failover() raise else: log.info('[on_change] Failover in progress on another node, ' 'waiting to complete.') node.consul.wait_for_failover_lock() # need to determine replicaton status at this point, so make # sure we refresh .state from mysqld/Consul node.cp.state = UNASSIGNED if node.is_primary(): log.info('[on_change] node %s is primary after failover', node.name) if node.cp.update(): # we're intentionally ignoring the advisory lock here ok = node.consul.put(PRIMARY_KEY, node.name) log.debug('[on_change] %s obtained lock: %s', node.name, ok) node.cp.reload() return elif node.is_replica(): log.info('[on_change] node %s is replica after failover', node.name) if node.cp.state == UNASSIGNED: log.error('[on_change] this node is neither primary or replica ' 'after failover; check replication status on cluster.') sys.exit(1)
def main():
    """ Parse argument as command and execute that command with parameters
    containing the state of MySQL, ContainerPilot, etc. Default behavior
    is to run `pre_start` DB initialization.
    """
    if len(sys.argv) > 1:
        consul = Consul()
        command = sys.argv[1]
        try:
            cmd = globals()[command]
        except KeyError:
            log.error('Invalid command: %s', command)
            sys.exit(1)
    else:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start

    # the backup store implementation is configurable; resolve the
    # configured class (Manta by default) and instantiate it
    storage_class = env('BACKUP_STORAGE_CLASS', 'manager.libmanta.Manta')
    backup_store = get_class(storage_class)()

    mysql_svc = MySQL()
    cp = ContainerPilot()
    cp.load()
    node = Node(mysql=mysql_svc, consul=consul,
                backup_store=backup_store, cp=cp)
    cmd(node)
def main():
    """ Parse argument as command and execute that command with parameters
    containing the state of MySQL, ContainerPilot, etc. Default behavior
    is to run `pre_start` DB initialization.
    """
    if len(sys.argv) > 1:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)
    else:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start

    mysql_svc = MySQL()

    # pick the snapshot backend named by the environment; any value
    # other than 'local' or 'minio' falls back to Manta
    backends = {'local': Local, 'minio': Minio}
    snapshot_backend = os.environ.get('SNAPSHOT_BACKEND', 'manta')
    snaps = backends.get(snapshot_backend, Manta)()

    cp = ContainerPilot()
    cp.load()
    node = Node(mysql=mysql_svc, consul=consul, snaps=snaps, cp=cp)
    cmd(node)
    mysql_svc.close()
def set_timezone_info(self):
    """ Write TZ data to mysqld by piping mysql_tzinfo_to_sql to the
    mysql client. This is kinda gross but piping it avoids having to
    parse the output for a bulk insert with the Connector/MySQL client.

    Errors from the pipeline are logged, not raised.
    """
    try:
        # the command is a shell pipeline, so it must run via a shell;
        # without shell=True check_output tries to exec the entire
        # string as a single program path and always raises OSError,
        # so the timezone tables were never actually loaded.
        subprocess.check_output(
            '/usr/bin/mysql_tzinfo_to_sql /usr/share/zoneinfo | '
            '/usr/bin/mysql -uroot --protocol=socket '
            '--socket=/var/run/mysqld/mysqld.sock',
            shell=True)
    except (subprocess.CalledProcessError, OSError) as ex:
        log.error('mysql_tzinfo_to_sql returned error: %s', ex)
def create_repl_user(self, conn):
    """ Create the user used for both replication and backups.

    Logs an error and returns without doing anything if the replication
    user or password is unconfigured. The password is bound as a query
    parameter (%s) rather than interpolated into the SQL string.
    """
    if not self.repl_user or not self.repl_password:
        log.error('No replication user/password configured.')
        return
    self.add('CREATE USER `{}`@`%` IDENTIFIED BY %s; '
             .format(self.repl_user), (self.repl_password,))
    # privilege list covers replication (REPLICATION SLAVE/CLIENT,
    # SUPER) plus backup/restore work; the original statement listed
    # RELOAD twice, so it is granted just once here
    self.add('GRANT SUPER, SELECT, INSERT, REPLICATION SLAVE, RELOAD'
             ', LOCK TABLES, GRANT OPTION, REPLICATION CLIENT'
             ', DROP, CREATE '
             'ON *.* TO `{}`@`%`; '
             .format(self.repl_user))
    self.add('FLUSH PRIVILEGES;')
    self.execute_many(conn=conn)
def create_repl_user(self, conn):
    """ this user will be used for both replication and backups """
    user = self.repl_user
    password = self.repl_password
    if not user or not password:
        log.error('No replication user/password configured.')
        return
    # the password rides along as a bound parameter, not in the SQL text
    self.add(
        'CREATE USER `{}`@`%` IDENTIFIED BY %s; '.format(user),
        (password, ))
    grant_stmt = ('GRANT SUPER, SELECT, INSERT, REPLICATION SLAVE, RELOAD'
                  ', LOCK TABLES, GRANT OPTION, REPLICATION CLIENT'
                  ', RELOAD, DROP, CREATE '
                  'ON *.* TO `{}`@`%`; '.format(user))
    self.add(grant_stmt)
    self.add('FLUSH PRIVILEGES;')
    self.execute_many(conn=conn)
def load(self, envs=os.environ):
    """ Fetch the ContainerPilot config file and ask ContainerPilot to
    render it out so that all environment variables have been
    interpolated.

    Sets `self.path` and `self.config`. Re-raises any failure from the
    `containerpilot -template` subprocess after logging it.
    """
    self.path = env('CONTAINERPILOT', None, envs)
    try:
        cfg = subprocess.check_output(
            ['containerpilot', '-config', self.path, '-template'],
            env=envs.copy())
    except (subprocess.CalledProcessError, OSError) as ex:
        log.error('containerpilot -template returned error: %s', ex)
        # bare `raise` preserves the original traceback; the previous
        # `raise (ex)` re-raised from here and discarded it
        raise
    config = json5.loads(cfg)
    self.config = config
def health(node): """ The top-level ContainerPilot `health` handler. Runs a simple health check. Also acts as a check for whether the ContainerPilot configuration needs to be reloaded (if it's been changed externally). """ # Because we need MySQL up to finish initialization, we need to check # for each pass thru the health check that we've done so. The happy # path is to check a lock file against the node state (which has been # set above) and immediately return when we discover the lock exists. # Otherwise, we bootstrap the instance for its *current* state. assert_initialized_for_state(node) if node.is_primary(): # If this lock is allowed to expire and the health check for the # primary fails the `onChange` handlers for the replicas will try # to failover and then the primary will obtain a new lock. # If this node can update the lock but the DB fails its health check, # then the operator will need to manually intervene if they want to # force a failover. This architecture is a result of Consul not # permitting us to acquire a new lock on a health-checked session if the # health check is *currently* failing, but has the happy side-effect of # reducing the risk of flapping on a transient health check failure. node.consul.renew_session() # Simple health check; exceptions result in a non-zero exit code node.mysql.query('select 1') # When failing over the new node needs a chance to lock the kv node.consul.mark_as_primary(node.name) elif node.is_replica(): # TODO: we should make this check actual replication health # and not simply that replication has been established if not node.mysql.query('show slave status'): log.error('Replica is not replicating.') sys.exit(1) else: # If we're still somehow marked UNASSIGNED we exit now. This is a # byzantine failure mode where the end-user needs to intervene. log.error('Cannot determine MySQL state; failing health check.') sys.exit(1) node.consul.unlock_failover()
def health(node): """ The top-level ContainerPilot `health` handler. Runs a simple health check. Also acts as a check for whether the ContainerPilot configuration needs to be reloaded (if it's been changed externally). """ # Because we need MySQL up to finish initialization, we need to check # for each pass thru the health check that we've done so. The happy # path is to check a lock file against the node state (which has been # set above) and immediately return when we discover the lock exists. # Otherwise, we bootstrap the instance for its *current* state. assert_initialized_for_state(node) if node.is_primary(): # If this lock is allowed to expire and the health check for the # primary fails the `onChange` handlers for the replicas will try # to failover and then the primary will obtain a new lock. # If this node can update the lock but the DB fails its health check, # then the operator will need to manually intervene if they want to # force a failover. This architecture is a result of Consul not # permitting us to acquire a new lock on a health-checked session if the # health check is *currently* failing, but has the happy side-effect of # reducing the risk of flapping on a transient health check failure. node.consul.renew_session() # Simple health check; exceptions result in a non-zero exit code node.mysql.query('select 1') elif node.is_replica(): # TODO: we should make this check actual replication health # and not simply that replication has been established if not node.mysql.query('show slave status'): log.error('Replica is not replicating.') sys.exit(1) else: # If we're still somehow marked UNASSIGNED we exit now. This is a # byzantine failure mode where the end-user needs to intervene. log.error('Cannot determine MySQL state; failing health check.') sys.exit(1) node.consul.unlock_failover()
def assert_initialized_for_state(node):
    """ If the node has not yet been set up, find the correct state and
    initialize for that state. After the first health check we'll have
    written a lock file and will never hit this path again.

    Returns True if initialization already happened (lock exists), False
    after a successful first-time setup; exits the process on failure.
    """
    LOCK_PATH = '/var/run/init.lock'
    try:
        # mkdir is atomic, so it doubles as a "take the init lock" op;
        # mode 0700 keeps the lock directory owner-only
        os.mkdir(LOCK_PATH, 0700)
    except OSError:
        # the lock file exists so we've already initialized
        return True

    # the check for primary will set the state if its known. If another
    # instance is the primary then we'll be marked as REPLICA, so if
    # we can't determine after the check which we are then we're likely
    # the first instance (this will get safely verified later).
    try:
        if not run_as_primary(node):
            # another primary exists; drop the lock so the next health
            # check can retry, and fail this one
            log.error(
                'Tried to mark node %s primary but primary exists, '
                'exiting for retry on next check.', node.name)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
    except MySQLError as ex:
        # We've made it only partly thru setup. Setup isn't idempotent
        # but should be safe to retry if we can make more progress. At
        # worst we end up with a bunch of failure logs.
        log.error(
            'Failed to set up %s as primary (%s). Exiting but will '
            'retry setup. Check logs following this line to see if '
            'setup needs reconfiguration or manual intervention to '
            'continue.', node.name, ex)
        os.rmdir(LOCK_PATH)
        sys.exit(1)
    return False
def main():
    """ Parse argument as command and execute that command with parameters
    containing the state of MySQL, ContainerPilot, etc. Default behavior
    is to run `pre_start` DB initialization.
    """
    if len(sys.argv) > 1:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)
    else:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start

    mysql_svc = MySQL()
    manta_store = Manta()
    cp = ContainerPilot()
    cp.load()
    node = Node(mysql=mysql_svc, consul=consul, manta=manta_store, cp=cp)
    cmd(node)
def main():
    """ Parse argument as command and execute that command with parameters
    containing the state of the service, ContainerPilot, etc. Default
    behavior is to run `pre_start` DB initialization.
    """
    if len(sys.argv) == 1:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start
    else:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)

    cp = ContainerPilot()
    cass = CassandraService()
    cp.load()
    # (removed a leftover debug `print(consul)` that spammed stdout on
    # every invocation)
    node = Node(kvstore=consul, cp=cp, service=cass)
    cmd(node)
def assert_initialized_for_state(node):
    """ If the node has not yet been set up, find the correct state and
    initialize for that state. After the first health check we'll have
    written a lock file and will never hit this path again.

    Returns True if initialization already happened (lock exists), False
    after a successful first-time replica setup; may exit the process
    (0 on primary setup + config reload, 1 on any failure).
    """
    LOCK_PATH = '/var/run/init.lock'
    try:
        # mkdir is atomic, so it doubles as a "take the init lock" op;
        # mode 0700 keeps the lock directory owner-only
        os.mkdir(LOCK_PATH, 0700)
    except OSError:
        # the lock file exists so we've already initialized
        return True

    # the check for primary will set the state if its known. If another
    # instance is the primary then we'll be marked as REPLICA, so if
    # we can't determine after the check which we are then we're likely
    # the first instance (this will get safely verified later).
    if node.is_primary() or node.cp.state == UNASSIGNED:
        try:
            if not run_as_primary(node):
                # another primary exists; drop the lock so the next
                # health check can retry, and fail this one
                log.error(
                    'Tried to mark node %s primary but primary exists, '
                    'exiting for retry on next check.', node.name)
                os.rmdir(LOCK_PATH)
                sys.exit(1)
        except MySQLError as ex:
            # We've made it only partly thru setup. Setup isn't idempotent
            # but should be safe to retry if we can make more progress. At
            # worst we end up with a bunch of failure logs.
            log.error(
                'Failed to set up %s as primary (%s). Exiting but will '
                'retry setup. Check logs following this line to see if '
                'setup needs reconfiguration or manual intervention to '
                'continue.', node.name, ex)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
        if node.cp.update():
            os.rmdir(LOCK_PATH)
            node.cp.reload()
            # this is racy with the SIGHUP that ContainerPilot just got
            # sent, but if the Consul agent shuts down quickly enough we
            # end up sending extra API calls to it and get a bunch of log
            # spam. This forces us to exit early.
            sys.exit(0)
    else:
        try:
            run_as_replica(node)
        except (UnknownPrimary, MySQLError) as ex:
            log.error(
                'Failed to set up %s for replication (%s). Exiting for retry '
                'on next check.', node.name, ex)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
    return False
def assert_initialized_for_state(node):
    """ If the node has not yet been set up, find the correct state and
    initialize for that state. After the first health check we'll have
    written a lock file and will never hit this path again.

    Returns True if initialization already happened (lock exists), False
    after a successful first-time replica setup; may exit the process
    (0 on primary setup + config reload, 1 on any failure).
    """
    LOCK_PATH = '/var/run/init.lock'
    try:
        # atomic mkdir acts as the init lock; 0700 restricts it to owner
        os.mkdir(LOCK_PATH, 0700)
    except OSError:
        # the lock file exists so we've already initialized
        return True

    # the check for primary will set the state if its known. If another
    # instance is the primary then we'll be marked as REPLICA, so if
    # we can't determine after the check which we are then we're likely
    # the first instance (this will get safely verified later).
    if node.is_primary() or node.cp.state == UNASSIGNED:
        try:
            if not run_as_primary(node):
                # lost the race: a primary already exists, so release
                # the lock and fail this check for a later retry
                log.error('Tried to mark node %s primary but primary exists, '
                          'exiting for retry on next check.', node.name)
                os.rmdir(LOCK_PATH)
                sys.exit(1)
        except MySQLError as ex:
            # We've made it only partly thru setup. Setup isn't idempotent
            # but should be safe to retry if we can make more progress. At
            # worst we end up with a bunch of failure logs.
            log.error('Failed to set up %s as primary (%s). Exiting but will '
                      'retry setup. Check logs following this line to see if '
                      'setup needs reconfiguration or manual intervention to '
                      'continue.', node.name, ex)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
        if node.cp.update():
            os.rmdir(LOCK_PATH)
            node.cp.reload()
            # this is racy with the SIGHUP that ContainerPilot just got
            # sent, but if the Consul agent shuts down quickly enough we
            # end up sending extra API calls to it and get a bunch of log
            # spam. This forces us to exit early.
            sys.exit(0)
    else:
        try:
            run_as_replica(node)
        except (UnknownPrimary, MySQLError) as ex:
            log.error('Failed to set up %s for replication (%s). Exiting for retry '
                      'on next check.', node.name, ex)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
    return False