def confirm_max_replica_lag(replicas, lag_tolerance, dead_master,
                            replicas_synced=False, timeout=0):
    """ Test replication lag

    Args:
    replicas - A set of hostaddr objects to be tested for replication lag
    lag_tolerance - Max computed replication lag in seconds. If 0 is
                    supplied, then exec position is compared from replica
                    servers to the master rather than using a computed
                    seconds behind, as the heartbeat will be blocked by
                    read_only.
    replicas_synced - Replica servers must have executed to the same
                      position in the binary log.
    timeout - How long to wait for replication to be in the desired state
    """
    start = time.time()
    if dead_master:
        replication_checks = set([mysql_lib.CHECK_SQL_THREAD,
                                  mysql_lib.CHECK_CORRECT_MASTER])
    else:
        replication_checks = mysql_lib.ALL_REPLICATION_CHECKS

    while True:
        acceptable = True
        for replica in replicas:
            # Confirm threads are running, expected master
            try:
                mysql_lib.assert_replication_sanity(replica,
                                                    replication_checks)
            except Exception as e:
                log.warning(e)
                log.info('Trying to restart replication, then '
                         'sleep 20 seconds')
                mysql_lib.restart_replication(replica)
                time.sleep(20)
                mysql_lib.assert_replication_sanity(replica,
                                                    replication_checks)

            try:
                mysql_lib.assert_replication_unlagged(replica, lag_tolerance,
                                                      dead_master)
            except Exception as e:
                log.warning(e)
                acceptable = False

        if replicas_synced and not confirm_replicas_in_sync(replicas):
            acceptable = False
            log.warning('Replica servers are not in sync and replicas_synced '
                        'is set')

        if acceptable:
            return
        elif (time.time() - start) > timeout:
            raise Exception('Replication is not in an acceptable state on '
                            'replica {r}'.format(r=replica))
        else:
            log.info('Sleeping for 5 seconds to allow replication to '
                     'catch up')
            time.sleep(5)
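# A minimal usage sketch, not from the original source: wait for a set of
# replicas to be healthy and caught up before acting on them. Hostnames are
# hypothetical; the tolerance constant mirrors the one passed to
# mysql_lib.assert_replication_unlagged elsewhere in this codebase (the
# docstring above also allows a plain number of seconds).
def example_wait_for_replicas():
    replicas = set([host_utils.HostAddr('replicadb-1:3306'),
                    host_utils.HostAddr('replicadb-2:3306')])
    # Block for up to five minutes until every replica passes the sanity
    # checks, is unlagged, and has executed to the same binlog position.
    confirm_max_replica_lag(replicas,
                            mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                            dead_master=False,
                            replicas_synced=True,
                            timeout=300)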
def is_master_alive(master, replicas):
    """ Determine if the master is alive

    The function will:
    1. Attempt to connect to the master via the mysql protocol. If
       successful the master is considered alive.
    2. If #1 fails, check the io thread of the replica instance(s). If the
       io thread is not running, the master will be considered dead.

    If step #1 fails and step #2 succeeds, we are in a weird state and will
    throw an exception.

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instances

    Returns:
    A mysql connection to the master if the master is alive, False otherwise.
    """
    if len(replicas) == 0:
        raise Exception('At least one replica must be present to determine '
                        'if a master is dead')
    try:
        master_conn = mysql_lib.connect_mysql(master)
        return master_conn
    except MySQLdb.OperationalError as detail:
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        master_conn = False
        log.info('Unable to connect to current master {master} from '
                 '{hostname}, will check replica servers before declaring '
                 'the master dead'.format(master=master,
                                          hostname=host_utils.HOSTNAME))
    except:
        log.info('This is an unknown connection error. If you are very sure '
                 'that the master is dead, please put a "return False" at '
                 'the top of is_master_alive and then send rwultsch a stack '
                 'trace')
        raise

    # We can not get a connection to the master, so poll the replica servers
    for replica in replicas:
        # If replication has not hit a timeout, a dead master can still have
        # a replica which thinks it is ok. "STOP SLAVE; START SLAVE" followed
        # by a sleep will get us truthiness.
        mysql_lib.restart_replication(replica)
        try:
            mysql_lib.assert_replication_sanity(replica)
        except:
            # The exception is expected in this case: with a dead master,
            # the replica's io thread should not be running.
            pass
        else:
            raise Exception('Replica {replica} thinks it can connect to '
                            'master {master}, but failover script can not. '
                            'Possible network partition!'
                            ''.format(replica=replica,
                                      master=master))
        log.info('Replica {replica} also can not connect to master '
                 '{master}.'.format(replica=replica,
                                    master=master))
    return False
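# An illustrative sketch (assumed flow, hypothetical helper): how a failover
# script might combine is_master_alive with confirm_max_replica_lag. Whether
# the master is dead changes which replication checks make sense, so the
# result feeds the dead_master flag above.
def example_failover_preflight(master, replicas):
    master_conn = is_master_alive(master, replicas)
    dead_master = not master_conn
    confirm_max_replica_lag(replicas,
                            mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                            dead_master=dead_master,
                            timeout=300)
    return master_conn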
def mysql_backup(instance,
                 backup_type=backup.BACKUP_TYPE_XBSTREAM,
                 initial_build=False):
    """ Run a file based backup on a supplied local instance

    Args:
    instance - A hostaddr object
    backup_type - backup.BACKUP_TYPE_LOGICAL or backup.BACKUP_TYPE_XBSTREAM
    initial_build - Boolean, if this is being created right after the server
                    was built
    """
    log.info('Confirming sanity of replication (if applicable)')
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(instance)
    except:
        # instance is not in production
        replica_type = None
    if replica_type and replica_type != host_utils.REPLICA_ROLE_MASTER:
        mysql_lib.assert_replication_sanity(instance)

    log.info('Logging initial status to mysqlops')
    start_timestamp = time.localtime()
    lock_handle = None
    backup_id = mysql_lib.start_backup_log(instance, backup_type,
                                           start_timestamp)

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        # Actually run the backup
        log.info('Running backup')
        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            backup_file = backup.xtrabackup_instance(instance,
                                                     start_timestamp,
                                                     initial_build)
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            backup_file = backup.logical_backup_instance(instance,
                                                         start_timestamp,
                                                         initial_build)
        else:
            raise Exception('Unsupported backup type {backup_type}'
                            ''.format(backup_type=backup_type))
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)

    # Update database with additional info now that backup is done.
    if backup_id:
        log.info("Updating database log entry with final backup info")
        mysql_lib.finalize_backup_log(backup_id, backup_file)
    else:
        log.info("The backup is complete, but we were not able to "
                 "write to the central log DB.")
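# Usage sketch (hypothetical hostname): run a logical backup of a single
# instance. xbstream is the default backup_type, so it is overridden here.
def example_logical_backup():
    instance = host_utils.HostAddr('backupdb-1:3306')
    mysql_backup(instance, backup_type=backup.BACKUP_TYPE_LOGICAL)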
def main():
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on "
                   "instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i', '--instance',
                        help='Instance to act on if other than '
                             'localhost:3306',
                        default=''.join((socket.getfqdn(), ':3306')))
    parser.add_argument('-a', '--all',
                        help='Checksum all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d', '--dbs',
                        help=("Comma separated list of dbs to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q', '--quiet',
                        help="Do not print output to stdout",
                        action='store_true',
                        default=False)
    parser.add_argument('-m', '--min_diffs',
                        help=("Do per-row check if chunk diff count is at "
                              "least this value"),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M', '--max_diffs',
                        help=("Do not do per-row check if chunk diff count "
                              "is greater than this value"),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C', '--no_create_table',
                        help=("If test.checksum_detail is missing, do "
                              "not try to create it."),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v', '--verbose',
                        help="Store raw output from PT tools in the DB?",
                        action='store_true',
                        default=False)
    parser.add_argument('-c', '--check_fraction',
                        help='Check this fraction of databases.',
                        default=DB_CHECK_FRACTION)
    args = parser.parse_args()

    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

    if instance not in zk.get_all_mysql_instances_by_type(
            host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    if not mysql_lib.does_table_exist(instance, mysql_lib.METADATA_DB,
                                      CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found. Unable to continue. "
                            "Consider not using the -C option or creating "
                            "it yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in (host_utils.REPLICA_ROLE_SLAVE,
                  host_utils.REPLICA_ROLE_DR_SLAVE):
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if len(slaves) == 0:
        log.info("This server has no slaves. Nothing to do.")
        sys.exit(0)

    # Before we even start this, make sure replication is OK.
    for slave in slaves:
        mysql_lib.assert_replication_sanity(slave)

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(instance)
        if args.all:
            db_to_check = dbs
        else:
            # Default behaviour: check a given DB every N days based on
            # day of year. This minimizes month-boundary issues.
            db_to_check = set()
            check_modulus = int(time.strftime("%j")) % \
                int(args.check_fraction)
            counter = 0
            for db in dbs:
                modulus = counter % int(args.check_fraction)
                if modulus == check_modulus:
                    db_to_check.add(db)
                counter = counter + 1

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    #
    for db in db_to_check:
        tables_to_check = mysql_lib.get_tables(instance, db, skip_views=True)
        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:\n{cmd}"
                         "".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # Parse each line of STDOUT (there should only be one with
            # actual data). We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if results:
                    chunk_errors = int(results[1])
                    row_count = int(results[3])
                    chunk_count = int(results[4])
                    chunk_skips = int(results[5])

                    for slave in slaves:
                        rows_checked = 'NO'
                        sync_cmd = ""
                        sync_out = ""
                        sync_err = ""
                        sync_ret = -1
                        row_diffs = 0

                        (elapsed_time_ms,
                         chunk_diffs) = check_one_replica(slave, db, tbl)

                        # If we skipped some chunks or there were errors,
                        # we can't have complete information about the state
                        # of the replica. In the case of a hard error, we'll
                        # just stop. In the case of a skipped chunk, we will
                        # treat it as a diff chunk for purposes of deciding
                        # whether or not to do a more detailed analysis.
                        #
                        checkable_chunks = chunk_skips + chunk_diffs
                        if chunk_errors > 0:
                            checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                        elif checkable_chunks == 0:
                            checksum_status = 'GOOD'
                        else:
                            if checkable_chunks > int(args.max_diffs):
                                # Too many chunk diffs; don't bother
                                # checking further. Not good.
                                checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                            elif checkable_chunks < int(args.min_diffs):
                                # Some diffs, but not enough that we care.
                                checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                            else:
                                start_time = int(time.time() * 1000)
                                rows_checked = 'YES'

                                # Set the proper status - did we do a
                                # sync-based check because of explicit
                                # diffs or because of skipped chunks?
                                if chunk_diffs > 0:
                                    checksum_status = 'ROW_DIFFS_FOUND'
                                else:
                                    checksum_status = 'CHUNKS_WERE_SKIPPED'

                                (sync_cmd, sync_out, sync_err, sync_ret,
                                 row_diffs) = checksum_tbl_via_sync(slave,
                                                                    db,
                                                                    tbl)

                                # Add in the time it took to do the sync.
                                elapsed_time_ms += (int(time.time() * 1000) -
                                                    start_time)

                                if not args.quiet:
                                    log.info("Sync command executed was:"
                                             "\n{cmd}".format(cmd=sync_cmd))
                                    log.info("Standard out:\n{out}"
                                             "".format(out=sync_out))
                                    log.info("Standard error:\n{err}"
                                             "".format(err=sync_err))
                                    log.info("Return code: {ret}"
                                             "".format(ret=sync_ret))
                                    log.info("Row diffs found: {cnt}"
                                             "".format(cnt=row_diffs))

                        # Checksum process is complete, store the results.
                        #
                        data = {'instance': slave,
                                'master_instance': instance,
                                'db': db,
                                'tbl': tbl,
                                'elapsed_time_ms': elapsed_time_ms,
                                'chunk_count': chunk_count,
                                'chunk_errors': chunk_errors,
                                'chunk_diffs': chunk_diffs,
                                'chunk_skips': chunk_skips,
                                'row_count': row_count,
                                'row_diffs': row_diffs,
                                'rows_checked': rows_checked,
                                'checksum_status': checksum_status,
                                'checksum_cmd': None,
                                'checksum_stdout': None,
                                'checksum_stderr': None,
                                'checksum_rc': c_ret,
                                'sync_cmd': None,
                                'sync_stdout': None,
                                'sync_stderr': None,
                                'sync_rc': sync_ret}

                        if args.verbose:
                            data.update({'checksum_cmd': c_cmd,
                                         'checksum_stdout': c_out,
                                         'checksum_stderr': c_err,
                                         'sync_cmd': sync_cmd,
                                         'sync_stdout': sync_out,
                                         'sync_stderr': sync_err,
                                         'sync_rc': sync_ret})

                        write_checksum_status(instance, data)
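# An aside on the magic indices above (based on pt-table-checksum's
# documented output format, not on anything in this module): the tool prints
# one row per table with columns
#
#   TS ERRORS DIFFS ROWS CHUNKS SKIPPED TIME TABLE
#
# so parse_checksum_row presumably yields those fields in order, making
# results[1] the error count, results[3] the row count, results[4] the
# chunk count, and results[5] the skipped-chunk count.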
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'instance_type', and
                'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and reason is not supplied, an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run. No ifs, ands,
        # or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            age_of_replacement = (datetime.datetime.now() -
                                  existing_replacement['created_at'])
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info("A replacement already exists, but was launched "
                         "{days} days ago. The timeout for server builds is "
                         "{timeout} days so we are automatically setting "
                         "replace_again.".format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
        version_server = original_server
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')
        version_server = zk.get_mysql_instance_from_replica_set(
            original_server.get_zk_replica_set()[0],
            repl_type=host_utils.REPLICA_ROLE_MASTER)

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))
    replacement_config = {
        'availability_zone': cmdb_data['location'],
        'vpc_security_group': cmdb_data['security_groups'],
        'hostname': find_unused_server_name(
            original_server.get_standardized_replica_set(),
            reporting_conn, dry_run),
        'instance_type': cmdb_data['config.instance_type'],
        'mysql_major_version': mysql_lib.get_global_variables(
            version_server)['version'][0:3],
        'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
        'dry_run': dry_run,
        'skip_name_check': True}

    # At this point, all our defaults should be good to go
    config_overridden = False

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    if not dead_server:
        try:
            mysql_lib.assert_replication_sanity(original_server)
        except Exception as e:
            log.info('Replication problem: {e}'.format(e=e))
            reasons.add('replication broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception('MySQL appears to be up and no reason for '
                        'replacement is supplied. You can specify a reason '
                        'with the --reason argument')
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
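# A hedged usage sketch (hypothetical hostname and instance type): replace a
# host while overriding its instance type. Any override key not present in
# replacement_config raises 'Invalid override'; dry_run=True means nothing
# is actually launched.
def example_replace_with_bigger_instance():
    server = host_utils.HostAddr('faileddb-1:3306')
    launch_replacement_db_host(server,
                               dry_run=True,
                               overrides={'instance_type': 'c3.2xlarge'},
                               reason='load testing larger hardware')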
    # In theory, we could allow multiple instances of this script to run
    # on one server, as long as they are checksumming different replica
    # sets.
    #
    try:
        lock = host_utils.bind_lock_socket(
            'CHECKSUM_{}'.format(replica_set))
    except socket.error as e:
        (code, msg) = e.args
        log.error("Unable to bind socket for checksum on {rs} "
                  "(msg: {m}, code: {c})".format(rs=replica_set,
                                                 m=msg,
                                                 c=code))
        sys.exit(code)

    log.info("Locked replica set {} for checksum on this "
             "server".format(replica_set))
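# bind_lock_socket is defined elsewhere in this codebase. As a sketch of the
# likely technique (an assumption, not the actual implementation): binding a
# Linux abstract-namespace unix socket gives a host-wide mutex that the
# kernel drops automatically when the process exits.
#
#   def bind_lock_socket_sketch(lock_name):
#       sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
#       # The leading NUL byte selects the abstract namespace, so there is
#       # no filesystem entry to clean up; a second bind of the same name
#       # raises socket.error (EADDRINUSE).
#       sock.bind('\0' + lock_name)
#       return sock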
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and reason is not supplied, an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run. No ifs, ands,
        # or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            age_of_replacement = (datetime.datetime.now() -
                                  existing_replacement['created_at'])
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info("A replacement already exists, but was launched "
                         "{days} days ago. The timeout for server builds is "
                         "{timeout} days so we are automatically setting "
                         "replace_again.".format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))
    replacement_config = {
        'availability_zone': cmdb_data['location'],
        'hostname': find_unused_server_name(
            original_server.get_standardized_replica_set(),
            reporting_conn, dry_run),
        'instance_type': cmdb_data['config.instance_type'],
        'mysql_major_version':
            get_master_mysql_major_version(original_server),
        'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
        'dry_run': dry_run,
        'skip_name_check': True}

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = \
            cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = \
            cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False
    if (replacement_config['classic_security_group'] and
            overrides.get('vpc_security_group')):
        # A VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        try:
            mysql_lib.assert_replication_sanity(original_server)
        except Exception as e:
            log.info('Replication problem: {e}'.format(e=e))
            reasons.add('replication broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception('MySQL appears to be up and no reason for '
                        'replacement is supplied. You can specify a reason '
                        'with the --reason argument')
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
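# A hedged example (hypothetical hostname and security group id): a
# Classic-to-VPC migration request. Supplying vpc_security_group for a host
# whose replacement_config carries a classic_security_group triggers the
# vpc_migration() branch above; dry_run=True means nothing is launched.
def example_vpc_migration_replacement():
    server = host_utils.HostAddr('olddb-1:3306')
    launch_replacement_db_host(
        server,
        dry_run=True,
        overrides={'vpc_security_group': 'sg-12345678'},
        reason='vpc migration')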
def add_replica_to_zk(instance, replica_type, dry_run):
    """ Add a replica to zk

    Args:
    instance - A hostaddr object of the replica to add to zk
    replica_type - Either 'slave' or 'dr_slave'.
    dry_run - If set, do not modify zk
    """
    try:
        if replica_type not in [host_utils.REPLICA_ROLE_DR_SLAVE,
                                host_utils.REPLICA_ROLE_SLAVE]:
            raise Exception('Invalid value "{replica_type}" for argument '
                            'replica_type'.format(replica_type=replica_type))

        zk_local = host_utils.MysqlZookeeper()
        kazoo_client = environment_specific.get_kazoo_client()
        if not kazoo_client:
            raise Exception('Could not get a zk connection')

        log.info('Instance is {inst}'.format(inst=instance))
        mysql_lib.assert_replication_sanity(instance)
        mysql_lib.assert_replication_unlagged(
            instance, mysql_lib.REPLICATION_TOLERANCE_NORMAL)
        master = mysql_lib.get_master_from_instance(instance)
        if master not in zk_local.get_all_mysql_instances_by_type(
                host_utils.REPLICA_ROLE_MASTER):
            raise Exception('Instance {master} is not a master in zk'
                            ''.format(master=master))

        log.info('Detected master of {instance} '
                 'as {master}'.format(instance=instance,
                                      master=master))

        (replica_set, _) = zk_local.get_replica_set_from_instance(master)
        log.info('Detected replica_set as '
                 '{replica_set}'.format(replica_set=replica_set))

        if replica_type == host_utils.REPLICA_ROLE_SLAVE:
            (zk_node,
             parsed_data,
             version) = get_zk_node_for_replica_set(kazoo_client,
                                                    replica_set)
            log.info('Replica set {replica_set} is held in zk_node '
                     '{zk_node}'.format(zk_node=zk_node,
                                        replica_set=replica_set))
            log.info('Existing config:')
            log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
            new_data = copy.deepcopy(parsed_data)
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['host'] = \
                instance.hostname
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['port'] = \
                instance.port
            log.info('New config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
        elif replica_type == host_utils.REPLICA_ROLE_DR_SLAVE:
            znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
            parsed_data = simplejson.loads(znode_data)
            new_data = copy.deepcopy(parsed_data)
            if replica_set in parsed_data:
                log.info('Existing dr config:')
                log.info(pprint.pformat(
                    remove_auth(parsed_data[replica_set])))
            else:
                log.info('Replica set did not previously have a dr slave')

            new_data[replica_set] = \
                {host_utils.REPLICA_ROLE_DR_SLAVE: {'host': instance.hostname,
                                                    'port': instance.port}}
            log.info('New dr config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new dr configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(environment_specific.DR_ZK,
                                 simplejson.dumps(new_data),
                                 dr_meta.version)
        else:
            # We should raise an exception above rather than getting to here
            pass
    except Exception as e:
        log.exception(e)
        raise
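# Usage sketch (hypothetical hostname): register a freshly built replica in
# zk as a regular slave, using dry_run to preview the config change without
# writing anything.
def example_register_replica():
    replica = host_utils.HostAddr('newdb-1:3306')
    add_replica_to_zk(replica, host_utils.REPLICA_ROLE_SLAVE, dry_run=True)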
def mysql_backup(instance,
                 backup_type=backup.BACKUP_TYPE_XBSTREAM,
                 initial_build=False,
                 lock_handle=None):
    """ Run a file based backup on a supplied local instance

    Args:
    instance - A hostaddr object
    backup_type - backup.BACKUP_TYPE_LOGICAL or backup.BACKUP_TYPE_XBSTREAM
    initial_build - Boolean, if this is being created right after the server
                    was built
    lock_handle - A lock handle, if we have one from the caller.
    """
    if backup_type == backup.BACKUP_TYPE_XBSTREAM and \
            os.path.isfile(backup.XTRABACKUP_SKIP_FILE):
        log.info('Found {}. Skipping xtrabackup '
                 'run.'.format(backup.XTRABACKUP_SKIP_FILE))
        return

    log.info('Confirming sanity of replication (if applicable)')
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(instance)
    except:
        # instance is not in production
        replica_type = None
    if replica_type and replica_type != host_utils.REPLICA_ROLE_MASTER:
        mysql_lib.assert_replication_sanity(instance)

    log.info('Logging initial status to mysqlops')
    start_timestamp = time.localtime()
    backup_id = mysql_lib.start_backup_log(instance, backup_type,
                                           start_timestamp)

    # Take a lock to prevent multiple backups from running concurrently,
    # unless we already have a lock from the caller. This means we also
    # don't have to release the lock at the end; either we exit the script
    # entirely and it gets cleaned up, or the caller maintains it.
    if lock_handle is None:
        log.info('Taking backup lock')
        lock_handle = host_utils.bind_lock_socket(
            backup.STD_BACKUP_LOCK_SOCKET)
    else:
        log.info('Not acquiring backup lock, we already have one.')

    # Actually run the backup
    log.info('Running backup')
    if backup_type == backup.BACKUP_TYPE_XBSTREAM:
        backup_file = backup.xtrabackup_instance(instance, start_timestamp,
                                                 initial_build)
    elif backup_type == backup.BACKUP_TYPE_LOGICAL:
        # We don't need a backup-skip file here since this isn't
        # regularly scheduled.
        backup_file = backup.logical_backup_instance(instance,
                                                     start_timestamp,
                                                     initial_build)
    else:
        raise Exception('Unsupported backup type {}'.format(backup_type))

    # Update database with additional info now that backup is done.
    if backup_id:
        log.info("Updating database log entry with final backup info")
        mysql_lib.finalize_backup_log(backup_id, backup_file)
    else:
        log.info("The backup is complete, but we were not able to "
                 "write to the central log DB.")
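# Usage sketch (assumed flow, hypothetical hostname): a caller that already
# holds the standard backup lock socket passes it through so mysql_backup
# does not try to re-acquire it.
def example_backup_with_existing_lock():
    instance = host_utils.HostAddr('backupdb-1:3306')
    lock_handle = host_utils.bind_lock_socket(backup.STD_BACKUP_LOCK_SOCKET)
    mysql_backup(instance, lock_handle=lock_handle)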