def check_replication_for_migration(source_replica_set,
                                    destination_replica_set):
    """ Confirm that replication is sane for finishing a shard migration

    Args:
    source_replica_set - Where shards are coming from
    destination_replica_set - Where shards are being sent
    """
    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    destination_slave = zk.get_mysql_instance_from_replica_set(
        destination_replica_set, host_utils.REPLICA_ROLE_SLAVE)

    # First we will confirm that the slave of the source is caught up;
    # this is important for row count comparisons.
    mysql_lib.assert_replication_unlagged(
        source_slave, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # Next, the slave of the destination replica set, for the same reason.
    mysql_lib.assert_replication_unlagged(
        destination_slave, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # Next, confirm the destination master is relatively caught up to the
    # source master.
    mysql_lib.assert_replication_unlagged(
        destination_master, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # We will also verify that the source master is not replicating. A scary
    # scenario is if there is some sort of ring replication going on and db
    # drops of blackhole dbs would propagate to the source db.
    try:
        source_slave_status = mysql_lib.get_slave_status(source_master)
    except mysql_lib.ReplicationError:
        source_slave_status = None

    if source_slave_status:
        raise Exception('Source master is set up for replication; '
                        'this is super dangerous!')

    # We will also verify that the destination master is replicating from
    # the source master.
    slave_status = mysql_lib.get_slave_status(destination_master)
    master_of_destination_master = host_utils.HostAddr(':'.join(
        (slave_status['Master_Host'], str(slave_status['Master_Port']))))
    if source_master != master_of_destination_master:
        raise Exception('Master of destination {d} is {actual} rather than '
                        'expected {expected}'
                        ''.format(d=destination_master,
                                  actual=master_of_destination_master,
                                  expected=source_master))
    log.info('Replication looks ok for migration')
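

# Illustrative sketch, added for clarity and not part of the original module:
# how the replication check might gate the final cutover of a shard
# migration. The helper name is hypothetical; the replica set names come
# from the caller.
def example_guard_migration_cutover(source_replica_set,
                                    destination_replica_set):
    try:
        check_replication_for_migration(source_replica_set,
                                        destination_replica_set)
    except Exception:
        # Any failed assertion means the migration must not be finished
        # until the underlying replication problem is resolved.
        log.info('Replication is not in a safe state; refusing to finish '
                 'the migration from {s} to {d}'
                 ''.format(s=source_replica_set, d=destination_replica_set))
        raise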


def clean_up_migration(source_replica_set):
    """ Clean up the remnants of an aborted shard migration

    Args:
    source_replica_set - The replica set shards were being migrated from
    """
    migration = start_shard_migration.check_migration_lock(source_replica_set)
    destination_replica_set = migration['destination_replica_set']
    mig_lock_identifier = migration['lock_identifier']
    zk = host_utils.MysqlZookeeper()
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)

    # If the destination master has replication configured, we will need to
    # tear it down; otherwise get_slave_status throws a ReplicationError.
    try:
        mysql_lib.get_slave_status(destination_master)
        reset_repl = True
    except mysql_lib.ReplicationError:
        reset_repl = False

    if reset_repl:
        log.info('Taking promotion locks')
        dest_lock_identifier = mysql_failover.get_promotion_lock(
            destination_replica_set)
        log.info('Removing replication from destination master {}'
                 ''.format(destination_master))
        try:
            mysql_lib.reset_slave(destination_master)
        finally:
            mysql_failover.release_promotion_lock(dest_lock_identifier)

    (orphans_tmp, orphaned_but_used_tmp, _) = \
        find_shard_mismatches.find_shard_mismatches(destination_master)
    orphans = orphans_tmp.get(destination_master, [])
    orphaned_but_used = orphaned_but_used_tmp.get(destination_master, [])

    if orphaned_but_used:
        log.info('Orphaned but used dbs: {}'.format(
            ', '.join(orphaned_but_used)))
        raise Exception('Cowardly refusing to do anything')

    if orphans:
        log.info('Orphaned dbs: {}'.format(', '.join(orphans)))
        fix_orphaned_shards.rename_db_to_drop(destination_master, orphans)

    start_shard_migration.finish_migration_log(
        mig_lock_identifier, start_shard_migration.STATUS_ABORTED)
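

# Illustrative sketch, added for clarity and not part of the original module:
# aborting a migration that will not be finished. The helper name is
# hypothetical; clean_up_migration() itself raises if any orphaned db on the
# destination is still in use, so nothing is dropped in that case.
def example_abort_migration(source_replica_set):
    log.info('Aborting shard migration from {}'.format(source_replica_set))
    clean_up_migration(source_replica_set)
    log.info('Migration from {} cleaned up and marked as '
             'aborted'.format(source_replica_set))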


def is_master_alive(master, replicas):
    """ Determine if the master is alive

    The function will:
    1. Attempt to connect to the master via the mysql protocol. If
       successful, the master is considered alive.
    2. If #1 fails, check the io thread of the replica instance(s). If
       the io thread is not running, the master will be considered dead.

    If step #1 fails and step #2 succeeds, we are in a weird state and will
    throw an exception.

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instances

    Returns:
    A mysql connection to the master if the master is alive,
    False otherwise.
    """
    if len(replicas) == 0:
        raise Exception('At least one replica must be present to determine '
                        'if a master is dead')
    try:
        master_conn = mysql_lib.connect_mysql(master)
        return master_conn
    except MySQLdb.OperationalError as detail:
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        master_conn = False
        log.info('Unable to connect to current master {master} from '
                 '{hostname}, will check replica servers before declaring '
                 'the master dead'.format(master=master,
                                          hostname=host_utils.HOSTNAME))
    except:
        log.info('This is an unknown connection error. If you are very sure '
                 'that the master is dead, please put a "return False" at '
                 'the top of is_master_alive and then send rwultsch a stack '
                 'trace')
        raise

    # We can not get a connection to the master, so poll the replica servers.
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        # If replication has not hit a timeout, a dead master can still have
        # a replica which thinks it is ok. "STOP SLAVE; START SLAVE" followed
        # by a sleep will get us truthiness.
        mysql_lib.restart_replication(conn)
        ss = mysql_lib.get_slave_status(conn)
        if ss['Slave_IO_Running'] == 'Yes':
            raise Exception('Replica {replica} thinks it can connect to '
                            'master {master}, but the failover script can '
                            'not. Possible network partition!'
                            ''.format(replica=replica, master=master))
        else:
            log.info('Replica {replica} also can not connect to master '
                     '{master}.'.format(replica=replica, master=master))

    return False
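

# Illustrative sketch, added for clarity and not part of the original module:
# how is_master_alive() might be consulted at the top of a failover routine.
# The helper name is hypothetical.
def example_failover_guard(master, replicas):
    master_conn = is_master_alive(master, replicas)
    if master_conn:
        # The master answered over the mysql protocol, so a dead-master
        # failover is not appropriate; hand the live connection back.
        log.info('Master {} is alive; not failing over'.format(master))
        return master_conn

    # is_master_alive() returned False: the master is unreachable and no
    # replica io thread can reach it either, so a failover may proceed.
    log.info('Master {} appears dead; failover may proceed'.format(master))
    return False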


def confirm_replicas_in_sync(replicas):
    """ Confirm that all replicas are in sync in terms of replication

    Args:
    replicas - A set of hostAddr objects

    Returns:
    True if every replica has executed up to the same master log file and
    position, False otherwise.
    """
    replication_progress = set()
    for replica in replicas:
        slave_status = mysql_lib.get_slave_status(replica)
        replication_progress.add(':'.join(
            (slave_status['Relay_Master_Log_File'],
             str(slave_status['Exec_Master_Log_Pos']))))

    # All replicas report the same (file, position) pair exactly when the
    # set has collapsed to a single entry.
    return len(replication_progress) == 1
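

# Illustrative sketch, added for clarity and not part of the original module:
# polling confirm_replicas_in_sync() until every replica has executed up to
# the same master position. The helper name, timeout, and poll interval are
# arbitrary examples.
def example_wait_for_replicas_in_sync(replicas, timeout=60, poll_interval=5):
    # Local import keeps the sketch self-contained.
    import time
    deadline = time.time() + timeout
    while time.time() < deadline:
        if confirm_replicas_in_sync(replicas):
            return True
        time.sleep(poll_interval)
    return False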


def confirm_replica_topology(master, replicas):
    """ Confirm that replica servers are actually replicating off of a master

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instances
    """
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        ss = mysql_lib.get_slave_status(conn)
        repl_master = host_utils.HostAddr(':'.join(
            (ss['Master_Host'], str(ss['Master_Port']))))

        if repl_master != master:
            raise Exception('Slave {replica} is not a replica of master '
                            '{master}, but is instead a replica of '
                            '{repl_master}'.format(replica=replica,
                                                   repl_master=repl_master,
                                                   master=master))
        else:
            log.info('Replica {replica} is replicating from expected master '
                     'server {master}'.format(replica=replica,
                                              master=master))
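

# Illustrative sketch, added for clarity and not part of the original module:
# a combined pre-failover sanity check built from the two helpers above. The
# helper name is hypothetical.
def example_preflight_topology_check(master, replicas):
    # Every replica must actually be replicating from the expected master...
    confirm_replica_topology(master, replicas)
    # ...and all replicas should be at the same replication position.
    if not confirm_replicas_in_sync(replicas):
        raise Exception('Replicas of {} are not at the same replication '
                        'position'.format(master))
    log.info('Replica topology and sync state look sane for '
             '{}'.format(master))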


def main():
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on "
                   "instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i', '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(), ':3306')))
    parser.add_argument('-a', '--all',
                        help='Checksums all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d', '--dbs',
                        help=("Comma separated list of db's to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q', '--quiet',
                        help='Do not print output to stdout',
                        action='store_true',
                        default=False)
    parser.add_argument('-m', '--min_diffs',
                        help=('Do per-row check if chunk diff count is at '
                              'least this value'),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M', '--max_diffs',
                        help=('Do not do per-row check if chunk diff count '
                              'is greater than this value'),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C', '--no_create_table',
                        help=('If test.checksum_detail is missing, do '
                              'not try to create it.'),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v', '--verbose',
                        help='Store raw output from PT tools in the DB?',
                        action='store_true',
                        default=False)
    parser.add_argument('-c', '--check_fraction',
                        help='Check this fraction of databases.',
                        default=DB_CHECK_FRACTION)
    args = parser.parse_args()

    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

    if instance not in \
            zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    if not mysql_lib.does_table_exist(conn, mysql_lib.METADATA_DB,
                                      CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found. Unable to continue. "
                            "Consider not using the -C option or create it "
                            "yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in (host_utils.REPLICA_ROLE_SLAVE,
                  host_utils.REPLICA_ROLE_DR_SLAVE):
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if len(slaves) == 0:
        log.info("This server has no slaves. Nothing to do.")
        sys.exit(0)

    # Before we even start this, make sure replication is OK.
    for slave in slaves:
        slave_conn = mysql_lib.connect_mysql(slave, 'scriptrw')
        ss = mysql_lib.get_slave_status(slave_conn)
        if ss['Slave_SQL_Running'] != "Yes" or ss['Slave_IO_Running'] != "Yes":
            raise Exception("Replication is NOT RUNNING on slave {s}: "
                            "SQL: {st} | IO: {it}"
                            "".format(s=slave,
                                      st=ss['Slave_SQL_Running'],
                                      it=ss['Slave_IO_Running']))

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(conn)

        if args.all:
            db_to_check = dbs
        else:
            # Default behaviour: check a given DB every N days based on the
            # day of year. This minimizes month-boundary issues.
            db_to_check = set()
            check_modulus = int(time.strftime("%j")) % int(args.check_fraction)
            counter = 0
            for db in dbs:
                modulus = counter % int(args.check_fraction)
                if modulus == check_modulus:
                    db_to_check.add(db)
                counter = counter + 1

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    for db in db_to_check:
        conn = mysql_lib.connect_mysql(instance, 'scriptro')
        tables_to_check = mysql_lib.get_tables(conn, db, skip_views=True)
        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:\n{cmd}".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # Parse each line of STDOUT (there should only be one with
            # actual data). We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if results:
                    chunk_errors = int(results[1])
                    row_count = int(results[3])
                    chunk_count = int(results[4])
                    chunk_skips = int(results[5])

                    for slave in slaves:
                        rows_checked = 'NO'
                        sync_cmd = ""
                        sync_out = ""
                        sync_err = ""
                        sync_ret = -1
                        row_diffs = 0

                        elapsed_time_ms, chunk_diffs = check_one_replica(
                            slave, db, tbl)

                        # If we skipped some chunks or there were errors, we
                        # can't have complete information about the state of
                        # the replica. In the case of a hard error, we'll
                        # just stop. In the case of a skipped chunk, we will
                        # treat it as a diff chunk for purposes of deciding
                        # whether or not to do a more detailed analysis.
                        checkable_chunks = chunk_skips + chunk_diffs

                        if chunk_errors > 0:
                            checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                        elif checkable_chunks == 0:
                            checksum_status = 'GOOD'
                        else:
                            if checkable_chunks > int(args.max_diffs):
                                # Too many chunk diffs, don't bother checking
                                # further. Not good.
                                checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                            elif checkable_chunks < int(args.min_diffs):
                                # Some diffs, but not enough that we care.
                                checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                            else:
                                start_time = int(time.time()*1000)
                                rows_checked = 'YES'

                                # Set the proper status: did we do a
                                # sync-based check because of explicit diffs
                                # or because of skipped chunks?
                                if chunk_diffs > 0:
                                    checksum_status = 'ROW_DIFFS_FOUND'
                                else:
                                    checksum_status = 'CHUNKS_WERE_SKIPPED'

                                sync_cmd, sync_out, sync_err, sync_ret, \
                                    row_diffs = checksum_tbl_via_sync(slave,
                                                                      db,
                                                                      tbl)

                                # Add in the time it took to do the sync.
                                elapsed_time_ms += int(time.time()*1000) - start_time

                                if not args.quiet:
                                    log.info("Sync command executed was:\n{cmd} ".format(cmd=sync_cmd))
                                    log.info("Standard out:\n {out}".format(out=sync_out))
                                    log.info("Standard error:\n {err}".format(err=sync_err))
                                    log.info("Return code: {ret}".format(ret=sync_ret))
                                    log.info("Row diffs found: {cnt}".format(cnt=row_diffs))

                        # Checksum process is complete, store the results.
                        data = {'instance': slave,
                                'master_instance': instance,
                                'db': db,
                                'tbl': tbl,
                                'elapsed_time_ms': elapsed_time_ms,
                                'chunk_count': chunk_count,
                                'chunk_errors': chunk_errors,
                                'chunk_diffs': chunk_diffs,
                                'chunk_skips': chunk_skips,
                                'row_count': row_count,
                                'row_diffs': row_diffs,
                                'rows_checked': rows_checked,
                                'checksum_status': checksum_status,
                                'checksum_cmd': None,
                                'checksum_stdout': None,
                                'checksum_stderr': None,
                                'checksum_rc': c_ret,
                                'sync_cmd': None,
                                'sync_stdout': None,
                                'sync_stderr': None,
                                'sync_rc': sync_ret}

                        if args.verbose:
                            data.update({'checksum_cmd': c_cmd,
                                         'checksum_stdout': c_out,
                                         'checksum_stderr': c_err,
                                         'sync_cmd': sync_cmd,
                                         'sync_stdout': sync_out,
                                         'sync_stderr': sync_err,
                                         'sync_rc': sync_ret})

                        write_checksum_status(instance, data)
        conn.close()
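

# Example invocations, added for clarity and not part of the original script.
# The script and host names are hypothetical placeholders; the flags match
# the argparse definitions in main() above.
#
#     # Checksum the default daily fraction of databases on a master:
#     ./mysql_checksum.py -i shardmaster001:3306
#
#     # Checksum two specific databases and store raw pt-tool output:
#     ./mysql_checksum.py -i shardmaster001:3306 -d db001,db002 -v
#
#     # Checksum all databases, only doing per-row checks when the number of
#     # chunk diffs is between 1 and 3:
#     ./mysql_checksum.py -i shardmaster001:3306 -a -m 1 -M 3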