def csv_backup_success_logged(instance, date):
    """ Check for log entries created by log_csv_backup_success

    Args:
    instance - A hostaddr object
    date - a string for the date

    Returns:
    True if the log table on the master holds at least one row for the
    date, False if it holds none or if the log table does not exist
    """
    zookeeper = host_utils.MysqlZookeeper()
    # The lookup is always done against the master of the replica set
    rs_name = zookeeper.get_replica_set_from_instance(instance)[0]
    master = zookeeper.get_mysql_instance_from_replica_set(rs_name)
    master_conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cur = master_conn.cursor()

    table_present = mysql_lib.does_table_exist(
        master,
        mysql_lib.METADATA_DB,
        environment_specific.CSV_BACKUP_LOG_TABLE)
    if not table_present:
        # No log table means nothing has been logged yet
        return False

    query_template = ('SELECT COUNT(*) as "cnt" '
                      'FROM {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                      'WHERE backup_date = %(date)s ')
    query = query_template.format(
        METADATA_DB=mysql_lib.METADATA_DB,
        CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE)
    cur.execute(query, {'date': date})

    row = cur.fetchone()
    return bool(row["cnt"])
def ensure_backup_locks_sanity(self):
    """ Release any backup locks that aren't sane. This means locks
        created by the same host as the caller. The instance level flock
        should allow this assumption to be correct.

    The lock table is created on the master if it is missing, then every
    row matching this instance's hostname and port is marked as released.
    """
    zookeeper = host_utils.MysqlZookeeper()
    rs_name, _ = self.instance.get_zk_replica_set()
    master = zookeeper.get_mysql_instance_from_replica_set(
        rs_name, host_utils.REPLICA_ROLE_MASTER)
    conn = mysql_lib.connect_mysql(master, role='scriptrw')
    cur = conn.cursor()

    table_present = mysql_lib.does_table_exist(master,
                                               mysql_lib.METADATA_DB,
                                               CSV_BACKUP_LOCK_TABLE_NAME)
    if not table_present:
        log.debug('Creating missing metadata table')
        create_stmt = CSV_BACKUP_LOCK_TABLE.format(
            db=mysql_lib.METADATA_DB,
            tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cur.execute(create_stmt)

    # Only locks recorded for this exact hostname/port are released
    lock_owner = {'hostname': self.instance.hostname,
                  'port': self.instance.port}
    release_template = ('UPDATE {db}.{tbl} '
                        'SET lock_active = NULL, released = NOW() '
                        'WHERE hostname = %(hostname)s AND '
                        ' port = %(port)s')
    release_stmt = release_template.format(db=mysql_lib.METADATA_DB,
                                           tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cur.execute(release_stmt, lock_owner)
    conn.commit()
def ensure_binlog_archiving_table_sanity(instance):
    """ Create binlog archiving log table if missing, purge old data

    Args:
    instance - A hostAddr object. Note: the work is always done on the
               master of the replica set that the instance belongs to
    """
    zookeeper = host_utils.MysqlZookeeper()
    rs_name = zookeeper.get_replica_set_from_instance(instance)[0]
    master = zookeeper.get_mysql_instance_from_replica_set(rs_name)
    master_conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cur = master_conn.cursor()

    table_present = mysql_lib.does_table_exist(
        master,
        mysql_lib.METADATA_DB,
        environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
    if not table_present:
        log.debug('Creating missing metadata table')
        create_stmt = BINLOG_ARCHIVING_TABLE.format(
            db=mysql_lib.METADATA_DB,
            tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
        cur.execute(create_stmt)

    # Purge rows whose binlog_creation is more than
    # S3_BINLOG_RETENTION + 1 days old
    purge_template = ("DELETE FROM {metadata_db}.{tbl} "
                      "WHERE binlog_creation < now() - INTERVAL {d} DAY")
    purge_stmt = purge_template.format(
        metadata_db=mysql_lib.METADATA_DB,
        tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME,
        d=(environment_specific.S3_BINLOG_RETENTION + 1))
    log.info(purge_stmt)
    cur.execute(purge_stmt)
    master_conn.commit()
def log_csv_backup_success(instance, date):
    """ The CSV backup check can be expensive, so let's log that it is done

    The log row is written on the master of the replica set the instance
    belongs to; the log table is created first if it is missing.

    Args:
    instance - A hostaddr object
    date - a string for the date
    """
    zookeeper = host_utils.MysqlZookeeper()
    rs_name = zookeeper.get_replica_set_from_instance(instance)[0]
    master = zookeeper.get_mysql_instance_from_replica_set(rs_name)
    master_conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cur = master_conn.cursor()

    table_present = mysql_lib.does_table_exist(
        master,
        mysql_lib.METADATA_DB,
        environment_specific.CSV_BACKUP_LOG_TABLE)
    if not table_present:
        # Parenthesised single argument prints the same text on Python 2
        print('Creating missing metadata table')
        create_stmt = CSV_BACKUP_LOG_TABLE_DEFINITION.format(
            db=mysql_lib.METADATA_DB,
            tbl=environment_specific.CSV_BACKUP_LOG_TABLE)
        cur.execute(create_stmt)

    # INSERT IGNORE: logging the same date twice is not an error
    insert_template = ('INSERT IGNORE INTO {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                       'SET backup_date = %(date)s, '
                       'completion = NOW()')
    insert_stmt = insert_template.format(
        METADATA_DB=mysql_lib.METADATA_DB,
        CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE)
    cur.execute(insert_stmt, {'date': date})
    master_conn.commit()
def ensure_binlog_archiving_table_sanity(instance):
    """ Create binlog archiving log table if missing, purge old data

    Args:
    instance - A hostAddr object. Note: the master of the instance's
               replica set is looked up and all statements run there
    """
    tbl_name = environment_specific.BINLOG_ARCHIVING_TABLE_NAME
    metadata_db = mysql_lib.METADATA_DB

    zookeeper = host_utils.MysqlZookeeper()
    rs_name = zookeeper.get_replica_set_from_instance(instance)[0]
    master = zookeeper.get_mysql_instance_from_replica_set(rs_name)
    master_conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cur = master_conn.cursor()

    if not mysql_lib.does_table_exist(master, metadata_db, tbl_name):
        log.debug('Creating missing metadata table')
        cur.execute(BINLOG_ARCHIVING_TABLE.format(db=metadata_db,
                                                  tbl=tbl_name))

    # Anything older than S3_BINLOG_RETENTION + 1 days is deleted
    max_age_days = environment_specific.S3_BINLOG_RETENTION + 1
    purge_stmt = ("DELETE FROM {metadata_db}.{tbl} "
                  "WHERE binlog_creation < now() - INTERVAL {d} DAY"
                  "").format(metadata_db=metadata_db,
                             tbl=tbl_name,
                             d=max_age_days)
    log.info(purge_stmt)
    cur.execute(purge_stmt)
    master_conn.commit()
def ensure_backup_locks_sanity(self):
    """ Release any backup locks that aren't sane. This means locks
        created by the same host as the caller. The instance level lock
        should allow this assumption to be correct.

    The lock table is created on the master if it is missing, then every
    row matching this instance's hostname and port is marked as released.
    """
    zookeeper = host_utils.MysqlZookeeper()
    rs_name = zookeeper.get_replica_set_from_instance(self.instance)
    master = zookeeper.get_mysql_instance_from_replica_set(
        rs_name,
        host_utils.REPLICA_ROLE_MASTER)
    conn = mysql_lib.connect_mysql(master, role='dbascript')
    cur = conn.cursor()

    table_present = mysql_lib.does_table_exist(master,
                                               mysql_lib.METADATA_DB,
                                               CSV_BACKUP_LOCK_TABLE_NAME)
    if not table_present:
        log.debug('Creating missing metadata table')
        create_stmt = CSV_BACKUP_LOCK_TABLE.format(
            db=mysql_lib.METADATA_DB,
            tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cur.execute(create_stmt)

    # Only locks recorded for this exact hostname/port are released
    lock_owner = {
        'hostname': self.instance.hostname,
        'port': self.instance.port,
    }
    release_template = ('UPDATE {db}.{tbl} '
                        'SET lock_active = NULL, released = NOW() '
                        'WHERE hostname = %(hostname)s AND '
                        ' port = %(port)s')
    release_stmt = release_template.format(db=mysql_lib.METADATA_DB,
                                           tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cur.execute(release_stmt, lock_owner)
    conn.commit()
def csv_backup_success_logged(instance, date):
    """ Check for log entries created by log_csv_backup_success

    Args:
    instance - A hostaddr object
    date - a string for the date

    Returns:
    True if the master's log table has a row for the date, False when it
    has none or when the log table is missing
    """
    zookeeper = host_utils.MysqlZookeeper()
    rs_name = zookeeper.get_replica_set_from_instance(instance)[0]
    master = zookeeper.get_mysql_instance_from_replica_set(rs_name)
    master_conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cur = master_conn.cursor()

    if mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                  environment_specific.CSV_BACKUP_LOG_TABLE):
        count_stmt = ('SELECT COUNT(*) as "cnt" '
                      'FROM {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                      'WHERE backup_date = %(date)s '
                      '').format(
                          METADATA_DB=mysql_lib.METADATA_DB,
                          CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE)
        cur.execute(count_stmt, {'date': date})
        # A non-zero count means the backup was already logged
        return bool(cur.fetchone()["cnt"])

    # Without a log table nothing can have been logged
    return False
def log_csv_backup_success(instance, date):
    """ The CSV backup check can be expensive, so let's log that it is done

    Writes one row for the date into the log table on the master of the
    instance's replica set, creating that table first when it is absent.

    Args:
    instance - A hostaddr object
    date - a string for the date
    """
    zookeeper = host_utils.MysqlZookeeper()
    rs_name = zookeeper.get_replica_set_from_instance(instance)[0]
    master = zookeeper.get_mysql_instance_from_replica_set(rs_name)
    master_conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cur = master_conn.cursor()

    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      environment_specific.CSV_BACKUP_LOG_TABLE):
        # Parenthesised single argument prints the same text on Python 2
        print('Creating missing metadata table')
        cur.execute(
            CSV_BACKUP_LOG_TABLE_DEFINITION.format(
                db=mysql_lib.METADATA_DB,
                tbl=environment_specific.CSV_BACKUP_LOG_TABLE))

    # INSERT IGNORE: logging the same date twice is not an error
    insert_stmt = ('INSERT IGNORE INTO {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                   'SET backup_date = %(date)s, '
                   'completion = NOW()'
                   '').format(
                       METADATA_DB=mysql_lib.METADATA_DB,
                       CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE)
    cur.execute(insert_stmt, {'date': date})
    master_conn.commit()
def log_table_sizes(port):
    """ Determine and record the size of tables on a MySQL instance

    The sizes of the instance at host_utils.HOSTNAME:port are collected and
    written, one row per partition, to the table size table on the master
    of that instance's replica set. The table is created if it is missing.

    Args:
    port - the port of the instance, as an int or a string
    """
    # str.join() only accepts strings, so an int port must be coerced
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME,
                                             str(port))))
    zk = host_utils.MysqlZookeeper()

    replica_set = zk.get_replica_set_from_instance(instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      TABLE_SIZE_TBL):
        create_table_size_table(master)

    sizes = get_all_table_sizes(instance)
    conn = mysql_lib.connect_mysql(master, 'dbascript')

    # The statement text is the same for every row, so build it once
    sql = ('REPLACE INTO {metadata_db}.{tbl} '
           'SET '
           'hostname = %(hostname)s, '
           'port = %(port)s, '
           'db = %(db)s, '
           'table_name = %(table)s, '
           'partition_name = %(partition)s, '
           'reported_at = CURDATE(), '
           'size_mb = %(size)s ').format(metadata_db=mysql_lib.METADATA_DB,
                                         tbl=TABLE_SIZE_TBL)

    for db in sizes:
        for table in sizes[db]:
            for partition in sizes[db][table]:
                cursor = conn.cursor()
                cursor.execute(sql, {
                    'hostname': instance.hostname,
                    'port': instance.port,
                    'db': db,
                    'table': table,
                    'partition': partition,
                    'size': sizes[db][table][partition]
                })
                # Each row is committed and logged individually
                conn.commit()
                log.info(cursor._executed)
                cursor.close()
def log_table_sizes(port):
    """ Determine and record the size of tables on a MySQL instance

    The sizes of the instance at host_utils.HOSTNAME:port are collected and
    written, one row per partition, to the table size table on the master
    of that instance's replica set. The table is created if it is missing.

    Args:
    port - the port of the instance, as an int or a string
    """
    # str.join() only accepts strings, so an int port must be coerced
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME,
                                             str(port))))
    zk = host_utils.MysqlZookeeper()

    replica_set = instance.get_zk_replica_set()[0]
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      TABLE_SIZE_TBL):
        create_table_size_table(master)

    sizes = get_all_table_sizes(instance)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')

    # The statement text is the same for every row, so build it once
    sql = ('REPLACE INTO {metadata_db}.{tbl} '
           'SET '
           'hostname = %(hostname)s, '
           'port = %(port)s, '
           'db = %(db)s, '
           'table_name = %(table)s, '
           'partition_name = %(partition)s, '
           'reported_at = CURDATE(), '
           'size_mb = %(size)s ').format(metadata_db=mysql_lib.METADATA_DB,
                                         tbl=TABLE_SIZE_TBL)

    for db in sizes:
        for table in sizes[db]:
            for partition in sizes[db][table]:
                cursor = conn.cursor()
                cursor.execute(sql, {
                    'hostname': instance.hostname,
                    'port': instance.port,
                    'db': db,
                    'table': table,
                    'partition': partition,
                    'size': sizes[db][table][partition]
                })
                # Each row is committed and logged individually
                conn.commit()
                log.info(cursor._executed)
                cursor.close()
def main():
    """ Checksum the tables of a master against each of its slaves

    Parses the command line, checks that the target instance is a master in
    ZK, makes sure the checksum table exists (creating it unless -C was
    given) and that replication is running on every slave. It then runs
    checksum_tbl on every table of the selected databases and, for each
    slave, stores one result row through write_checksum_status. A per-row
    check via checksum_tbl_via_sync is only done when the number of
    differing plus skipped chunks lies between --min_diffs and --max_diffs.

    Exits with status 0 if the replica set has no slaves.

    Raises:
    Exception - if the instance is not a master in ZK, if the checksum
                table is missing and may not be created, or if replication
                is not running on one of the slaves
    """
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i',
                        '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(), ':3306')))
    parser.add_argument('-a',
                        '--all',
                        help='Checksums all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d',
                        '--dbs',
                        help=("Comma separated list of db's to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q',
                        '--quiet',
                        help=("Do not print output to stdout"),
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--min_diffs',
                        help=("Do per-row check if chunk diff count is at "
                              "least this value"),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M',
                        '--max_diffs',
                        help=("Do not do per-row check if chunk diff count "
                              "is greater than this value"),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C',
                        '--no_create_table',
                        help=("If test.checksum_detail is missing, do "
                              "not try to create it."),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v',
                        '--verbose',
                        help=("Store raw output from PT tools in the DB?"),
                        action='store_true',
                        default=False)
    parser.add_argument('-c',
                        '--check_fraction',
                        help=('Check this fraction of databases.'),
                        default=DB_CHECK_FRACTION)

    args = parser.parse_args()
    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

    if instance not in \
            zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    if not mysql_lib.does_table_exist(conn, mysql_lib.METADATA_DB,
                                      CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found. Unable to continue. "
                            "Consider not using the -C option or create it "
                            "yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in host_utils.REPLICA_ROLE_SLAVE, host_utils.REPLICA_ROLE_DR_SLAVE:
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if not slaves:
        log.info("This server has no slaves. Nothing to do.")
        sys.exit(0)

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        slave_conn = mysql_lib.connect_mysql(slave, 'scriptrw')
        ss = mysql_lib.get_slave_status(slave_conn)
        if ss['Slave_SQL_Running'] != "Yes" or ss['Slave_IO_Running'] != "Yes":
            # every placeholder of the message must be supplied, otherwise
            # format() raises KeyError instead of reporting the slave
            raise Exception("Replication is NOT RUNNING on slave {s}: "
                            "SQL: {st} | IO: {it}".format(
                                s=slave,
                                st=ss['Slave_SQL_Running'],
                                it=ss['Slave_IO_Running']))

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(conn)

        if args.all:
            db_to_check = dbs
        else:
            # default behaviour, check a given DB every N days based on
            # day of year. minimizes month-boundary issues.
            db_to_check = set()
            check_fraction = int(args.check_fraction)
            check_modulus = int(time.strftime("%j")) % check_fraction
            for counter, db in enumerate(dbs):
                if counter % check_fraction == check_modulus:
                    db_to_check.add(db)

    # The initial connection is not used past this point; a fresh one is
    # opened for every database below.
    conn.close()

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    #
    for db in db_to_check:
        # The connection is only needed to list the tables, so it is
        # released before the checksum commands start.
        conn = mysql_lib.connect_mysql(instance, 'scriptro')
        try:
            tables_to_check = mysql_lib.get_tables(conn, db, skip_views=True)
        finally:
            conn.close()

        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:\n{cmd}".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # parse each line of STDOUT (there should only be one with
            # actual data). We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if not results:
                    continue

                chunk_errors = int(results[1])
                row_count = int(results[3])
                chunk_count = int(results[4])
                chunk_skips = int(results[5])

                for slave in slaves:
                    rows_checked = 'NO'
                    sync_cmd = ""
                    sync_out = ""
                    sync_err = ""
                    sync_ret = -1
                    row_diffs = 0

                    elapsed_time_ms, chunk_diffs = check_one_replica(slave,
                                                                     db, tbl)

                    # if we skipped some chunks or there were errors,
                    # this means we can't have complete information about
                    # the state of the replica. in the case of a hard
                    # error, we'll just stop. in the case of a skipped
                    # chunk, we will treat it as a different chunk for
                    # purposes of deciding whether or not to do a more
                    # detailed analysis.
                    #
                    checkable_chunks = chunk_skips + chunk_diffs

                    if chunk_errors > 0:
                        checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                    elif checkable_chunks == 0:
                        checksum_status = 'GOOD'
                    elif checkable_chunks > int(args.max_diffs):
                        # too many chunk diffs, don't bother checking
                        # further. not good.
                        checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                    elif checkable_chunks < int(args.min_diffs):
                        # some diffs, but not enough that we care.
                        checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                    else:
                        start_time = int(time.time()*1000)
                        rows_checked = 'YES'

                        # set the proper status - did we do a sync-based
                        # check because of explicit diffs or because of
                        # skipped chunks?
                        if chunk_diffs > 0:
                            checksum_status = 'ROW_DIFFS_FOUND'
                        else:
                            checksum_status = 'CHUNKS_WERE_SKIPPED'

                        sync_cmd, sync_out, sync_err, sync_ret, \
                            row_diffs = checksum_tbl_via_sync(slave, db, tbl)

                        # Add in the time it took to do the sync.
                        elapsed_time_ms += int(time.time()*1000) - start_time

                        if not args.quiet:
                            log.info("Sync command executed was:\n{cmd} ".format(cmd=sync_cmd))
                            log.info("Standard out:\n {out}".format(out=sync_out))
                            log.info("Standard error:\n {err}".format(err=sync_err))
                            log.info("Return code: {ret}".format(ret=sync_ret))
                            log.info("Row diffs found: {cnt}".format(cnt=row_diffs))

                    # Checksum process is complete, store the results.
                    #
                    data = {'instance': slave,
                            'master_instance': instance,
                            'db': db,
                            'tbl': tbl,
                            'elapsed_time_ms': elapsed_time_ms,
                            'chunk_count': chunk_count,
                            'chunk_errors': chunk_errors,
                            'chunk_diffs': chunk_diffs,
                            'chunk_skips': chunk_skips,
                            'row_count': row_count,
                            'row_diffs': row_diffs,
                            'rows_checked': rows_checked,
                            'checksum_status': checksum_status,
                            'checksum_cmd': None,
                            'checksum_stdout': None,
                            'checksum_stderr': None,
                            'checksum_rc': c_ret,
                            'sync_cmd': None,
                            'sync_stdout': None,
                            'sync_stderr': None,
                            'sync_rc': sync_ret}

                    # The raw tool output is only stored on request.
                    if args.verbose:
                        data.update({'checksum_cmd': c_cmd,
                                     'checksum_stdout': c_out,
                                     'checksum_stderr': c_err,
                                     'sync_cmd': sync_cmd,
                                     'sync_stdout': sync_out,
                                     'sync_stderr': sync_err,
                                     'sync_rc': sync_ret})

                    write_checksum_status(instance, data)
def main(): description = ( "MySQL checksum wrapper\n\n" "Wrapper of pt-table-checksum and pt-table-sync.\n" "Defaults to checksumming 1/{k}th of databases on instance.\n" "If diffs are found, use pt-table-sync to measure actual " "divergence,\nbut only if the number of diffs is between " "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION) parser = argparse.ArgumentParser( description=description, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('-i', '--instance', help='Instance to act on if other than localhost:3306', default=''.join((socket.getfqdn(), ':3306'))) parser.add_argument('-a', '--all', help='Checksums all dbs rather than the default', action='store_true', default=False) parser.add_argument('-d', '--dbs', help=("Comma separated list of db's to check rather " "than the default"), default=False) parser.add_argument('-q', '--quiet', help=("Do not print output to stdout"), action='store_true', default=False) parser.add_argument('-m', '--min_diffs', help=("Do per-row check if chunk diff count is at " "least this value"), dest='min_diffs', default=MIN_DIFFS) parser.add_argument('-M', '--max_diffs', help=("Do not do per-row check if chunk diff count " "is greater than this value"), dest='max_diffs', default=MAX_DIFFS) parser.add_argument('-C', '--no_create_table', help=("If test.checksum_detail is missing, do " "not try to create it."), dest='create_table', action='store_false', default=True) parser.add_argument('-v', '--verbose', help=("Store raw output from PT tools in the DB?"), action='store_true', default=False) parser.add_argument('-c', '--check_fraction', help=('Check this fraction of databases.'), default=DB_CHECK_FRACTION) args = parser.parse_args() instance = host_utils.HostAddr(args.instance) zk = host_utils.MysqlZookeeper() if instance not in \ zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER): raise Exception("Instance is not a master in ZK") # If enabled, try to create the table that holds the checksum 
info. # If not enabled, make sure that the table exists. if not mysql_lib.does_table_exist(instance, mysql_lib.METADATA_DB, CHECKSUM_TBL): if args.create_table: create_checksum_detail_table(instance) else: raise Exception("Checksum table not found. Unable to continue." "Consider not using the -C option or create it " "yourself.") # Determine what replica set we belong to and get a list of slaves. replica_set = zk.get_replica_set_from_instance(instance) slaves = set() for rtype in host_utils.REPLICA_ROLE_SLAVE, host_utils.REPLICA_ROLE_DR_SLAVE: s = zk.get_mysql_instance_from_replica_set(replica_set, rtype) if s: slaves.add(s) if len(slaves) == 0: log.info("This server has no slaves. Nothing to do.") return # in theory, we could allow multiple instances of this script to run # on one server, as long as they are checksumming different replica sets. # try: lock = host_utils.bind_lock_socket('CHECKSUM_{}'.format(replica_set)) except socket.error, (code, msg): log.error("Unable to bind socket for checksum on {rs} " "(msg: {m}, code:{c})".format(rs=replica_set, m=msg, c=code)) sys.exit(code)