def backup_instance(self):
    """ Back up a replica instance to s3 in csv.

    Stops the SQL replication thread to get a consistent snapshot,
    fans dump work out to worker processes, then restarts replication
    while the workers drain the queue.

    Raises:
        Exception - if worker processes exit with work still queued.
    """
    host_lock_handle = None
    try:
        log.info('Backup for instance {i} started at {t}'
                 ''.format(t=str(self.timestamp),
                           i=self.instance))
        log.info('Checking heartbeat to make sure replication is not too '
                 'lagged.')
        self.check_replication_for_backup()

        log.info('Taking host backup lock')
        host_lock_handle = host_utils.take_flock_lock(
            backup.BACKUP_LOCK_FILE)

        log.info('Setting up export directory structure')
        self.setup_and_get_tmp_path()
        log.info('Will temporarily dump inside of {path}'
                 ''.format(path=self.dump_base_path))

        log.info('Releasing any invalid shard backup locks')
        self.ensure_backup_locks_sanity()

        log.info('Deleting old expired locks')
        self.purge_old_expired_locks()

        log.info('Stopping replication SQL thread to get a snapshot')
        mysql_lib.stop_replication(self.instance,
                                   mysql_lib.REPLICATION_THREAD_SQL)

        workers = []
        # Integer division: cpu_count() / 2 is a float on Python 3 and
        # would make range() raise. Also guarantee at least one worker,
        # otherwise a single-core host would start no workers and the
        # non-empty queue check below would always raise.
        for _ in range(max(1, multiprocessing.cpu_count() // 2)):
            proc = multiprocessing.Process(
                target=self.mysql_backup_csv_dbs)
            proc.daemon = True
            proc.start()
            workers.append(proc)

        # throw in a sleep to make sure all threads have started dumps
        time.sleep(2)

        log.info('Restarting replication')
        mysql_lib.start_replication(self.instance,
                                    mysql_lib.REPLICATION_THREAD_SQL)

        for worker in workers:
            worker.join()

        # Workers exiting with work left means at least one died abnormally.
        if not self.dbs_to_backup.empty():
            raise Exception('All worker processes have completed, but '
                            'work remains in the queue')

        log.info('CSV backup is complete, will run a check')
        mysql_backup_status.verify_csv_backup(self.instance.replica_type,
                                              self.datestamp,
                                              self.instance)
    finally:
        if host_lock_handle:
            log.info('Releasing general host backup lock')
            host_utils.release_flock_lock(host_lock_handle)
def mysql_backup(instance,
                 backup_type=backup.BACKUP_TYPE_XBSTREAM,
                 initial_build=False):
    """ Run a file based backup on a supplied local instance

    Args:
        instance - A hostaddr object
        backup_type - backup.BACKUP_TYPE_LOGICAL or
                      backup.BACKUP_TYPE_XBSTREAM
        initial_build - Boolean, if this is being created right after the
                        server was built

    Raises:
        Exception - if backup_type is not a supported type.
    """
    log.info('Confirming sanity of replication (if applicable)')
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(instance)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate. instance is not in production.
        replica_type = None

    if replica_type and replica_type != host_utils.REPLICA_ROLE_MASTER:
        mysql_lib.assert_replication_sanity(instance)

    log.info('Logging initial status to mysqlops')
    start_timestamp = time.localtime()
    lock_handle = None
    # backup_id is falsy if the central log DB could not be written;
    # we still run the backup in that case.
    backup_id = mysql_lib.start_backup_log(instance, backup_type,
                                           start_timestamp)

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        # Actually run the backup
        log.info('Running backup')
        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            backup_file = backup.xtrabackup_instance(instance,
                                                     start_timestamp,
                                                     initial_build)
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            backup_file = backup.logical_backup_instance(instance,
                                                         start_timestamp,
                                                         initial_build)
        else:
            raise Exception('Unsupported backup type {backup_type}'
                            ''.format(backup_type=backup_type))
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)

    # Update database with additional info now that backup is done.
    if backup_id:
        log.info("Updating database log entry with final backup info")
        mysql_lib.finalize_backup_log(backup_id, backup_file)
    else:
        log.info("The backup is complete, but we were not able to "
                 "write to the central log DB.")
def backup_instance(self):
    """ Back up a replica instance to s3 in csv.

    Stops the SQL replication thread for a consistent snapshot, runs
    the csv dumps in worker processes, restarts replication, and then
    verifies the finished backup.

    Raises:
        Exception - if worker processes exit with work still queued.
    """
    host_lock_handle = None
    try:
        log.info('Backup for instance {i} started at {t}'
                 ''.format(t=str(self.timestamp),
                           i=self.instance))
        log.info('Checking heartbeat to make sure replication is not too '
                 'lagged.')
        self.check_replication_for_backup()

        log.info('Taking host backup lock')
        host_lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        log.info('Setting up export directory structure')
        self.setup_and_get_tmp_path()
        log.info('Will temporarily dump inside of {path}'
                 ''.format(path=self.dump_base_path))

        log.info('Releasing any invalid shard backup locks')
        self.ensure_backup_locks_sanity()

        log.info('Deleting old expired locks')
        self.purge_old_expired_locks()

        log.info('Stopping replication SQL thread to get a snapshot')
        mysql_lib.stop_replication(self.instance,
                                   mysql_lib.REPLICATION_THREAD_SQL)

        workers = []
        # Integer division: cpu_count() / 2 is a float on Python 3 and
        # would make range() raise. Guarantee at least one worker so a
        # single-core host doesn't start zero workers and then always
        # fail the queue-empty check below.
        for _ in range(max(1, multiprocessing.cpu_count() // 2)):
            proc = multiprocessing.Process(target=self.mysql_backup_csv_dbs)
            proc.daemon = True
            proc.start()
            workers.append(proc)

        # throw in a sleep to make sure all threads have started dumps
        time.sleep(2)

        log.info('Restarting replication')
        mysql_lib.start_replication(self.instance,
                                    mysql_lib.REPLICATION_THREAD_SQL)

        for worker in workers:
            worker.join()

        # If work remains, at least one worker died abnormally.
        if not self.dbs_to_backup.empty():
            raise Exception('All worker processes have completed, but '
                            'work remains in the queue')

        log.info('CSV backup is complete, will run a check')
        mysql_backup_status.verify_csv_backup(self.instance.replica_type,
                                              self.datestamp,
                                              self.instance)
    finally:
        if host_lock_handle:
            log.info('Releasing general host backup lock')
            host_utils.release_flock_lock(host_lock_handle)
def mysql_backup(instance, backup_type=backup.BACKUP_TYPE_XBSTREAM):
    """ Run a file based backup on a supplied local instance

    Args:
    instance - A hostaddr object
    """
    log.info('Logging initial status to mysqlops')
    backup_started = time.localtime()
    flock = None
    # A falsy backup_id means the central log DB was unreachable; the
    # backup still proceeds and we just skip the final log update.
    backup_id = mysql_lib.start_backup_log(instance, backup_type,
                                           backup_started)

    # Serialize against any other backup-related job on this host.
    try:
        log.info('Taking backup lock')
        flock = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        log.info('Cleaning up old backups')
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)

        # Dispatch to the implementation for the requested backup type.
        log.info('Running backup')
        runner = {
            backup.BACKUP_TYPE_XBSTREAM: backup.xtrabackup_instance,
            backup.BACKUP_TYPE_LOGICAL: backup.logical_backup_instance,
        }.get(backup_type)
        if runner is None:
            raise Exception('Unsupported backup type {backup_type}'
                            ''.format(backup_type=backup_type))
        backup_file = runner(instance, backup_started)

        # Ship the produced artifact off-host.
        log.info('Uploading file to s3')
        backup.s3_upload(backup_file)
    finally:
        if flock:
            log.info('Releasing lock')
            host_utils.release_flock_lock(flock)

    # Update database with additional info now that backup is done.
    if backup_id:
        log.info("Updating database log entry with final backup info")
        mysql_lib.finalize_backup_log(backup_id, backup_file,
                                      size=os.stat(backup_file).st_size)
    else:
        log.info("The backup is complete, but we were not able to "
                 "write to the central log DB.")

    # Running purge again
    log.info('Purging backups again')
    purge_mysql_backups.purge_mysql_backups(instance)
def mysql_backup(instance,
                 backup_type=backup.BACKUP_TYPE_XBSTREAM,
                 initial_build=False):
    """ Run a file based backup on a supplied local instance

    Args:
        instance - A hostaddr object
        backup_type - backup.BACKUP_TYPE_LOGICAL or
                      backup.BACKUP_TYPE_XBSTREAM
        initial_build - Boolean, if this is being created right after the
                        server was built

    Raises:
        Exception - if backup_type is not a supported type.
    """
    log.info('Confirming sanity of replication (if applicable)')
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(instance)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate. instance is not in production.
        replica_type = None

    if replica_type and replica_type != host_utils.REPLICA_ROLE_MASTER:
        mysql_lib.assert_replication_sanity(instance)

    log.info('Logging initial status to mysqlops')
    start_timestamp = time.localtime()
    lock_handle = None
    # backup_id is falsy if the central log DB could not be written;
    # the backup still runs in that case.
    backup_id = mysql_lib.start_backup_log(instance, backup_type,
                                           start_timestamp)

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        # Actually run the backup
        log.info('Running backup')
        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            backup_file = backup.xtrabackup_instance(instance,
                                                     start_timestamp,
                                                     initial_build)
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            backup_file = backup.logical_backup_instance(instance,
                                                         start_timestamp,
                                                         initial_build)
        else:
            raise Exception('Unsupported backup type {backup_type}'
                            ''.format(backup_type=backup_type))
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)

    # Update database with additional info now that backup is done.
    if backup_id:
        log.info("Updating database log entry with final backup info")
        mysql_lib.finalize_backup_log(backup_id, backup_file)
    else:
        log.info("The backup is complete, but we were not able to "
                 "write to the central log DB.")
def archive_mysql_binlogs(port, dry_run):
    """ Flush logs and upload all binary logs that don't exist to s3

    Arguments:
    port - Port of the MySQL instance on which to act
    dry_run - Display output but do not upload
    """
    binlog_rotator.rotate_binlogs_if_needed(port, dry_run)
    zk = host_utils.MysqlZookeeper()
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME,
                                             str(port))))
    if zk.get_replica_set_from_instance(instance)[0] is None:
        log.info('Instance is not in production, exiting')
        return

    lock_handle = None
    ensure_binlog_archiving_table_sanity(instance)
    try:
        log.info('Taking binlog archiver lock')
        lock_handle = host_utils.take_flock_lock(BINLOG_LOCK_FILE)
        log_bin_dir = host_utils.get_cnf_setting('log_bin', port)
        bin_logs = mysql_lib.get_master_logs(instance)
        logged_uploads = get_logged_binlog_uploads(instance)
        # The last (newest) binlog is still being written to, so skip it.
        for binlog in bin_logs[:-1]:
            err_count = 0
            local_file = os.path.join(os.path.dirname(log_bin_dir),
                                      binlog['Log_name'])
            if already_uploaded(instance, local_file, logged_uploads):
                continue
            success = False
            while not success:
                try:
                    upload_binlog(instance, local_file, dry_run)
                    success = True
                except Exception:
                    # Narrowed from a bare except: a ^c should abort the
                    # archiver rather than be retried like an upload error.
                    if err_count > MAX_ERRORS:
                        # Message fixed to reference MAX_ERRORS, the
                        # constant actually checked here.
                        log.error('Error count in thread > MAX_ERRORS. '
                                  'Aborting :(')
                        raise
                    log.error('error: {e}'.format(e=traceback.format_exc()))
                    err_count += 1
                    # Linear backoff between retries.
                    time.sleep(err_count * 2)

        log.info('Archiving complete')
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
def mysql_backup(instance, backup_type=backup.BACKUP_TYPE_XBSTREAM):
    """ Run a file based backup on a supplied local instance

    Args:
    instance - A hostaddr object
    """
    log.info("Logging initial status to mysqlops")
    started_at = time.localtime()
    host_lock = None
    # A falsy backup_id means the central log entry could not be written;
    # we continue with the backup regardless.
    backup_id = mysql_lib.start_backup_log(instance, backup_type, started_at)

    # Hold the host-wide backup flock across purge, backup and upload.
    try:
        log.info("Taking backup lock")
        host_lock = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        log.info("Cleaning up old backups")
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)

        # Run the requested flavor of backup.
        log.info("Running backup")
        if backup_type == backup.BACKUP_TYPE_LOGICAL:
            backup_file = backup.logical_backup_instance(instance, started_at)
        elif backup_type == backup.BACKUP_TYPE_XBSTREAM:
            backup_file = backup.xtrabackup_instance(instance, started_at)
        else:
            raise Exception("Unsupported backup type {backup_type}"
                            "".format(backup_type=backup_type))

        # Push the finished artifact to s3.
        log.info("Uploading file to s3")
        backup.s3_upload(backup_file)
    finally:
        if host_lock:
            log.info("Releasing lock")
            host_utils.release_flock_lock(host_lock)

    # Record final details centrally if the initial log entry succeeded.
    if backup_id:
        log.info("Updating database log entry with final backup info")
        mysql_lib.finalize_backup_log(backup_id, backup_file,
                                      size=os.stat(backup_file).st_size)
    else:
        log.info("The backup is complete, but we were not able to "
                 "write to the central log DB.")

    # Second purge pass now that the new backup exists.
    log.info("Purging backups again")
    purge_mysql_backups.purge_mysql_backups(instance)
def purge_mysql_backups(instance, skip_lock=False):
    """ Remove old backup files for a supplied local instance

    Args:
    instance - A hostaddr object
    skip_lock - Don't take out a lock against other backup related functions
                running
    """
    lock_handle = None
    try:
        if not skip_lock:
            log.info('Taking backup lock')
            lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)
        (temp_path, target_path) = backup.get_paths(str(instance.port))
        log.info("Cleaning up any partial backups")
        backup.remove_backups(temp_path,
                              keep_newest=0,
                              extension=('.xbstream', '.sql.gz'))
        log.info("Purge xtrabackup backups to the "
                 "last {cnt}".format(cnt=KEEP_OLD_XTRABACKUP))
        # Trailing comma added: ('.xbstream') is just a parenthesized
        # string, not a tuple like the call above passes.
        backup.remove_backups(target_path,
                              keep_newest=KEEP_OLD_XTRABACKUP,
                              extension=('.xbstream',))
        log.info("Purge logical backup backups to the "
                 "last {cnt}".format(cnt=KEEP_OLD_LOGICAL))
        backup.remove_backups(target_path,
                              keep_newest=KEEP_OLD_LOGICAL,
                              extension=('.sql.gz',))
        # NOTE(review): 777 here is decimal, not the octal 0777 mode
        # string usually intended — confirm how change_perms interprets it.
        log.info("Chmod'ing {target}".format(target=backup.TARGET_DIR))
        host_utils.change_perms(backup.TARGET_DIR, 777)
        log.info("Chmod'ing {temp}".format(temp=backup.TEMP_DIR))
        host_utils.change_perms(backup.TEMP_DIR, 777)
    finally:
        if not skip_lock and lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
def restore_instance(restore_source, destination, restore_type, restore_file,
                     no_repl, date, add_to_zk, skip_production_check,
                     test_restore):
    """ Restore a MySQL backup on to localhost

    Args:
    restore_source - A hostaddr object for where to pull a backup from
    destination - A hostaddr object for where to restore the backup
    restore_type - How to pull the backup, options are 's3',
                   'remote_server' and 'local_file'
    restore_file - Path to a local backup file (used with 'local_file')
    no_repl - Should replication be not started. It will always be setup.
    date - What date should the backup be from
    add_to_zk - Should the instance be added to zk. If so, the log from
                the host being launched will be consulted.
    skip_production_check - Do not check if the host is already in zk for
                            production use.
    test_restore - Use less ram and shutdown the instance after going
                   through the motions of a restore.
    """
    (temp_dir, target_dir) = backup.get_paths(str(destination.port))
    log.info('Supplied source is {source}'.format(source=restore_source))
    log.info('Supplied destination is {dest}'.format(dest=destination))
    log.info('Restore type is {rest}'.format(rest=restore_type))
    log.info('Local restore file is {file}'.format(file=restore_file))
    log.info('Desired date of restore {date}'.format(date=date))
    if test_restore == 'test':
        log.info('Running restore in test mode')

    # Try to prevent unintentional destruction of prod servers
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(destination)
    except:
        # instance is not in production (any lookup failure is treated as
        # "not in zk"; note this bare except also swallows ^c)
        replica_type = None
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run.
        # No ifs, ands, or buts/
        raise Exception('Restore script must never run on a master')
    if replica_type:
        if skip_production_check:
            log.info('Ignoring production check. We hope you know what you '
                     'are doing and we will try to take a backup in case '
                     'you are wrong.')
            try:
                mysql_backup.mysql_backup(destination)
            except Exception as e:
                log.error(e)
                log.warning('Unable to take a backup. We will give you {time} '
                            'seconds to change your mind and ^c.'
                            ''.format(time=SCARY_TIMEOUT))
                time.sleep(SCARY_TIMEOUT)
        else:
            raise Exception("It appears {instance} is in use. This is"
                            " very dangerous!".format(instance=destination))

    # Take a lock to prevent multiple restores from running concurrently
    log.info('Taking a flock to block another restore from starting')
    lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

    log.info('Rebuilding cnf files just in case')
    mysql_cnf_builder.build_cnf()

    mysql_init_server.create_and_chown_dirs(destination.port)

    # load some data from the mysql conf file
    datadir = host_utils.get_cnf_setting('datadir', destination.port)

    # Where will we look for a backup?
    if restore_type != 'local_file':
        (restore_type, restore_source,
         restore_file, restore_size) = find_a_backup_to_restore(restore_type,
                                                                restore_source,
                                                                destination,
                                                                date)
    # Not using an if/else because find_a_backup_to_restore could set to
    # local_file if the file has already been downloaded.
    if restore_type == 'local_file':
        restore_source = backup.get_host_from_backup(restore_file)
        # restore_size will be computed in the unpack function
        restore_size = None

    log.info('Detected the source of backup as '
             '{src}'.format(src=restore_source))

    if restore_source.get_zk_replica_set():
        replica_set = restore_source.get_zk_replica_set()[0]
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
    else:
        # ZK has no idea what this replica set is, probably a new replica set.
        master = restore_source

    # Start logging
    # NOTE(review): 'source_instance' is populated with the *destination*
    # hostname here — confirm against the restore-log schema whether that
    # is intended.
    row_id = backup.start_restore_log(master,
                                      {'restore_type': restore_type,
                                       'test_restore': test_restore,
                                       'restore_source': restore_source,
                                       'restore_port': destination.port,
                                       'restore_file': restore_file,
                                       'source_instance': destination.hostname,
                                       'restore_date': date,
                                       'replication': no_repl,
                                       'zookeeper': add_to_zk})

    # Giant try to allow logging if anything goes wrong.
    try:
        # If we hit an exception, this status will be used. If not, it will
        # be overwritten
        restore_log_update = {'restore_status': 'BAD'}
        log.info('Quick sanity check')
        mysql_init_server.basic_host_sanity()

        log.info('Shutting down MySQL')
        host_utils.stop_mysql(destination.port)

        log.info('Removing any existing MySQL data')
        mysql_init_server.delete_mysql_data(destination.port)

        log.info('Unpacking {rfile} into {ddir}'.format(rfile=restore_file,
                                                        ddir=datadir))
        backup.xbstream_unpack(restore_file, destination.port,
                               restore_source, restore_type, restore_size)

        log.info('Decompressing files in {path}'.format(path=datadir))
        backup.innobackup_decompress(destination.port)

        log.info('Applying logs')
        if test_restore == 'test':
            # We don't really need a lot of memory if we're just
            # verifying that it works.
            backup.apply_log(destination.port, memory='1G')
        else:
            backup.apply_log(destination.port, memory='10G')

        log.info('Removing old innodb redo logs')
        mysql_init_server.delete_innodb_log_files(destination.port)

        log.info('Setting permissions for MySQL on '
                 '{dir}'.format(dir=datadir))
        host_utils.change_owner(datadir, 'mysql', 'mysql')

        # NOTE(review): the next two log messages appear swapped relative
        # to the calls they precede ('Starting MySQL' precedes the auth
        # table upgrade, 'Running MySQL upgrade' precedes start_mysql) —
        # worth confirming and fixing the labels.
        log.info('Starting MySQL')
        host_utils.upgrade_auth_tables(destination.port)
        # Status flips to OK here even though replication setup follows;
        # replication result is tracked separately below.
        restore_log_update = {'restore_status': 'OK'}

        log.info('Running MySQL upgrade')
        host_utils.start_mysql(
            destination.port,
            options=host_utils.DEFAULTS_FILE_EXTRA_ARG.format(
                defaults_file=host_utils.MYSQL_NOREPL_CNF_FILE))

        if master == backup.get_host_from_backup(restore_file):
            # Backup was taken on the master itself: master position is
            # recorded in xtrabackup_binlog_info.
            log.info('Pulling replication info from restore to backup source')
            (binlog_file,
             binlog_pos) = backup.parse_xtrabackup_binlog_info(datadir)
        else:
            # Backup was taken on a replica: the master coordinates are in
            # xtrabackup_slave_info.
            log.info('Pulling replication info from restore to '
                     'master of backup source')
            (binlog_file,
             binlog_pos) = backup.parse_xtrabackup_slave_info(datadir)

        log.info('Setting up MySQL replication')
        restore_log_update['replication'] = 'FAIL'
        # Since we haven't started the slave yet, make sure we've got these
        # plugins installed, whether we use them or not.
        mysql_lib.setup_semisync_plugins(destination)

        # Try to configure replication. If this was just a test restore,
        # don't wait for it to catch up - don't even start the slave.
        if test_restore == 'test':
            mysql_lib.change_master(destination,
                                    master,
                                    binlog_file,
                                    binlog_pos,
                                    no_start=True)
            backup.quick_test_replication(destination)
        else:
            mysql_lib.change_master(destination,
                                    master,
                                    binlog_file,
                                    binlog_pos,
                                    no_start=(no_repl == 'SKIP'))
            mysql_lib.wait_replication_catch_up(destination)
            host_utils.restart_pt_daemons(destination.port)

        restore_log_update['replication'] = 'OK'

        mysql_lib.setup_response_time_metrics(destination)

    except Exception as e:
        log.error(e)
        if row_id is not None:
            restore_log_update['status_message'] = e
            restore_log_update['finished_at'] = True
        raise
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
        backup.update_restore_log(master, row_id, restore_log_update)

    # If this was a test restore, we don't need to keep the 3307
    # instance running, so let's shut it off.
    if test_restore == 'test':
        log.info('Shutting down MySQL backup/restore test instance')
        host_utils.stop_mysql(destination.port)
        backup.update_restore_log(master, row_id, {'finished_at': True})
        return

    try:
        if add_to_zk == 'REQ':
            log.info('Adding instance to zk')
            modify_mysql_zk.auto_add_instance_to_zk(destination,
                                                    dry_run=False)
            backup.update_restore_log(master, row_id, {'zookeeper': 'OK'})
        else:
            log.info('add_to_zk is not set, therefore not adding to zk')
    except Exception as e:
        # zk registration is best-effort; the restore itself succeeded.
        log.warning("An exception occurred: {e}".format(e=e))
        log.warning("If this is a DB issue, that's fine. "
                    "Otherwise, you should check ZK.")
    backup.update_restore_log(master, row_id, {'finished_at': True})

    log.info('Starting a new backup')
    mysql_backup.mysql_backup(destination)
def restore_instance(backup_type, restore_source, destination,
                     no_repl, date,
                     add_to_zk, skip_production_check):
    """ Restore a MySQL backup on to localhost

    Args:
    backup_type - Type of backup to restore
    restore_source - A hostaddr object for where to pull a backup from
    destination - A hostaddr object for where to restore the backup
    no_repl - Should replication be not started. It will always be setup.
    date - What date should the backup be from
    add_to_zk - Should the instance be added to zk. If so, the log from
                the host being launched will be consulted.
    skip_production_check - Do not check if the host is already in zk for
                            production use.
    """
    log.info('Supplied source is {source}'.format(source=restore_source))
    log.info('Supplied destination is {dest}'.format(dest=destination))
    log.info('Desired date of restore {date}'.format(date=date))
    zk = host_utils.MysqlZookeeper()

    # Try to prevent unintentional destruction of prod servers
    log.info('Confirming no prod instances running on destination')
    prod_check(destination, skip_production_check)

    # Take a lock to prevent multiple restores from running concurrently
    # NOTE(review): the lock is taken before the try below; if anything
    # between here and the try raises, the flock is never released —
    # consider widening the try to cover this gap.
    log.info('Taking a flock to block another restore from starting')
    lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

    log.info('Looking for a backup to restore')
    if restore_source:
        possible_sources = [restore_source]
    else:
        possible_sources = get_possible_sources(destination, backup_type)
    backup_key = find_a_backup_to_restore(possible_sources, destination,
                                          backup_type, date)

    # Figure out what what we use to as the master when we setup replication
    (restore_source,
     _) = backup.get_metadata_from_backup_file(backup_key.name)
    if restore_source.get_zk_replica_set():
        replica_set = restore_source.get_zk_replica_set()[0]
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
    else:
        # ZK has no idea what this replica set is, probably a new replica set.
        master = restore_source

    # Start logging
    # NOTE(review): 'source_instance' records the *destination* hostname —
    # confirm against the restore-log schema whether that is intended.
    row_id = backup.start_restore_log(master,
                                      {'restore_source': restore_source,
                                       'restore_port': destination.port,
                                       'restore_file': backup_key.name,
                                       'source_instance': destination.hostname,
                                       'restore_date': date,
                                       'replication': no_repl,
                                       'zookeeper': add_to_zk})

    # Giant try to allow logging if anything goes wrong.
    try:
        # If we hit an exception, this status will be used. If not, it will
        # be overwritten
        restore_log_update = {'restore_status': 'BAD'}

        # This also ensures that all needed directories exist
        log.info('Rebuilding local mysql instance')
        mysql_init_server.mysql_init_server(destination,
                                            skip_production_check=True,
                                            skip_backup=True,
                                            skip_locking=True)

        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            xbstream_restore(backup_key, destination.port)
            if master == restore_source:
                # Backup came from the master: coordinates are in
                # xtrabackup_binlog_info.
                log.info('Pulling replication info from restore '
                         'to backup source')
                (binlog_file,
                 binlog_pos) = backup.parse_xtrabackup_binlog_info(
                    destination.port)
            else:
                # Backup came from a replica: its master coordinates are
                # in xtrabackup_slave_info.
                log.info('Pulling replication info from restore to '
                         'master of backup source')
                (binlog_file,
                 binlog_pos) = backup.parse_xtrabackup_slave_info(
                    destination.port)
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            logical_restore(backup_key, destination)
            host_utils.stop_mysql(destination.port)

        log.info('Running MySQL upgrade')
        host_utils.upgrade_auth_tables(destination.port)

        log.info('Starting MySQL')
        host_utils.start_mysql(
            destination.port,
            options=host_utils.DEFAULTS_FILE_EXTRA_ARG.format(
                defaults_file=host_utils.MYSQL_NOREPL_CNF_FILE))

        # Since we haven't started the slave yet, make sure we've got these
        # plugins installed, whether we use them or not.
        mysql_lib.setup_semisync_plugins(destination)
        restore_log_update = {'restore_status': 'OK'}

        # Try to configure replication.
        log.info('Setting up MySQL replication')
        restore_log_update['replication'] = 'FAIL'
        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            mysql_lib.change_master(destination,
                                    master,
                                    binlog_file,
                                    binlog_pos,
                                    no_start=(no_repl == 'SKIP'))
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            if no_repl == 'SKIP':
                log.info('As requested, not starting replication.')
            else:
                # Logical restores keep the dump's replication settings;
                # just (re)start the threads.
                mysql_lib.restart_replication(destination)
        if no_repl == 'REQ':
            mysql_lib.wait_replication_catch_up(destination)
        restore_log_update['replication'] = 'OK'

        host_utils.restart_pt_daemons(destination.port)
        mysql_lib.setup_response_time_metrics(destination)

    except Exception as e:
        log.error(e)
        if row_id is not None:
            restore_log_update['status_message'] = e
            restore_log_update['finished_at'] = True
        raise
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
        backup.update_restore_log(master, row_id, restore_log_update)

    try:
        if add_to_zk == 'REQ':
            log.info('Adding instance to zk')
            # NOTE(review): destination.port is passed here; confirm
            # auto_add_instance_to_zk expects a port rather than a
            # hostaddr object.
            modify_mysql_zk.auto_add_instance_to_zk(destination.port,
                                                    dry_run=False)
            backup.update_restore_log(master, row_id, {'zookeeper': 'OK'})
        else:
            log.info('add_to_zk is not set, therefore not adding to zk')
    except Exception as e:
        # zk registration is best-effort; the restore itself succeeded.
        log.warning("An exception occurred: {e}".format(e=e))
        log.warning("If this is a DB issue, that's fine. "
                    "Otherwise, you should check ZK.")
    backup.update_restore_log(master, row_id, {'finished_at': True})

    if no_repl == 'REQ':
        log.info('Starting a new backup')
        mysql_backup.mysql_backup(destination, initial_build=True)
def mysql_init_server(instance,
                      skip_production_check=False,
                      skip_locking=False,
                      skip_backup=True):
    """ Remove any data and initialize a MySQL instance

    DESTRUCTIVE: wipes the existing datadir before rebuilding. The order
    of the steps below matters (stop before delete, privileges before
    start, start before grants/plugins) — do not reorder.

    Args:
    instance - A hostaddr object pointing towards localhost to act upon
    skip_production_check - Dangerous! will not run safety checks to
                            protect production data
    skip_locking - Do not take a lock on localhost. Useful when the caller
                   has already taken the lock (ie mysql_restore_xtrabackup)
    skip_backup - Don't run a backup after the instance is setup
    """
    lock_handle = None
    if not skip_locking:
        # Take a lock to prevent multiple restores from running concurrently
        log.info('Taking a flock to block race conditions')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

    try:
        # sanity check: refuse to wipe an instance zk considers live
        zk = host_utils.MysqlZookeeper()
        if (not skip_production_check and
                instance in zk.get_all_mysql_instances()):
            raise Exception("It appears {instance} is in use. This is"
                            " very dangerous!".format(instance=instance))

        log.info('Checking host for mounts, etc...')
        basic_host_sanity()

        log.info('(re)Generating MySQL cnf files')
        mysql_cnf_builder.build_cnf()

        log.info('Creating any missing directories')
        create_and_chown_dirs(instance.port)

        log.info('Shutting down MySQL (if applicable)')
        host_utils.stop_mysql(instance.port)

        log.info('Deleting existing MySQL data')
        delete_mysql_data(instance.port)

        log.info('Creating MySQL privileges tables')
        init_privileges_tables(instance.port)

        log.info('Clearing innodb log files')
        delete_innodb_log_files(instance.port)

        log.info('Starting up instance')
        host_utils.start_mysql(instance.port)

        log.info('Importing MySQL users')
        mysql_grants.manage_mysql_grants(instance, 'nuke_then_import')

        log.info('Creating test database')
        mysql_lib.create_db(instance, 'test')

        log.info('Setting up query response time plugins')
        mysql_lib.setup_response_time_metrics(instance)

        log.info('Setting up semi-sync replication plugins')
        mysql_lib.setup_semisync_plugins(instance)

        log.info('Restarting pt daemons')
        host_utils.restart_pt_daemons(instance.port)

        log.info('MySQL initalization complete')

    finally:
        if not skip_locking and lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)

    # Backup runs after the lock is released because mysql_backup takes
    # the same flock itself.
    if not skip_backup:
        log.info('Taking a backup')
        mysql_backup.mysql_backup(instance)
def archive_mysql_binlogs(port, dry_run):
    """ Flush logs and upload all binary logs that don't exist to s3

    Arguments:
    port - Port of the MySQL instance on which to act
    dry_run - Display output but do not upload
    """
    lock_handle = None
    try:
        log.info('Taking binlog archiver lock')
        lock_handle = host_utils.take_flock_lock(BINLOG_LOCK_FILE)
        log_bin_dir = host_utils.get_cnf_setting('log_bin', port)
        instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME,
                                                 str(port))))
        s3_conn = boto.connect_s3()
        bucket = s3_conn.get_bucket(environment_specific.S3_BUCKET,
                                    validate=False)
        mysql_conn = mysql_lib.connect_mysql(instance)
        bin_logs = mysql_lib.get_master_logs(mysql_conn)
        prefix = os.path.join(BINLOG_S3_DIR,
                              instance.hostname,
                              str(instance.port))
        uploaded_binlogs = bucket.get_all_keys(prefix=prefix)

        # The last (newest) binlog is still being written to, so skip it.
        for binlog in bin_logs[:-1]:
            compressed_file = ''.join((binlog['Log_name'], '.gz'))
            local_file = os.path.join(os.path.dirname(log_bin_dir),
                                      binlog['Log_name'])
            local_file_gz = os.path.join(TMP_DIR, compressed_file)
            remote_path = os.path.join(BINLOG_S3_DIR,
                                       instance.hostname,
                                       str(instance.port),
                                       compressed_file)
            log.info('Local file {local_file} will compress to '
                     '{local_file_gz} '
                     'and upload to {remote_path}'.format(
                         local_file=local_file,
                         local_file_gz=local_file_gz,
                         remote_path=remote_path))

            new_key = boto.s3.key.Key(bucket)
            new_key.key = remote_path
            if already_uploaded(remote_path, uploaded_binlogs):
                log.info('Binlog has already been uploaded')
                continue

            if dry_run:
                log.info('In dry_run mode, skipping compression and upload')
                continue

            log.info('Compressing file')
            # Binary mode (binlogs are binary data) and context managers
            # so both handles are closed even if compression fails midway.
            with open(local_file, 'rb') as f_in:
                with gzip.open(local_file_gz, 'wb', compresslevel=2) as f_out:
                    f_out.writelines(f_in)
            log.info('Uploading file')
            new_key.set_contents_from_filename(local_file_gz)
            log.info('Deleting local compressed file')
            os.remove(local_file_gz)

        log.info('Archiving complete')
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
def xtrabackup_backup_instance(instance):
    """ Run a file based (xbstream) backup on a supplied local instance

    Writes a row to mysqlops.mysql_backups before the backup starts and
    updates it with the filename/size when done. The central-log writes
    are best-effort: failures are logged and the backup proceeds anyway.

    Args:
    instance - A hostaddr object
    """
    starttime_sql = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info('Logging initial status to mysqlops')
    row_id = None
    lock_handle = None
    try:
        # Best-effort: record the backup start in the central ops DB.
        reporting_conn = mysql_lib.get_mysqlops_connections()
        cursor = reporting_conn.cursor()
        sql = ("INSERT INTO mysqlops.mysql_backups "
               "SET "
               "hostname = %(hostname)s, "
               "port = %(port)s, "
               "started = %(started)s, "
               "backup_type = 'xbstream' ")
        metadata = {'hostname': instance.hostname,
                    'port': instance.port,
                    'started': starttime_sql}
        cursor.execute(sql, metadata)
        row_id = cursor.lastrowid
        reporting_conn.commit()
    except Exception as e:
        log.warning("Unable to write log entry to "
                    "mysqlopsdb001: {e}".format(e=e))
        log.warning("However, we will attempt to continue with the backup.")

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        log.info('Cleaning up old backups')
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)

        # Actually run the backup
        log.info('Running backup')
        backup_file = backup.xtrabackup_instance(instance)
        finished = time.strftime('%Y-%m-%d %H:%M:%S')

        # Upload file to s3
        log.info('Uploading file to s3')
        backup.s3_upload(backup_file)

        # Update database with additional info now that backup is done.
        if row_id is None:
            log.info("The backup is complete, but we were not able to "
                     "write to the central log DB.")
        else:
            log.info("Updating database log entry with final backup info")
            try:
                # Reuses cursor/reporting_conn from the first try block;
                # row_id is only set when that block fully succeeded, so
                # both are guaranteed to exist here.
                sql = ("UPDATE mysqlops.mysql_backups "
                       "SET "
                       "filename = %(filename)s, "
                       "finished = %(finished)s, "
                       "size = %(size)s "
                       "WHERE id = %(id)s")
                metadata = {'filename': backup_file,
                            'finished': finished,
                            'size': os.stat(backup_file).st_size,
                            'id': row_id}
                cursor.execute(sql, metadata)
                reporting_conn.commit()
                reporting_conn.close()
            except Exception as e:
                log.warning("Unable to update mysqlopsdb with "
                            "backup status: {e}".format(e=e))

        # Running purge again most for the chmod
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
def restore_instance(restore_source, destination, no_repl, date,
                     add_to_zk, skip_production_check):
    """ Restore a MySQL backup on to localhost

    Args:
    restore_source - A hostaddr object for where to pull a backup from
    destination -  A hostaddr object for where to restore the backup
    no_repl - Should replication be not started. It will always be setup.
    date - What date should the backup be from
    add_to_zk - Should the instance be added to zk. If so, the log from the
                host being launched will be consulted.
    skip_production_check - Do not check if the host is already in zk for
                            production use.
    """
    log.info('Supplied source is {source}'.format(source=restore_source))
    log.info('Supplied destination is {dest}'.format(dest=destination))
    log.info('Desired date of restore {date}'.format(date=date))

    # Try to prevent unintentional destruction of prod servers
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(destination)
    except:
        # instance is not in production
        replica_type = None
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the destination is a master, refuse to run. No ifs, ands,
        # or buts.
        raise Exception('Restore script must never run on a master')
    if replica_type:
        if skip_production_check:
            log.info('Ignoring production check. We hope you know what you '
                     'are doing and we will try to take a backup in case '
                     'you are wrong.')
            try:
                mysql_backup.mysql_backup(destination)
            except Exception as e:
                log.error(e)
                log.warning('Unable to take a backup. We will give you '
                            '{time} seconds to change your mind and ^c.'
                            ''.format(time=SCARY_TIMEOUT))
                time.sleep(SCARY_TIMEOUT)
        else:
            raise Exception("It appears {instance} is in use. This is"
                            " very dangerous!".format(instance=destination))

    # Take a lock to prevent multiple restores from running concurrently
    log.info('Taking a flock to block another restore from starting')
    # NOTE(review): if anything between here and the giant try below
    # raises (cnf build, backup search, start_restore_log), the flock is
    # never released - consider widening the try/finally. TODO confirm.
    lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

    log.info('Rebuilding cnf files just in case')
    mysql_cnf_builder.build_cnf()

    mysql_init_server.create_and_chown_dirs(destination.port)

    # load some data from the mysql conf file
    datadir = host_utils.get_cnf_setting('datadir', destination.port)

    (restore_source,
     restore_file,
     restore_size) = find_a_backup_to_restore(restore_source, destination,
                                              date)

    # Replication will be pointed at the master of the replica set that
    # the backup came from, if zk knows about it.
    if restore_source.get_zk_replica_set():
        replica_set = restore_source.get_zk_replica_set()[0]
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
    else:
        # ZK has no idea what this replica set is, probably a new replica set.
        master = restore_source

    # Start logging
    row_id = backup.start_restore_log(
        master, {'restore_source': restore_source,
                 'restore_port': destination.port,
                 'restore_file': restore_file,
                 'source_instance': destination.hostname,
                 'restore_date': date,
                 'replication': no_repl,
                 'zookeeper': add_to_zk})

    # Giant try to allow logging if anything goes wrong.
    try:
        # If we hit an exception, this status will be used. If not, it will
        # be overwritten
        restore_log_update = {'restore_status': 'BAD'}
        log.info('Quick sanity check')
        mysql_init_server.basic_host_sanity()

        log.info('Shutting down MySQL')
        host_utils.stop_mysql(destination.port)

        log.info('Removing any existing MySQL data')
        mysql_init_server.delete_mysql_data(destination.port)

        log.info('Unpacking {rfile} into {ddir}'.format(rfile=restore_file,
                                                        ddir=datadir))
        backup.xbstream_unpack(restore_file, destination.port, restore_source,
                               restore_size)

        log.info('Decompressing files in {path}'.format(path=datadir))
        backup.innobackup_decompress(destination.port)

        # Determine how much RAM to use for applying logs based on the
        # system's total RAM size; all our boxes have 32G or more, so
        # this will always be better than before, but not absurdly high.
        # NOTE(review): psutil.phymem_usage() is a deprecated API
        # (virtual_memory() is the replacement); division is Python 2
        # integer division, giving whole GB - verify before upgrading.
        log_apply_ram = psutil.phymem_usage()[0] / 1024 / 1024 / 1024 / 3
        log.info('Applying logs')
        backup.apply_log(destination.port, memory='{}G'.format(log_apply_ram))

        log.info('Removing old innodb redo logs')
        mysql_init_server.delete_innodb_log_files(destination.port)

        log.info('Setting permissions for MySQL on {dir}'.format(dir=datadir))
        host_utils.change_owner(datadir, 'mysql', 'mysql')

        log.info('Starting MySQL')
        host_utils.upgrade_auth_tables(destination.port)
        # Data is usable from this point on, so flip the status that the
        # finally block will log.
        restore_log_update = {'restore_status': 'OK'}

        log.info('Running MySQL upgrade')
        host_utils.start_mysql(
            destination.port,
            options=host_utils.DEFAULTS_FILE_EXTRA_ARG.format(
                defaults_file=host_utils.MYSQL_NOREPL_CNF_FILE))

        # A backup taken on the master carries its own binlog position;
        # one taken on a slave carries the master's position instead.
        if master == backup.get_metadata_from_backup_file(restore_file)[0]:
            log.info('Pulling replication info from restore to backup source')
            (binlog_file,
             binlog_pos) = backup.parse_xtrabackup_binlog_info(datadir)
        else:
            log.info('Pulling replication info from restore to '
                     'master of backup source')
            (binlog_file,
             binlog_pos) = backup.parse_xtrabackup_slave_info(datadir)

        log.info('Setting up MySQL replication')
        restore_log_update['replication'] = 'FAIL'
        # Since we haven't started the slave yet, make sure we've got these
        # plugins installed, whether we use them or not.
        mysql_lib.setup_semisync_plugins(destination)

        # Try to configure replication.
        mysql_lib.change_master(destination,
                                master,
                                binlog_file,
                                binlog_pos,
                                no_start=(no_repl == 'SKIP'))
        mysql_lib.wait_replication_catch_up(destination)
        host_utils.restart_pt_daemons(destination.port)
        restore_log_update['replication'] = 'OK'

        mysql_lib.setup_response_time_metrics(destination)

    except Exception as e:
        log.error(e)
        if row_id is not None:
            # NOTE(review): this stores the exception object itself;
            # str(e) may be intended for the DB column - confirm against
            # update_restore_log's expectations.
            restore_log_update['status_message'] = e
            restore_log_update['finished_at'] = True
        raise
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
        backup.update_restore_log(master, row_id, restore_log_update)

    # zk registration is best-effort: a failure here is logged but does
    # not undo the completed restore.
    try:
        if add_to_zk == 'REQ':
            log.info('Adding instance to zk')
            modify_mysql_zk.auto_add_instance_to_zk(destination,
                                                    dry_run=False)
            backup.update_restore_log(master, row_id, {'zookeeper': 'OK'})
        else:
            log.info('add_to_zk is not set, therefore not adding to zk')
    except Exception as e:
        log.warning("An exception occurred: {e}".format(e=e))
        log.warning("If this is a DB issue, that's fine. "
                    "Otherwise, you should check ZK.")
    backup.update_restore_log(master, row_id, {'finished_at': True})

    log.info('Starting a new backup')
    mysql_backup.mysql_backup(destination)
def archive_mysql_binlogs(port, dry_run): """ Flush logs and upload all binary logs that don't exist to s3 Arguments: port - Port of the MySQL instance on which to act dry_run - Display output but do not uplad """ lock_handle = None try: log.info('Taking binlog archiver lock') lock_handle = host_utils.take_flock_lock(BINLOG_LOCK_FILE) log_bin_dir = host_utils.get_cnf_setting('log_bin', port) instance = host_utils.HostAddr(':'.join( (host_utils.HOSTNAME, str(port)))) s3_conn = boto.connect_s3() bucket = s3_conn.get_bucket(environment_specific.S3_BUCKET, validate=False) mysql_conn = mysql_lib.connect_mysql(instance) bin_logs = mysql_lib.get_master_logs(mysql_conn) prefix = os.path.join(BINLOG_S3_DIR, instance.hostname, str(instance.port)) uploaded_binlogs = bucket.get_all_keys(prefix=prefix) for binlog in bin_logs[:-1]: compressed_file = ''.join((binlog['Log_name'], '.gz')) local_file = os.path.join(os.path.dirname(log_bin_dir), binlog['Log_name']) local_file_gz = os.path.join(TMP_DIR, compressed_file) remote_path = os.path.join(BINLOG_S3_DIR, instance.hostname, str(instance.port), compressed_file) log.info( 'Local file {local_file} will compress to {local_file_gz} ' 'and upload to {remote_path}'.format( local_file=local_file, local_file_gz=local_file_gz, remote_path=remote_path)) new_key = boto.s3.key.Key(bucket) new_key.key = remote_path if already_uploaded(remote_path, uploaded_binlogs): log.info('Binlog has already been uploaded') continue if dry_run: log.info('In dry_run mode, skipping compression and upload') continue log.info('Compressing file') f_in = open(local_file, 'r') f_out = gzip.open(local_file_gz, 'w', compresslevel=2) f_out.writelines(f_in) f_out.close() f_in.close() log.info('Uploading file') new_key.set_contents_from_filename(local_file_gz) log.info('Deleting local compressed file') os.remove(local_file_gz) log.info('Archiving complete') finally: if lock_handle: log.info('Releasing lock') host_utils.release_flock_lock(lock_handle)
def mysql_init_server(instance,
                      skip_production_check=False,
                      skip_locking=False, skip_backup=True):
    """ Remove any data and initialize a MySQL instance

    Args:
    instance - A hostaddr object pointing towards localhost to act upon
    skip_production_check - Dangerous! will not run safety checks to
                            protect production data
    skip_locking - Do not take a lock on localhost. Useful when the
                   caller has already taken the lock
                   (ie mysql_restore_xtrabackup)
    skip_backup - Don't run a backup after the instance is setup
    """
    lock_handle = None
    if not skip_locking:
        # Take a lock to prevent multiple restores from running concurrently
        log.info('Taking a flock to block race conditions')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

    try:
        # sanity check: refuse to wipe a host that zk says is serving
        # production traffic, unless explicitly overridden.
        zk = host_utils.MysqlZookeeper()
        if (not skip_production_check and
                instance in zk.get_all_mysql_instances()):
            raise Exception("It appears {instance} is in use. This is"
                            " very dangerous!".format(instance=instance))

        log.info('Checking host for mounts, etc...')
        basic_host_sanity()

        log.info('(re)Generating MySQL cnf files')
        mysql_cnf_builder.build_cnf()

        log.info('Creating any missing directories')
        create_and_chown_dirs(instance.port)

        log.info('Shutting down MySQL (if applicable)')
        host_utils.stop_mysql(instance.port)

        # DESTRUCTIVE: everything under the datadir is removed before the
        # instance is re-initialized from scratch.
        log.info('Deleting existing MySQL data')
        delete_mysql_data(instance.port)

        log.info('Creating MySQL privileges tables')
        init_privileges_tables(instance.port)

        log.info('Clearing innodb log files')
        delete_innodb_log_files(instance.port)

        log.info('Starting up instance')
        host_utils.start_mysql(instance.port)

        log.info('Importing MySQL users')
        mysql_grants.manage_mysql_grants(instance, 'nuke_then_import')

        log.info('Creating test database')
        # NOTE(review): conn is never explicitly closed; presumably it is
        # dropped when the process exits - confirm connect_mysql's
        # cleanup behavior.
        conn = mysql_lib.connect_mysql(instance)
        mysql_lib.create_db(conn, 'test')

        log.info('Setting up query response time plugins')
        mysql_lib.setup_response_time_metrics(instance)

        log.info('Setting up semi-sync replication plugins')
        mysql_lib.setup_semisync_plugins(instance)

        log.info('Restarting pt daemons')
        host_utils.restart_pt_daemons(instance.port)

        log.info('MySQL initalization complete')

    finally:
        if not skip_locking and lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)

    # Backup runs outside the lock so a concurrent backup job is not
    # blocked by the (already finished) initialization.
    if not skip_backup:
        log.info('Taking a backup')
        mysql_backup.mysql_backup(instance)
def xtrabackup_backup_instance(instance):
    """ Run a file based backup on a supplied local instance

    Args:
    instance - A hostaddr object
    """
    starttime_sql = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info('Logging initial status to mysqlops')
    row_id = None
    lock_handle = None
    # Central logging is best effort: failure to reach the reporting DB
    # is logged but never aborts the backup.
    try:
        reporting_conn = mysql_lib.get_mysqlops_connections()
        cursor = reporting_conn.cursor()
        sql = ("INSERT INTO mysqlops.mysql_backups "
               "SET "
               "hostname = %(hostname)s, "
               "port = %(port)s, "
               "started = %(started)s, "
               "backup_type = 'xbstream' ")
        metadata = {
            'hostname': instance.hostname,
            'port': instance.port,
            'started': starttime_sql
        }
        cursor.execute(sql, metadata)
        # row_id identifies this entry for the final UPDATE below; None
        # means the reporting DB was unreachable.
        row_id = cursor.lastrowid
        reporting_conn.commit()
    except Exception as e:
        log.warning("Unable to write log entry to "
                    "mysqlopsdb001: {e}".format(e=e))
        log.warning("However, we will attempt to continue with the backup.")

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        log.info('Cleaning up old backups')
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)

        # Actually run the backup
        log.info('Running backup')
        backup_file = backup.xtrabackup_instance(instance)
        finished = time.strftime('%Y-%m-%d %H:%M:%S')

        # Upload file to s3
        log.info('Uploading file to s3')
        backup.s3_upload(backup_file)

        # Update database with additional info now that backup is done.
        if row_id is None:
            log.info("The backup is complete, but we were not able to "
                     "write to the central log DB.")
        else:
            log.info("Updating database log entry with final backup info")
            try:
                sql = ("UPDATE mysqlops.mysql_backups "
                       "SET "
                       "filename = %(filename)s, "
                       "finished = %(finished)s, "
                       "size = %(size)s "
                       "WHERE id = %(id)s")
                metadata = {
                    'filename': backup_file,
                    'finished': finished,
                    'size': os.stat(backup_file).st_size,
                    'id': row_id
                }
                cursor.execute(sql, metadata)
                reporting_conn.commit()
                # NOTE(review): the connection is closed only on this
                # success path; any earlier exception leaks it - consider
                # closing in the finally block instead.
                reporting_conn.close()
            except Exception as e:
                log.warning("Unable to update mysqlopsdb with "
                            "backup status: {e}".format(e=e))

        # Running purge again most for the chmod
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)