def xbstream_restore(xbstream, port):
    """ Restore an xtrabackup file

    xbstream - An xbstream file in S3
    port - The port on which to act upon on localhost
    """
    datadir = host_utils.get_cnf_setting('datadir', port)

    log.info('Shutting down MySQL')
    host_utils.stop_mysql(port)

    log.info('Removing any existing MySQL data')
    mysql_init_server.delete_mysql_data(port)

    log.info('Downloading and unpacking backup')
    backup.xbstream_unpack(xbstream, datadir)

    log.info('Decompressing compressed ibd files')
    backup.innobackup_decompress(datadir)

    log.info('Applying logs')
    backup.apply_log(datadir)

    log.info('Removing old innodb redo logs')
    mysql_init_server.delete_innodb_log_files(port)

    log.info('Setting permissions for MySQL on {dir}'.format(dir=datadir))
    host_utils.change_owner(datadir, 'mysql', 'mysql')
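# The backup.* helpers used above are defined elsewhere in the repo. A minimal
# sketch of what the unpack and apply-log steps likely wrap, assuming the
# standard Percona toolchain (xbstream, innobackupex) is installed; the real
# helpers also handle the S3 download and may differ in flags and error
# handling.
import subprocess


def xbstream_unpack_sketch(xbstream_path, datadir):
    """ Extract a local xbstream archive into datadir (sketch). """
    with open(xbstream_path, 'rb') as source:
        subprocess.check_call(['xbstream', '-x', '-C', datadir], stdin=source)


def apply_log_sketch(datadir, memory='1G'):
    """ Run the xtrabackup prepare/apply-log phase (sketch). """
    subprocess.check_call(['innobackupex', '--apply-log',
                           '--use-memory={m}'.format(m=memory), datadir])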
def mysql_backup_csv_db(self, db, conn, pitr_data):
    """ Back up a single db

    Args:
    db - the db to be backed up
    conn - a connection to the mysql instance
    pitr_data - data describing the position of the db data in replication
    """
    proc_id = multiprocessing.current_process().name
    if not self.force_reupload and self.already_backed_up(db):
        log.info('{proc_id}: {db} is already backed up, skipping'
                 ''.format(proc_id=proc_id, db=db))
        return

    # attempt to take lock by writing a lock to the master
    tmp_dir_db = None
    lock_identifier = None
    extend_lock_thread = None
    try:
        self.release_expired_locks()
        lock_identifier = self.take_backup_lock(db)
        extend_lock_stop_event = threading.Event()
        extend_lock_thread = threading.Thread(
            target=self.extend_backup_lock,
            args=(lock_identifier, extend_lock_stop_event))
        extend_lock_thread.daemon = True
        extend_lock_thread.start()
        if not lock_identifier:
            return

        log.info('{proc_id}: {db} db backup start'
                 ''.format(db=db, proc_id=proc_id))

        tmp_dir_db = os.path.join(self.dump_base_path, db)
        if not os.path.exists(tmp_dir_db):
            os.makedirs(tmp_dir_db)
        host_utils.change_owner(tmp_dir_db, 'mysql', 'mysql')

        self.upload_pitr_data(db, pitr_data)

        for table in self.get_tables_to_backup(db):
            self.mysql_backup_csv_table(db, table, tmp_dir_db, conn)

        log.info('{proc_id}: {db} db backup complete'
                 ''.format(db=db, proc_id=proc_id))
    finally:
        if extend_lock_thread:
            extend_lock_stop_event.set()
            log.debug('{proc_id}: {db} waiting for lock expiry thread to '
                      'end'.format(db=db, proc_id=proc_id))
            extend_lock_thread.join()
        if lock_identifier:
            log.debug('{proc_id}: {db} releasing lock'
                      ''.format(db=db, proc_id=proc_id))
            self.release_db_backup_lock(lock_identifier)
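# extend_backup_lock itself is not shown in this section. A minimal sketch of
# the pattern the daemon thread above implies: periodically renew the lock row
# on the master until the stop event fires. The renewal interval and the
# renew_backup_lock helper are assumptions for illustration, not this repo's
# actual code.
def extend_backup_lock_sketch(self, lock_identifier, extend_lock_stop_event,
                              interval=60):
    """ Keep a backup lock alive until asked to stop (sketch). """
    # wait() doubles as an interruptible sleep; it returns True as soon as the
    # main thread sets the event in its finally block.
    while not extend_lock_stop_event.wait(interval):
        if lock_identifier:
            self.renew_backup_lock(lock_identifier)  # hypothetical helper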
def setup_and_get_tmp_path(self):
    """ Figure out where to temporarily store csv backups,
        and clean it up
    """
    tmp_dir_root = os.path.join(host_utils.find_root_volume(),
                                'csv_export',
                                str(self.instance.port))
    if not os.path.exists(tmp_dir_root):
        os.makedirs(tmp_dir_root)
    host_utils.change_owner(tmp_dir_root, 'mysql', 'mysql')
    self.dump_base_path = tmp_dir_root
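# host_utils.change_owner is used throughout this section but its body is not
# shown. A minimal sketch of what such a helper could look like, assuming it
# resolves names with pwd/grp and chowns recursively; the real helper may
# instead shell out to chown -R.
import grp
import os
import pwd


def change_owner_sketch(directory, user, group):
    """ Recursively chown a path to user:group (sketch). """
    uid = pwd.getpwnam(user).pw_uid
    gid = grp.getgrnam(group).gr_gid
    os.chown(directory, uid, gid)
    for root, dirs, files in os.walk(directory):
        for name in dirs + files:
            os.chown(os.path.join(root, name), uid, gid)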
def create_and_chown_dirs(port):
    """ Create and chown any missing directories needed for mysql """
    for variable in DIRS_TO_CREATE:
        try:
            path = os.path.dirname(host_utils.get_cnf_setting(variable, port))
        except ConfigParser.NoOptionError:
            # Not defined, so must not matter
            return
        if not os.path.isdir(path):
            log.info('Creating and chowning {path}'.format(path=path))
            os.makedirs(path)
            host_utils.change_owner(path, 'mysql', 'mysql')
def create_fifo(self, fifo):
    """ Create a fifo to be used for dumping a mysql table

    Args:
    fifo - The path to the fifo
    """
    if os.path.exists(fifo):
        self.cleanup_fifo(fifo)

    log.debug('{proc_id}: creating fifo {fifo}'
              ''.format(proc_id=multiprocessing.current_process().name,
                        fifo=fifo))
    os.mkfifo(fifo)
    # Could not get os.mkfifo(fifo, 0777) to work due to umask
    host_utils.change_owner(fifo, 'mysql', 'mysql')
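# The consumer side of the fifo is not shown here. A minimal sketch of the
# streaming pattern a fifo enables, assuming a hypothetical dump command and a
# local gzip target standing in for the real uploader: the writer process
# feeds the fifo while the reader drains it, so the full CSV never has to land
# on disk.
import gzip
import subprocess


def stream_fifo_to_gzip_sketch(fifo, dump_cmd, out_path):
    """ Drain a fifo into a gzip file while dump_cmd writes to it (sketch). """
    writer = subprocess.Popen(dump_cmd)  # hypothetical command writing to fifo
    with open(fifo, 'rb') as source, gzip.open(out_path, 'wb') as target:
        for chunk in iter(lambda: source.read(1 << 20), b''):
            target.write(chunk)
    writer.wait()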
def mysql_backup_csv_db(self, db, conn, pitr_data):
    """ Back up a single db

    Args:
    db - the db to be backed up
    conn - a connection to the mysql instance
    pitr_data - data describing the position of the db data in replication
    """
    # attempt to take lock by writing a lock to the master
    proc_id = multiprocessing.current_process().name
    tmp_dir_db = None
    lock_identifier = None
    try:
        lock_identifier = self.take_backup_lock(db)
        if not lock_identifier:
            return

        if not self.force_reupload and self.already_backed_up(db):
            log.info('{proc_id}: {db} is already backed up, skipping'
                     ''.format(proc_id=proc_id, db=db))
            return

        log.info('{proc_id}: {db} db backup start'
                 ''.format(db=db, proc_id=proc_id))

        tmp_dir_db = os.path.join(self.dump_base_path, db)
        if not os.path.exists(tmp_dir_db):
            os.makedirs(tmp_dir_db)
        host_utils.change_owner(tmp_dir_db, 'mysql', 'mysql')

        self.upload_pitr_data(db, pitr_data)

        for table in self.get_tables_to_backup(db):
            self.mysql_backup_csv_table(db, table, tmp_dir_db, conn)

        log.info('{proc_id}: {db} db backup complete'
                 ''.format(db=db, proc_id=proc_id))
    finally:
        if lock_identifier:
            log.debug('{proc_id}: {db} releasing lock'
                      ''.format(db=db, proc_id=proc_id))
            self.release_db_backup_lock(lock_identifier)
def delete_mysql_data(port):
    """ Purge all data on disk for a MySQL instance

    Args:
    port - The port on which to act upon on localhost
    """
    for dir_key in DIRS_TO_CLEAR:
        directory = host_utils.get_cnf_setting(dir_key, port)
        if not os.path.isdir(directory):
            directory = os.path.dirname(directory)
        log.info('Removing contents of {dir}'.format(dir=directory))
        host_utils.clean_directory(directory)

    # This should not bomb if one of the files to truncate
    # isn't specified in the config file.
    for file_keys in FILES_TO_CLEAR:
        try:
            del_file = host_utils.get_cnf_setting(file_keys, port)
            log.info('Truncating {del_file}'.format(del_file=del_file))
            open(del_file, 'w').close()
            host_utils.change_owner(del_file, 'mysql', 'mysql')
        except Exception:
            log.warning('Option {f} not specified '
                        'in my.cnf - continuing.'.format(f=file_keys))
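# host_utils.clean_directory is referenced above but not defined in this
# section. A minimal sketch of the behaviour the caller expects, namely
# emptying a directory without removing the directory itself; the real helper
# may differ.
import os
import shutil


def clean_directory_sketch(directory):
    """ Remove everything inside directory, keep the directory (sketch). """
    for entry in os.listdir(directory):
        path = os.path.join(directory, entry)
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)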
def restore_instance(restore_source, destination, restore_type, restore_file,
                     no_repl, date, add_to_zk, skip_production_check,
                     test_restore):
    """ Restore a MySQL backup on to localhost

    Args:
    restore_source - A hostaddr object for where to pull a backup from
    destination - A hostaddr object for where to restore the backup
    restore_type - How to pull the backup, options are 's3', 'remote_server'
                   and 'local_file'
    no_repl - Should replication not be started. It will always be set up.
    date - What date should the backup be from
    add_to_zk - Should the instance be added to zk. If so, the log from the
                host being launched will be consulted.
    skip_production_check - Do not check if the host is already in zk for
                            production use.
    test_restore - Use less RAM and shut down the instance after going
                   through the motions of a restore.
    """
    (temp_dir, target_dir) = backup.get_paths(str(destination.port))
    log.info('Supplied source is {source}'.format(source=restore_source))
    log.info('Supplied destination is {dest}'.format(dest=destination))
    log.info('Restore type is {rest}'.format(rest=restore_type))
    log.info('Local restore file is {file}'.format(file=restore_file))
    log.info('Desired date of restore {date}'.format(date=date))
    if test_restore == 'test':
        log.info('Running restore in test mode')

    # Try to prevent unintentional destruction of prod servers
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(destination)
    except:
        # instance is not in production
        replica_type = None
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run.
        # No ifs, ands, or buts.
        raise Exception('Restore script must never run on a master')
    if replica_type:
        if skip_production_check:
            log.info('Ignoring production check. We hope you know what you '
                     'are doing and we will try to take a backup in case '
                     'you are wrong.')
            try:
                mysql_backup.mysql_backup(destination)
            except Exception as e:
                log.error(e)
                log.warning('Unable to take a backup. We will give you '
                            '{time} seconds to change your mind and ^c.'
                            ''.format(time=SCARY_TIMEOUT))
                time.sleep(SCARY_TIMEOUT)
        else:
            raise Exception("It appears {instance} is in use. This is"
                            " very dangerous!".format(instance=destination))

    # Take a lock to prevent multiple restores from running concurrently
    log.info('Taking a flock to block another restore from starting')
    lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

    log.info('Rebuilding cnf files just in case')
    mysql_cnf_builder.build_cnf()

    mysql_init_server.create_and_chown_dirs(destination.port)

    # load some data from the mysql conf file
    datadir = host_utils.get_cnf_setting('datadir', destination.port)

    # Where will we look for a backup?
    if restore_type != 'local_file':
        (restore_type, restore_source,
         restore_file, restore_size) = find_a_backup_to_restore(restore_type,
                                                                restore_source,
                                                                destination,
                                                                date)
    # Not using an if/else because find_a_backup_to_restore could set to
    # local_file if the file has already been downloaded.
    if restore_type == 'local_file':
        restore_source = backup.get_host_from_backup(restore_file)
        # restore_size will be computed in the unpack function
        restore_size = None

    log.info('Detected the source of backup as {src}'
             ''.format(src=restore_source))

    if restore_source.get_zk_replica_set():
        replica_set = restore_source.get_zk_replica_set()[0]
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
    else:
        # ZK has no idea what this replica set is, probably a new replica set.
        master = restore_source

    # Start logging
    row_id = backup.start_restore_log(
        master, {
            'restore_type': restore_type,
            'test_restore': test_restore,
            'restore_source': restore_source,
            'restore_port': destination.port,
            'restore_file': restore_file,
            'source_instance': destination.hostname,
            'restore_date': date,
            'replication': no_repl,
            'zookeeper': add_to_zk})

    # Giant try to allow logging if anything goes wrong.
    try:
        # If we hit an exception, this status will be used. If not, it will
        # be overwritten
        restore_log_update = {'restore_status': 'BAD'}

        log.info('Quick sanity check')
        mysql_init_server.basic_host_sanity()

        log.info('Shutting down MySQL')
        host_utils.stop_mysql(destination.port)

        log.info('Removing any existing MySQL data')
        mysql_init_server.delete_mysql_data(destination.port)

        log.info('Unpacking {rfile} into {ddir}'.format(rfile=restore_file,
                                                        ddir=datadir))
        backup.xbstream_unpack(restore_file, destination.port, restore_source,
                               restore_type, restore_size)

        log.info('Decompressing files in {path}'.format(path=datadir))
        backup.innobackup_decompress(destination.port)

        log.info('Applying logs')
        if test_restore == 'test':
            # We don't really need a lot of memory if we're just
            # verifying that it works.
            backup.apply_log(destination.port, memory='1G')
        else:
            backup.apply_log(destination.port, memory='10G')

        log.info('Removing old innodb redo logs')
        mysql_init_server.delete_innodb_log_files(destination.port)

        log.info('Setting permissions for MySQL on {dir}'.format(dir=datadir))
        host_utils.change_owner(datadir, 'mysql', 'mysql')

        log.info('Running MySQL upgrade')
        host_utils.upgrade_auth_tables(destination.port)
        restore_log_update = {'restore_status': 'OK'}

        log.info('Starting MySQL')
        host_utils.start_mysql(
            destination.port,
            options=host_utils.DEFAULTS_FILE_EXTRA_ARG.format(
                defaults_file=host_utils.MYSQL_NOREPL_CNF_FILE))

        if master == backup.get_host_from_backup(restore_file):
            log.info('Pulling replication info from restore to backup source')
            (binlog_file,
             binlog_pos) = backup.parse_xtrabackup_binlog_info(datadir)
        else:
            log.info('Pulling replication info from restore to '
                     'master of backup source')
            (binlog_file,
             binlog_pos) = backup.parse_xtrabackup_slave_info(datadir)

        log.info('Setting up MySQL replication')
        restore_log_update['replication'] = 'FAIL'
        # Since we haven't started the slave yet, make sure we've got these
        # plugins installed, whether we use them or not.
        mysql_lib.setup_semisync_plugins(destination)

        # Try to configure replication. If this was just a test restore,
        # don't wait for it to catch up - don't even start the slave.
        if test_restore == 'test':
            mysql_lib.change_master(destination,
                                    master,
                                    binlog_file,
                                    binlog_pos,
                                    no_start=True)
            backup.quick_test_replication(destination)
        else:
            mysql_lib.change_master(destination,
                                    master,
                                    binlog_file,
                                    binlog_pos,
                                    no_start=(no_repl == 'SKIP'))
            mysql_lib.wait_replication_catch_up(destination)
            host_utils.restart_pt_daemons(destination.port)
        restore_log_update['replication'] = 'OK'

        mysql_lib.setup_response_time_metrics(destination)
    except Exception as e:
        log.error(e)
        if row_id is not None:
            restore_log_update['status_message'] = e
            restore_log_update['finished_at'] = True
        raise
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
        backup.update_restore_log(master, row_id, restore_log_update)

    # If this was a test restore, we don't need to keep the 3307
    # instance running, so let's shut it off.
    if test_restore == 'test':
        log.info('Shutting down MySQL backup/restore test instance')
        host_utils.stop_mysql(destination.port)
        backup.update_restore_log(master, row_id, {'finished_at': True})
        return

    try:
        if add_to_zk == 'REQ':
            log.info('Adding instance to zk')
            modify_mysql_zk.auto_add_instance_to_zk(destination,
                                                    dry_run=False)
            backup.update_restore_log(master, row_id, {'zookeeper': 'OK'})
        else:
            log.info('add_to_zk is not set, therefore not adding to zk')
    except Exception as e:
        log.warning("An exception occurred: {e}".format(e=e))
        log.warning("If this is a DB issue, that's fine. "
                    "Otherwise, you should check ZK.")
    backup.update_restore_log(master, row_id, {'finished_at': True})

    log.info('Starting a new backup')
    mysql_backup.mysql_backup(destination)
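# parse_xtrabackup_binlog_info and parse_xtrabackup_slave_info are referenced
# above but defined elsewhere. A minimal sketch of the parsing they imply,
# assuming the stock, non-GTID Percona XtraBackup metadata formats:
# xtrabackup_binlog_info is tab separated ("<binlog file>\t<position>[\t<gtid
# set>]"), and xtrabackup_slave_info holds a CHANGE MASTER TO statement.
import os
import re


def parse_xtrabackup_binlog_info_sketch(datadir):
    """ Return (binlog_file, position) of the backed-up instance (sketch). """
    with open(os.path.join(datadir, 'xtrabackup_binlog_info')) as handle:
        fields = handle.read().split()
    return fields[0], int(fields[1])


def parse_xtrabackup_slave_info_sketch(datadir):
    """ Return (binlog_file, position) of the source's master (sketch). """
    with open(os.path.join(datadir, 'xtrabackup_slave_info')) as handle:
        contents = handle.read()
    binlog_file = re.search(r"MASTER_LOG_FILE='([^']+)'", contents).group(1)
    position = int(re.search(r'MASTER_LOG_POS=(\d+)', contents).group(1))
    return binlog_file, position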
def restore_instance(restore_source, destination,
                     no_repl, date,
                     add_to_zk, skip_production_check):
    """ Restore a MySQL backup on to localhost

    Args:
    restore_source - A hostaddr object for where to pull a backup from
    destination - A hostaddr object for where to restore the backup
    no_repl - Should replication not be started. It will always be set up.
    date - What date should the backup be from
    add_to_zk - Should the instance be added to zk. If so, the log from the
                host being launched will be consulted.
    skip_production_check - Do not check if the host is already in zk for
                            production use.
    """
    log.info('Supplied source is {source}'.format(source=restore_source))
    log.info('Supplied destination is {dest}'.format(dest=destination))
    log.info('Desired date of restore {date}'.format(date=date))

    # Try to prevent unintentional destruction of prod servers
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(destination)
    except:
        # instance is not in production
        replica_type = None
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run.
        # No ifs, ands, or buts.
        raise Exception('Restore script must never run on a master')
    if replica_type:
        if skip_production_check:
            log.info('Ignoring production check. We hope you know what you '
                     'are doing and we will try to take a backup in case '
                     'you are wrong.')
            try:
                mysql_backup.mysql_backup(destination)
            except Exception as e:
                log.error(e)
                log.warning('Unable to take a backup. We will give you '
                            '{time} seconds to change your mind and ^c.'
                            ''.format(time=SCARY_TIMEOUT))
                time.sleep(SCARY_TIMEOUT)
        else:
            raise Exception("It appears {instance} is in use. This is"
                            " very dangerous!".format(instance=destination))

    # Take a lock to prevent multiple restores from running concurrently
    log.info('Taking a flock to block another restore from starting')
    lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

    log.info('Rebuilding cnf files just in case')
    mysql_cnf_builder.build_cnf()

    mysql_init_server.create_and_chown_dirs(destination.port)

    # load some data from the mysql conf file
    datadir = host_utils.get_cnf_setting('datadir', destination.port)

    (restore_source,
     restore_file,
     restore_size) = find_a_backup_to_restore(restore_source, destination,
                                              date)
    if restore_source.get_zk_replica_set():
        replica_set = restore_source.get_zk_replica_set()[0]
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
    else:
        # ZK has no idea what this replica set is, probably a new replica set.
        master = restore_source

    # Start logging
    row_id = backup.start_restore_log(
        master, {
            'restore_source': restore_source,
            'restore_port': destination.port,
            'restore_file': restore_file,
            'source_instance': destination.hostname,
            'restore_date': date,
            'replication': no_repl,
            'zookeeper': add_to_zk})

    # Giant try to allow logging if anything goes wrong.
    try:
        # If we hit an exception, this status will be used. If not, it will
        # be overwritten
        restore_log_update = {'restore_status': 'BAD'}

        log.info('Quick sanity check')
        mysql_init_server.basic_host_sanity()

        log.info('Shutting down MySQL')
        host_utils.stop_mysql(destination.port)

        log.info('Removing any existing MySQL data')
        mysql_init_server.delete_mysql_data(destination.port)

        log.info('Unpacking {rfile} into {ddir}'.format(rfile=restore_file,
                                                        ddir=datadir))
        backup.xbstream_unpack(restore_file, destination.port, restore_source,
                               restore_size)

        log.info('Decompressing files in {path}'.format(path=datadir))
        backup.innobackup_decompress(destination.port)

        # Determine how much RAM to use for applying logs based on the
        # system's total RAM size; all our boxes have 32G or more, so
        # this will always be better than before, but not absurdly high.
        log_apply_ram = psutil.phymem_usage()[0] / 1024 / 1024 / 1024 / 3
        log.info('Applying logs')
        backup.apply_log(destination.port,
                         memory='{}G'.format(log_apply_ram))

        log.info('Removing old innodb redo logs')
        mysql_init_server.delete_innodb_log_files(destination.port)

        log.info('Setting permissions for MySQL on {dir}'.format(dir=datadir))
        host_utils.change_owner(datadir, 'mysql', 'mysql')

        log.info('Running MySQL upgrade')
        host_utils.upgrade_auth_tables(destination.port)
        restore_log_update = {'restore_status': 'OK'}

        log.info('Starting MySQL')
        host_utils.start_mysql(
            destination.port,
            options=host_utils.DEFAULTS_FILE_EXTRA_ARG.format(
                defaults_file=host_utils.MYSQL_NOREPL_CNF_FILE))

        if master == backup.get_metadata_from_backup_file(restore_file)[0]:
            log.info('Pulling replication info from restore to backup source')
            (binlog_file,
             binlog_pos) = backup.parse_xtrabackup_binlog_info(datadir)
        else:
            log.info('Pulling replication info from restore to '
                     'master of backup source')
            (binlog_file,
             binlog_pos) = backup.parse_xtrabackup_slave_info(datadir)

        log.info('Setting up MySQL replication')
        restore_log_update['replication'] = 'FAIL'
        # Since we haven't started the slave yet, make sure we've got these
        # plugins installed, whether we use them or not.
        mysql_lib.setup_semisync_plugins(destination)

        # Try to configure replication.
        mysql_lib.change_master(destination,
                                master,
                                binlog_file,
                                binlog_pos,
                                no_start=(no_repl == 'SKIP'))
        mysql_lib.wait_replication_catch_up(destination)
        host_utils.restart_pt_daemons(destination.port)
        restore_log_update['replication'] = 'OK'

        mysql_lib.setup_response_time_metrics(destination)
    except Exception as e:
        log.error(e)
        if row_id is not None:
            restore_log_update['status_message'] = e
            restore_log_update['finished_at'] = True
        raise
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
        backup.update_restore_log(master, row_id, restore_log_update)

    try:
        if add_to_zk == 'REQ':
            log.info('Adding instance to zk')
            modify_mysql_zk.auto_add_instance_to_zk(destination,
                                                    dry_run=False)
            backup.update_restore_log(master, row_id, {'zookeeper': 'OK'})
        else:
            log.info('add_to_zk is not set, therefore not adding to zk')
    except Exception as e:
        log.warning("An exception occurred: {e}".format(e=e))
        log.warning("If this is a DB issue, that's fine. "
                    "Otherwise, you should check ZK.")
    backup.update_restore_log(master, row_id, {'finished_at': True})

    log.info('Starting a new backup')
    mysql_backup.mysql_backup(destination)
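# psutil.phymem_usage() above comes from old psutil releases and is gone in
# current ones; psutil.virtual_memory().total exposes the same total physical
# RAM in bytes. A minimal sketch of the same "one third of RAM, in whole
# gigabytes" sizing with the newer call:
import psutil


def log_apply_ram_gb_sketch():
    """ Return roughly a third of total RAM, in whole GB (sketch). """
    total_gb = psutil.virtual_memory().total // (1024 ** 3)
    return max(1, total_gb // 3)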
def mysql_backup_csv_table_wrapper(self, table_tuple, conn, pitr_data):
    """ Back up a single table or partition

    Args:
    table_tuple - A tuple containing the fully-qualified table name,
                  the partition name, and the partition number
    conn - a connection to the mysql instance
    pitr_data - data describing the position of the db data in replication
    """
    proc_id = multiprocessing.current_process().name
    if not self.force_reupload and self.already_backed_up(table_tuple):
        log.info('{proc_id}: {tbl} partition {p} is already backed up, '
                 'skipping'.format(proc_id=proc_id,
                                   tbl=table_tuple[0],
                                   p=table_tuple[2]))
        return

    # attempt to take lock by writing a lock to the master
    tmp_dir_db = None
    lock_identifier = None
    extend_lock_thread = None
    try:
        self.release_expired_locks()
        lock_identifier = self.take_backup_lock(table_tuple)
        extend_lock_stop_event = threading.Event()
        extend_lock_thread = threading.Thread(
            target=self.extend_backup_lock,
            args=(lock_identifier, extend_lock_stop_event))
        extend_lock_thread.daemon = True
        extend_lock_thread.start()
        if not lock_identifier:
            return

        log.info('{proc_id}: {tbl} table, partition {p} backup start'
                 ''.format(tbl=table_tuple[0],
                           p=table_tuple[2],
                           proc_id=proc_id))

        tmp_dir_db = os.path.join(self.dump_base_path,
                                  table_tuple[0].split('.')[0])
        if not os.path.exists(tmp_dir_db):
            os.makedirs(tmp_dir_db)
        host_utils.change_owner(tmp_dir_db, 'mysql', 'mysql')

        self.upload_pitr_data(*table_tuple[0].split('.'),
                              pitr_data=pitr_data)
        self.mysql_backup_one_partition(table_tuple, tmp_dir_db, conn)

        log.info('{proc_id}: {tbl} table, partition {p} backup complete'
                 ''.format(tbl=table_tuple[0],
                           p=table_tuple[2],
                           proc_id=proc_id))
    finally:
        if extend_lock_thread:
            extend_lock_stop_event.set()
            log.debug('{proc_id}: {tbl} table, partition {p} waiting for '
                      'lock expiry thread to end'.format(tbl=table_tuple[0],
                                                         p=table_tuple[2],
                                                         proc_id=proc_id))
            extend_lock_thread.join()
        if lock_identifier:
            log.debug('{proc_id}: {tbl} table, partition {p} releasing lock'
                      ''.format(tbl=table_tuple[0],
                                p=table_tuple[2],
                                proc_id=proc_id))
            self.release_table_backup_lock(lock_identifier)