def backup_instance(self):
    """ Back up a replica instance to s3 in csv """
    host_lock_handle = None
    try:
        log.info('Backup for instance {i} started at {t}'
                 ''.format(t=str(self.timestamp),
                           i=self.instance))
        log.info('Checking heartbeat to make sure replication is not too '
                 'lagged.')
        self.check_replication_for_backup()

        log.info('Taking host backup lock')
        host_lock_handle = host_utils.take_flock_lock(
            backup.BACKUP_LOCK_FILE)

        log.info('Setting up export directory structure')
        self.setup_and_get_tmp_path()
        log.info('Will temporarily dump inside of {path}'
                 ''.format(path=self.dump_base_path))

        log.info('Releasing any invalid shard backup locks')
        self.ensure_backup_locks_sanity()

        log.info('Deleting old expired locks')
        self.purge_old_expired_locks()

        log.info('Stopping replication SQL thread to get a snapshot')
        mysql_lib.stop_replication(self.instance,
                                   mysql_lib.REPLICATION_THREAD_SQL)

        workers = []
        for _ in range(multiprocessing.cpu_count() // 2):
            proc = multiprocessing.Process(
                target=self.mysql_backup_csv_dbs)
            proc.daemon = True
            proc.start()
            workers.append(proc)

        # Sleep briefly so all workers have started their dumps before
        # replication is restarted.
        time.sleep(2)
        log.info('Restarting replication')
        mysql_lib.start_replication(self.instance,
                                    mysql_lib.REPLICATION_THREAD_SQL)

        for worker in workers:
            worker.join()

        if not self.dbs_to_backup.empty():
            raise Exception('All worker processes have completed, but '
                            'work remains in the queue')

        log.info('CSV backup is complete, will run a check')
        mysql_backup_status.verify_csv_backup(self.instance.replica_type,
                                              self.datestamp,
                                              self.instance)
    finally:
        if host_lock_handle:
            log.info('Releasing general host backup lock')
            host_utils.release_flock_lock(host_lock_handle)
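

# A minimal sketch of what the mysql_backup_csv_dbs worker target used
# above could look like, assuming self.dbs_to_backup is a
# multiprocessing.Queue of database names that is fully populated before
# the workers start, and that self.mysql_backup_csv_db (hypothetical here)
# dumps a single database to csv. The actual worker is not part of this
# excerpt.
def mysql_backup_csv_dbs(self):
    """ Drain the shared queue of databases, dumping one at a time """
    import Queue  # Python 2 stdlib module; named "queue" on Python 3
    while True:
        try:
            db = self.dbs_to_backup.get(block=False)
        except Queue.Empty:
            # No work left; let this worker process exit cleanly
            return
        log.info('Dumping {db} to csv'.format(db=db))
        self.mysql_backup_csv_db(db)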


def start_shard_migration(source_replica_set, destination_replica_set,
                          mig_dbs):
    """ Move shards from one replica set to another

    Args:
    source_replica_set - Which replica set to take the shards from
    destination_replica_set - Which replica set to put the shards on
    mig_dbs - A set of databases to be migrated
    """
    # In 2017Q1 sharddb and modsharddb will learn how to deal with shard
    # migrations. We will block them for now.
    if source_replica_set.startswith('db') or \
            source_replica_set.startswith('moddb'):
        raise Exception('Sharddb and modsharddb migrations are not yet '
                        'supported')

    if source_replica_set == destination_replica_set:
        raise Exception('Source and destination can not be the same!')

    # Dealing with failures, potentially due to failovers, seems scary
    # here. We are intentionally not catching exceptions as this seems
    # racy and it would be far better for the entire process to fail than
    # to mess with replication during a failover.
    log.info('Requested to migrate from {s} to {d} databases: {db}'
             ''.format(s=source_replica_set,
                       d=destination_replica_set,
                       db=', '.join(mig_dbs)))

    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(
        source_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
    if not source_slave:
        source_slave = zk.get_mysql_instance_from_replica_set(
            source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    log.info('Source host for dumping data {}'.format(source_slave))

    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    log.info('Destination host for restoring data {}'
             ''.format(destination_master))

    expected_dbs_on_source = \
        zk.get_sharded_dbs_by_replica_set()[source_replica_set]
    non_mig_dbs = mysql_lib.get_dbs(source_slave).difference(mig_dbs)
    unexpected_dbs = mig_dbs.difference(expected_dbs_on_source)
    if unexpected_dbs:
        raise Exception('Unexpected database supplied for migration: {}'
                        ''.format(unexpected_dbs))

    # Make sure there are no missing or extra shards
    precheck_schema(source_master)
    precheck_schema(destination_master)

    # Check disk space
    required_disk_space = get_required_disk_space(mig_dbs, source_master)
    available_disk_space = \
        disk_space_available_for_migration(destination_master)
    if available_disk_space < required_disk_space:
        raise Exception('Insufficient disk space to migrate, '
                        'available {a}MB, '
                        'required {r}MB'
                        ''.format(a=available_disk_space,
                                  r=required_disk_space))
    else:
        log.info('Disk space looks ok: '
                 'available {a}MB, '
                 'required {r}MB'
                 ''.format(a=available_disk_space,
                           r=required_disk_space))

    # Take out a lock to make sure we don't have multiple migrations
    # running on the same replica sets (either source or destination).
    lock_id = take_migration_lock(source_replica_set,
                                  destination_replica_set,
                                  mig_dbs, non_mig_dbs)

    try:
        if non_mig_dbs:
            # First we will dump the schema for the shards that are
            # not moving
            log.info('Backing up non-migrating schema: '
                     '{}'.format(non_mig_dbs))
            no_mig_backup = backup.logical_backup_instance(
                source_slave, time.localtime(),
                blackhole=True, databases=non_mig_dbs)
            time.sleep(1)

        # And next the metadata db
        log.info('Backing up metadata db: '
                 '{}'.format(mysql_lib.METADATA_DB))
        metadata_backup = backup.logical_backup_instance(
            source_slave, time.localtime(),
            databases=[mysql_lib.METADATA_DB])
        time.sleep(1)

        # Next we will backup the data for the shards that are moving
        log.info('Backing up migrating schema data: {}'.format(mig_dbs))
        mig_backup = backup.logical_backup_instance(
            source_slave, time.localtime(), databases=mig_dbs)
    except:
        finish_migration_log(lock_id, STATUS_EXPORT_FAILED)
        raise

    if non_mig_dbs:
        # Finally import the backups
        log.info('Importing all the blackhole tables')
        mysql_restore.logical_restore(no_mig_backup, destination_master)

    log.info('Import metadata')
    mysql_restore.logical_restore(metadata_backup, destination_master)

    log.info('Setting up replication')
    mysql_lib.change_master(destination_master, source_master,
                            'BOGUS', 0,
                            no_start=True,
                            skip_set_readonly=True,
                            gtid_auto_pos=False)
    mysql_restore.logical_restore(mig_backup, destination_master)

    # Start the SQL thread and let the destination catch up
    mysql_lib.start_replication(destination_master)
    mysql_lib.wait_for_catch_up(destination_master, migration=True)

    # And update the log/locks
    update_migration_status(lock_id, STATUS_FAILOVER_READY)
    log.info('The migration is ready to be finished by running:')
    log.info('/usr/local/bin/mysql_utils/finish_shard_migration.py {src}'
             ''.format(src=source_replica_set))
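

# Example invocation, e.g. from a small CLI wrapper such as
# start_shard_migration.py; the replica set and shard names below are
# hypothetical placeholders.
if __name__ == '__main__':
    start_shard_migration(source_replica_set='generaldb001',
                          destination_replica_set='generaldb002',
                          mig_dbs=set(['shard0001', 'shard0002']))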


def backup_instance(self):
    """ Back up a replica instance to s3 in csv """
    log.info('Backup for instance {i} started at {t}'
             ''.format(t=str(self.timestamp),
                       i=self.instance))
    log.info('Checking heartbeat to make sure replication is not too '
             'lagged.')
    self.check_replication_for_backup()

    log.info('Taking host backup lock')
    host_lock = host_utils.bind_lock_socket(
        backup.CSV_BACKUP_LOCK_SOCKET)

    log.info('Setting up export directory structure')
    self.setup_and_get_tmp_path()
    log.info('Will temporarily dump inside of {path}'
             ''.format(path=self.dump_base_path))

    log.info('Releasing any invalid shard backup locks')
    self.ensure_backup_locks_sanity()

    log.info('Deleting old expired locks')
    self.purge_old_expired_locks()

    log.info('Stopping replication SQL thread to get a snapshot')
    mysql_lib.stop_replication(self.instance,
                               mysql_lib.REPLICATION_THREAD_SQL)

    # Start a consistent snapshot here and retrieve the id of the
    # session holding it.
    conn = mysql_lib.connect_mysql(self.instance,
                                   backup.USER_ROLE_MYSQLDUMP)
    mysql_lib.start_consistent_snapshot(conn, read_only=True)
    cursor = conn.cursor()
    cursor.execute('SET SESSION wait_timeout=28800')
    cursor.execute("SELECT VARIABLE_VALUE AS conn_id FROM "
                   "INFORMATION_SCHEMA.SESSION_VARIABLES "
                   "WHERE VARIABLE_NAME='pseudo_thread_id'")
    self.session_id = cursor.fetchone()['conn_id']

    workers = []
    for _ in range(multiprocessing.cpu_count() // 2):
        proc = multiprocessing.Process(
            target=self.mysql_backup_csv_tables)
        proc.daemon = True
        proc.start()
        workers.append(proc)

    # Sleep briefly so all workers have started their dumps before
    # replication is restarted.
    time.sleep(2)
    log.info('Restarting replication')
    mysql_lib.start_replication(self.instance,
                                mysql_lib.REPLICATION_THREAD_SQL)

    for worker in workers:
        worker.join()

    if not (self.tables_to_backup.empty() and
            self.tables_to_retry.empty()):
        raise Exception('All worker processes have completed, but '
                        'work remains in the queue')

    log.info('CSV backup is complete, will run a check')
    self.release_expired_locks()
    mysql_backup_status.verify_csv_instance_backup(self.instance,
                                                   self.datestamp,
                                                   self.dev_bucket)
    host_utils.release_lock_socket(host_lock)
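

# For context: the snapshot session id captured in backup_instance above
# is presumably shared with the worker processes so that every table dump
# reads from the same point in time. A minimal sketch of that idea,
# assuming Percona Server's snapshot-cloning syntax (START TRANSACTION
# WITH CONSISTENT SNAPSHOT FROM SESSION); attach_to_snapshot is a
# hypothetical helper, not part of this repo.
def attach_to_snapshot(conn, session_id):
    """ Clone an existing consistent snapshot into this session """
    cursor = conn.cursor()
    # FROM SESSION is Percona Server specific; int() guards the format
    cursor.execute('START TRANSACTION WITH CONSISTENT SNAPSHOT '
                   'FROM SESSION {sid}'.format(sid=int(session_id)))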