def _test_vtctl_copyschemashard(self, source):
  # Apply initial schema to the whole keyspace before creating shard 2.
  self._apply_initial_schema()
  _setup_shard_2()

  try:
    # InitShardMaster creates the db, but there shouldn't be any tables yet.
    self._check_tables(shard_2_master, 0)
    self._check_tables(shard_2_replica1, 0)

    # Run the command twice to make sure it's idempotent.
    for _ in range(2):
      utils.run_vtctl(['CopySchemaShard', source, 'test_keyspace/2'],
                      auto_log=True)

      # shard_2_master should look the same as the replica we copied from
      self._check_tables(shard_2_master, 4)
      utils.wait_for_replication_pos(shard_2_master, shard_2_replica1)
      self._check_tables(shard_2_replica1, 4)
      shard_0_schema = self._get_schema(shard_0_master.tablet_alias)
      shard_2_schema = self._get_schema(shard_2_master.tablet_alias)
      self.assertEqual(shard_0_schema, shard_2_schema)
  finally:
    _teardown_shard_2()
def external_reparent():
  # Demote master.
  master.mquery('', mysql_flavor().demote_master_commands())
  if master.semi_sync_enabled():
    master.set_semi_sync_enabled(master=False)

  # Wait for replica to catch up to master.
  utils.wait_for_replication_pos(master, replica)

  # Promote replica to new master.
  replica.mquery('', mysql_flavor().promote_slave_commands())
  if replica.semi_sync_enabled():
    replica.set_semi_sync_enabled(master=True)

  old_master = master
  new_master = replica

  # Configure old master to use new master.
  new_pos = mysql_flavor().master_position(new_master)
  logging.debug('New master position: %s', str(new_pos))
  # Use 'localhost' as hostname because Travis CI worker hostnames
  # are too long for MySQL replication.
  change_master_cmds = mysql_flavor().change_master_commands(
      'localhost', new_master.mysql_port, new_pos)
  old_master.mquery('', ['RESET SLAVE'] + change_master_cmds +
                    ['START SLAVE'])

  # Notify the new vttablet master about the reparent.
  utils.run_vtctl(
      ['TabletExternallyReparented', new_master.tablet_alias])
def setUp(self): """Creates the necessary shards, starts the tablets, and inserts some data.""" self.run_shard_tablets('0', shard_tablets) # create the split shards self.run_shard_tablets('-80', shard_0_tablets, create_db=False, create_table=False, wait_state='NOT_SERVING') self.run_shard_tablets('80-', shard_1_tablets, create_db=False, create_table=False, wait_state='NOT_SERVING') # Copy the schema to the destination shards for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) logging.debug("Start inserting initial data: %s rows", utils.options.num_insert_rows) self.insert_values(shard_master, utils.options.num_insert_rows, 2) logging.debug( "Done inserting initial data, waiting for replication to catch up") utils.wait_for_replication_pos(shard_master, shard_rdonly1) logging.debug("Replication on source rdonly tablet is caught up")
def setUp(self): """Creates shards, starts the tablets, and inserts some data.""" self.run_shard_tablets("0", all_shard_tablets) # create the split shards self.run_shard_tablets("-80", shard_0_tablets, create_db=False, create_table=False, wait_state="NOT_SERVING") self.run_shard_tablets("80-", shard_1_tablets, create_db=False, create_table=False, wait_state="NOT_SERVING") logging.debug("Start inserting initial data: %s rows", utils.options.num_insert_rows) self.insert_values(shard_master, utils.options.num_insert_rows, 2) logging.debug("Done inserting initial data, waiting for replication to catch up") utils.wait_for_replication_pos(shard_master, shard_rdonly1) logging.debug("Replication on source rdonly tablet is caught up")
def setUp(self): """Creates shards, starts the tablets, and inserts some data.""" self.run_shard_tablets('0', shard_tablets) # create the split shards self.run_shard_tablets('-80', shard_0_tablets, create_db=False, create_table=False, wait_state='NOT_SERVING') self.run_shard_tablets('80-', shard_1_tablets, create_db=False, create_table=False, wait_state='NOT_SERVING') logging.debug( 'Start inserting initial data: %s rows', utils.options.num_insert_rows) self.insert_values(shard_master, utils.options.num_insert_rows, 2) logging.debug( 'Done inserting initial data, waiting for replication to catch up') utils.wait_for_replication_pos(shard_master, shard_rdonly1) logging.debug('Replication on source rdonly tablet is caught up')
def external_reparent(self):
  # Demote master.
  start = time.time()
  master.mquery('', mysql_flavor().demote_master_commands(), log_query=True)
  if master.semi_sync_enabled():
    master.set_semi_sync_enabled(master=False)

  # Wait for replica to catch up to master.
  utils.wait_for_replication_pos(master, replica)

  # Wait for at least one second to artificially prolong the failover and
  # give the buffer a chance to observe it.
  d = time.time() - start
  min_unavailability_s = 1
  if d < min_unavailability_s:
    w = min_unavailability_s - d
    logging.debug(
        'Waiting for %.1f seconds because the failover was too fast'
        ' (took only %.3f seconds)', w, d)
    time.sleep(w)

  # Promote replica to new master.
  replica.mquery('', mysql_flavor().promote_slave_commands(), log_query=True)
  if replica.semi_sync_enabled():
    replica.set_semi_sync_enabled(master=True)

  old_master = master
  new_master = replica

  # Configure old master to use new master.
  new_pos = mysql_flavor().master_position(new_master)
  logging.debug('New master position: %s', str(new_pos))
  # Use 'localhost' as hostname because Travis CI worker hostnames
  # are too long for MySQL replication.
  change_master_cmds = mysql_flavor().change_master_commands(
      'localhost', new_master.mysql_port, new_pos)
  old_master.mquery('', ['RESET SLAVE'] + change_master_cmds +
                    ['START SLAVE'], log_query=True)

  # Notify the new vttablet master about the reparent.
  utils.run_vtctl(
      ['TabletExternallyReparented', new_master.tablet_alias], auto_log=True)
def setUp(self): """Creates shards, starts the tablets, and inserts some data.""" try: self.run_shard_tablets('0', all_shard_tablets) # create the split shards self.run_shard_tablets( '-80', shard_0_tablets, create_table=False) self.run_shard_tablets( '80-', shard_1_tablets, create_table=False) logging.debug( 'Start inserting initial data: %s rows', utils.options.num_insert_rows) self.insert_values(shard_master, utils.options.num_insert_rows, 2) logging.debug( 'Done inserting initial data, waiting for replication to catch up') utils.wait_for_replication_pos(shard_master, shard_rdonly1) logging.debug('Replication on source rdonly tablet is caught up') except: self.tearDown()
def setUp(self): """Creates shards, starts the tablets, and inserts some data.""" try: self.run_shard_tablets('0', all_shard_tablets) # create the split shards self.run_shard_tablets('-80', shard_0_tablets, create_table=False) self.run_shard_tablets('80-', shard_1_tablets, create_table=False) logging.debug('Start inserting initial data: %s rows', self.num_insert_rows) self.insert_values(shard_master, self.num_insert_rows, 2) logging.debug( 'Done inserting initial data, waiting for replication to catch up' ) utils.wait_for_replication_pos(shard_master, shard_rdonly1) logging.debug('Replication on source rdonly tablet is caught up') except: self.tearDown() raise
def external_reparent(self):
  # Demote master.
  start = time.time()
  master.mquery('', mysql_flavor().demote_master_commands(), log_query=True)
  if master.semi_sync_enabled():
    master.set_semi_sync_enabled(master=False)

  # Wait for replica to catch up to master.
  utils.wait_for_replication_pos(master, replica)

  # Wait for at least one second to artificially prolong the failover and
  # give the buffer a chance to observe it.
  d = time.time() - start
  min_unavailability_s = 1
  if d < min_unavailability_s:
    w = min_unavailability_s - d
    logging.debug('Waiting for %.1f seconds because the failover was too fast'
                  ' (took only %.3f seconds)', w, d)
    time.sleep(w)

  # Promote replica to new master.
  replica.mquery('', mysql_flavor().promote_slave_commands(), log_query=True)
  if replica.semi_sync_enabled():
    replica.set_semi_sync_enabled(master=True)

  old_master = master
  new_master = replica

  # Configure old master to use new master.
  new_pos = mysql_flavor().master_position(new_master)
  logging.debug('New master position: %s', str(new_pos))
  # Use 'localhost' as hostname because Travis CI worker hostnames
  # are too long for MySQL replication.
  change_master_cmds = mysql_flavor().change_master_commands(
      'localhost', new_master.mysql_port, new_pos)
  old_master.mquery('', ['RESET SLAVE'] + change_master_cmds +
                    ['START SLAVE'], log_query=True)

  # Notify the new vttablet master about the reparent.
  utils.run_vtctl(['TabletExternallyReparented', new_master.tablet_alias],
                  auto_log=True)
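# Illustrative sketch only: both external_reparent() variants above rely on
# utils.wait_for_replication_pos() to make sure the replica has caught up
# before it is promoted. The helper below shows what such a wait could look
# like. It assumes mysql_flavor().master_position() (already used in this
# file) and a position_at_least() comparison method; the comparison method,
# polling interval, and timeout handling here are assumptions, not the actual
# utils implementation.
def _wait_for_replication_pos_sketch(source_tablet, dest_tablet, timeout=60.0):
  # Snapshot the source position once, then poll the destination until it has
  # replicated at least that far or the timeout expires.
  source_pos = mysql_flavor().master_position(source_tablet)
  deadline = time.time() + timeout
  while time.time() < deadline:
    dest_pos = mysql_flavor().master_position(dest_tablet)
    if mysql_flavor().position_at_least(dest_pos, source_pos):  # assumed API
      return
    time.sleep(0.5)
  raise Exception('%s did not catch up to position %s within %.0f seconds' %
                  (dest_tablet.tablet_alias, source_pos, timeout))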
def _test_vtctl_copyschemashard(self, source): self._apply_initial_schema() self._setUp_tablets_shard_2() # CopySchemaShard is responsible for creating the db; one shouldn't exist before # the command is run. self._check_db_not_created(shard_2_master) self._check_db_not_created(shard_2_replica1) # Run the command twice to make sure it's idempotent. for _ in range(2): utils.run_vtctl(["CopySchemaShard", source, "test_keyspace/2"], auto_log=True) # shard_2_master should look the same as the replica we copied from self._check_tables(shard_2_master, 4) utils.wait_for_replication_pos(shard_2_master, shard_2_replica1) self._check_tables(shard_2_replica1, 4) shard_0_schema = self._get_schema(shard_0_master.tablet_alias) shard_2_schema = self._get_schema(shard_2_master.tablet_alias) self.assertEqual(shard_0_schema, shard_2_schema)
def _test_vtctl_copyschemashard(self, source):
  self._apply_initial_schema()
  self._setUp_tablets_shard_2()

  # CopySchemaShard is responsible for creating the db; one shouldn't exist
  # before the command is run.
  self._check_db_not_created(shard_2_master)
  self._check_db_not_created(shard_2_replica1)

  # Run the command twice to make sure it's idempotent.
  for _ in range(2):
    utils.run_vtctl(['CopySchemaShard', source, 'test_keyspace/2'],
                    auto_log=True)

    # shard_2_master should look the same as the replica we copied from
    self._check_tables(shard_2_master, 4)
    utils.wait_for_replication_pos(shard_2_master, shard_2_replica1)
    self._check_tables(shard_2_replica1, 4)
    shard_0_schema = self._get_schema(shard_0_master.tablet_alias)
    shard_2_schema = self._get_schema(shard_2_master.tablet_alias)
    self.assertEqual(shard_0_schema, shard_2_schema)
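# Illustrative sketch only: the copy-schema tests above rely on helpers such
# as _check_tables() and _get_schema(). Minimal versions are sketched below,
# assuming each tablet object exposes mquery() (used elsewhere in this file)
# and a JSON-returning vtctl wrapper; the helper names, the 'vt_test_keyspace'
# database name, and the run_vtctl_json call are assumptions for illustration.
def _check_tables_sketch(self, tablet_obj, expected_count):
  # Count the tables in the test database and compare to the expectation.
  tables = tablet_obj.mquery('vt_test_keyspace', 'show tables')
  self.assertEqual(
      len(tables), expected_count,
      'unexpected table count on %s: expected %d, got %d' %
      (tablet_obj.tablet_alias, expected_count, len(tables)))

def _get_schema_sketch(self, tablet_alias):
  # Fetch the full schema definition so two tablets can be compared directly.
  return utils.run_vtctl_json(['GetSchema', tablet_alias])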
def setUp(self): """Creates the necessary shards, starts the tablets, and inserts some data.""" self.run_shard_tablets('0', shard_tablets) # create the split shards self.run_shard_tablets('-80', shard_0_tablets, create_db=False, create_table=False, wait_state='NOT_SERVING') self.run_shard_tablets('80-', shard_1_tablets, create_db=False, create_table=False, wait_state='NOT_SERVING') # Copy the schema to the destination shards for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard], auto_log=True) logging.debug("Start inserting initial data: %s rows", utils.options.num_insert_rows) self.insert_values(shard_master, utils.options.num_insert_rows, 2) logging.debug("Done inserting initial data, waiting for replication to catch up") utils.wait_for_replication_pos(shard_master, shard_rdonly1) logging.debug("Replication on source rdonly tablet is caught up")
def _test_reparent_from_outside(self, brutal=False): """This test will start a master and 3 slaves. Then: - one slave will be the new master - one slave will be reparented to that new master - one slave will be busted and dead in the water and we'll call TabletExternallyReparented. Args: brutal: kills the old master first """ utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) # create the database so vttablets start, as they are serving for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]: t.create_db('vt_test_keyspace') # Start up a master mysql and vttablet tablet_62344.init_tablet('replica', 'test_keyspace', '0', start=True, wait_for_start=False) # Create a few slaves for testing reparenting. tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True, wait_for_start=False) tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True, wait_for_start=False) tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True, wait_for_start=False) # wait for all tablets to start for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]: t.wait_for_vttablet_state('NOT_SERVING') # Reparent as a starting point utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', tablet_62344.tablet_alias ], auto_log=True) # now manually reparent 1 out of 2 tablets # 62044 will be the new master # 31981 won't be re-parented, so it will be busted # Shutdown the old master first. if not brutal: tablet_62344.mquery('', mysql_flavor().demote_master_commands()) # Get the position of the old master and wait for the new one to catch up. utils.wait_for_replication_pos(tablet_62344, tablet_62044) # Promote the new master. tablet_62044.mquery('', mysql_flavor().promote_slave_commands()) new_pos = mysql_flavor().master_position(tablet_62044) logging.debug('New master position: %s', str(new_pos)) # Use 'localhost' as hostname because Travis CI worker hostnames # are too long for MySQL replication. change_master_cmds = mysql_flavor().change_master_commands( 'localhost', tablet_62044.mysql_port, new_pos) # 62344 will now be a slave of 62044 tablet_62344.mquery('', ['RESET MASTER', 'RESET SLAVE'] + change_master_cmds + ['START SLAVE']) # 41983 will be a slave of 62044 tablet_41983.mquery('', ['STOP SLAVE'] + change_master_cmds + ['START SLAVE']) # in brutal mode, we kill the old master first # and delete its tablet record if brutal: tablet_62344.kill_vttablet() utils.run_vtctl( ['DeleteTablet', '-allow_master', tablet_62344.tablet_alias], auto_log=True) base_time = time.time() # update topology with the new server utils.run_vtctl( ['TabletExternallyReparented', tablet_62044.tablet_alias], mode=utils.VTCTL_VTCTL, auto_log=True) self._test_reparent_from_outside_check(brutal, base_time) if not brutal: tablet_62344.kill_vttablet() tablet.kill_tablets([tablet_31981, tablet_62044, tablet_41983])
def _test_reparent_from_outside(self, brutal=False): """This test will start a master and 3 slaves. Then: - one slave will be the new master - one slave will be reparented to that new master - one slave will be busted and dead in the water and we'll call TabletExternallyReparented. Args: brutal: kills the old master first """ utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) # create the database so vttablets start, as they are serving for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]: t.create_db('vt_test_keyspace') # Start up a master mysql and vttablet tablet_62344.init_tablet('replica', 'test_keyspace', '0', start=True, wait_for_start=False) # Create a few slaves for testing reparenting. tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True, wait_for_start=False) tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True, wait_for_start=False) tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True, wait_for_start=False) # wait for all tablets to start for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]: t.wait_for_vttablet_state('NOT_SERVING') # Reparent as a starting point utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0', tablet_62344.tablet_alias], auto_log=True) # now manually reparent 1 out of 2 tablets # 62044 will be the new master # 31981 won't be re-parented, so it will be busted # Shutdown the old master first. if not brutal: tablet_62344.mquery('', mysql_flavor().demote_master_commands()) # Get the position of the old master and wait for the new one to catch up. utils.wait_for_replication_pos(tablet_62344, tablet_62044) # Promote the new master. tablet_62044.mquery('', mysql_flavor().promote_slave_commands()) new_pos = mysql_flavor().master_position(tablet_62044) logging.debug('New master position: %s', str(new_pos)) # Use 'localhost' as hostname because Travis CI worker hostnames # are too long for MySQL replication. change_master_cmds = mysql_flavor().change_master_commands( 'localhost', tablet_62044.mysql_port, new_pos) # 62344 will now be a slave of 62044 tablet_62344.mquery('', ['RESET MASTER', 'RESET SLAVE'] + change_master_cmds + ['START SLAVE']) # 41983 will be a slave of 62044 tablet_41983.mquery('', ['STOP SLAVE'] + change_master_cmds + ['START SLAVE']) # in brutal mode, we kill the old master first # and delete its tablet record if brutal: tablet_62344.kill_vttablet() utils.run_vtctl(['DeleteTablet', '-allow_master', tablet_62344.tablet_alias], auto_log=True) base_time = time.time() # update topology with the new server utils.run_vtctl(['TabletExternallyReparented', tablet_62044.tablet_alias], mode=utils.VTCTL_VTCTL, auto_log=True) self._test_reparent_from_outside_check(brutal, base_time) if not brutal: tablet_62344.kill_vttablet() tablet.kill_tablets([tablet_31981, tablet_62044, tablet_41983])
def verify_successful_worker_copy_with_reparent(self, mysql_down=False): """Verifies that vtworker can successfully copy data for a SplitClone. Order of operations: 1. Run a background vtworker 2. Wait until the worker successfully resolves the destination masters. 3. Reparent the destination tablets 4. Wait until the vtworker copy is finished 5. Verify that the worker was forced to reresolve topology and retry writes due to the reparent. 6. Verify that the data was copied successfully to both new shards Args: mysql_down: boolean. If True, we take down the MySQL instances on the destination masters at first, then bring them back and reparent away. Raises: AssertionError if things didn't go as expected. """ if mysql_down: logging.debug('Shutting down mysqld on destination masters.') utils.wait_procs([ shard_0_master.shutdown_mysql(), shard_1_master.shutdown_mysql() ]) worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj'], auto_log=True) # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # --chunk_count is 2 because rows are currently ordered by primary key such # that all rows of the first shard come first and then the second shard. # TODO(mberlin): Remove --offline=false once vtworker ensures that the # destination shards are not behind the master's replication # position. args = [ 'SplitClone', '--offline=false', '--destination_writer_count', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999' ] if not mysql_down: # Make the clone as slow as necessary such that there is enough time to # run PlannedReparent in the meantime. # TOOD(mberlin): Once insert_values is fixed to uniformly distribute the # rows across shards when sorted by primary key, remove # --chunk_count 2, --min_rows_per_chunk 1 and set # --source_reader_count back to 1. args.extend([ '--source_reader_count', '2', '--chunk_count', '2', '--min_rows_per_chunk', '1', '--write_query_max_rows', '1' ]) args.append('test_keyspace/0') workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port) if mysql_down: # If MySQL is down, we wait until vtworker retried at least once to make # sure it reached the point where a write failed due to MySQL being down. # There should be two retries at least, one for each destination shard. utils.poll_for_vars( 'vtworker', worker_port, 'WorkerRetryCount >= 2', condition_fn=lambda v: v.get('WorkerRetryCount') >= 2) logging.debug( 'Worker has retried at least twice, starting reparent now') # vtworker is blocked at this point. This is a good time to test that its # throttler server is reacting to RPCs. self.check_throttler_service( 'localhost:%d' % worker_rpc_port, ['test_keyspace/-80', 'test_keyspace/80-'], 9999) # Bring back masters. Since we test with semi-sync now, we need at least # one replica for the new master. This test is already quite expensive, # so we bring back the old master as a replica rather than having a third # replica up the whole time. logging.debug('Restarting mysqld on destination masters') utils.wait_procs( [shard_0_master.start_mysql(), shard_1_master.start_mysql()]) # Reparent away from the old masters. 
utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/-80', shard_0_replica.tablet_alias ], auto_log=True) utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-', shard_1_replica.tablet_alias ], auto_log=True) else: # NOTE: There is a race condition around this: # It's possible that the SplitClone vtworker command finishes before the # PlannedReparentShard vtctl command, which we start below, succeeds. # Then the test would fail because vtworker did not have to retry. # # To workaround this, the test takes a parameter to increase the number of # rows that the worker has to copy (with the idea being to slow the worker # down). # You should choose a value for num_insert_rows, such that this test # passes for your environment (trial-and-error...) # Make sure that vtworker got past the point where it picked a master # for each destination shard ("finding targets" state). utils.poll_for_vars( 'vtworker', worker_port, 'WorkerState == cloning the data (online)', condition_fn=lambda v: v.get('WorkerState') == 'cloning the' ' data (online)') logging.debug('Worker is in copy state, starting reparent now') utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/-80', shard_0_replica.tablet_alias ], auto_log=True) utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-', shard_1_replica.tablet_alias ], auto_log=True) utils.wait_procs([workerclient_proc]) # Verify that we were forced to re-resolve and retry. worker_vars = utils.get_vars(worker_port) self.assertGreater( worker_vars['WorkerRetryCount'], 1, "expected vtworker to retry each of the two reparented" " destination masters at least once, but it didn't") self.assertNotEqual(worker_vars['WorkerRetryCount'], {}, "expected vtworker to retry, but it didn't") utils.kill_sub_process(worker_proc, soft=True) # Wait for the destination RDONLYs to catch up or the following offline # clone will try to insert rows which already exist. # TODO(mberlin): Remove this once SplitClone supports it natively. utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1) utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1) # Run final offline clone to enable filtered replication. _, _ = utils.run_vtworker([ '-cell', 'test_nj', 'SplitClone', '--online=false', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) # Make sure that everything is caught up to the same replication point self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets) self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets) self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica) self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
def verify_successful_worker_copy_with_reparent(self, mysql_down=False): """Verifies that vtworker can successfully copy data for a SplitClone. Order of operations: 1. Run a background vtworker 2. Wait until the worker successfully resolves the destination masters. 3. Reparent the destination tablets 4. Wait until the vtworker copy is finished 5. Verify that the worker was forced to reresolve topology and retry writes due to the reparent. 6. Verify that the data was copied successfully to both new shards Args: mysql_down: boolean. If True, we take down the MySQL instances on the destination masters at first, then bring them back and reparent away. Raises: AssertionError if things didn't go as expected. """ if mysql_down: logging.debug('Shutting down mysqld on destination masters.') utils.wait_procs( [shard_0_master.shutdown_mysql(), shard_1_master.shutdown_mysql()]) worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--use_v3_resharding_mode=false'], auto_log=True) # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # --chunk_count is 2 because rows are currently ordered by primary key such # that all rows of the first shard come first and then the second shard. # TODO(mberlin): Remove --offline=false once vtworker ensures that the # destination shards are not behind the master's replication # position. args = ['SplitClone', '--offline=false', '--destination_writer_count', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999'] if not mysql_down: # Make the clone as slow as necessary such that there is enough time to # run PlannedReparent in the meantime. # TODO(mberlin): Once insert_values is fixed to uniformly distribute the # rows across shards when sorted by primary key, remove # --chunk_count 2, --min_rows_per_chunk 1 and set # --source_reader_count back to 1. args.extend(['--source_reader_count', '2', '--chunk_count', '2', '--min_rows_per_chunk', '1', '--write_query_max_rows', '1']) args.append('test_keyspace/0') workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port) if mysql_down: # If MySQL is down, we wait until vtworker retried at least once to make # sure it reached the point where a write failed due to MySQL being down. # There should be two retries at least, one for each destination shard. utils.poll_for_vars( 'vtworker', worker_port, 'WorkerRetryCount >= 2', condition_fn=lambda v: v.get('WorkerRetryCount') >= 2) logging.debug('Worker has retried at least twice, starting reparent now') # vtworker is blocked at this point. This is a good time to test that its # throttler server is reacting to RPCs. self.check_throttler_service('localhost:%d' % worker_rpc_port, ['test_keyspace/-80', 'test_keyspace/80-'], 9999) # Bring back masters. Since we test with semi-sync now, we need at least # one replica for the new master. This test is already quite expensive, # so we bring back the old master as a replica rather than having a third # replica up the whole time. logging.debug('Restarting mysqld on destination masters') utils.wait_procs( [shard_0_master.start_mysql(), shard_1_master.start_mysql()]) # Reparent away from the old masters. 
utils.run_vtctl( ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80', '-new_master', shard_0_replica.tablet_alias], auto_log=True) utils.run_vtctl( ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-', '-new_master', shard_1_replica.tablet_alias], auto_log=True) else: # NOTE: There is a race condition around this: # It's possible that the SplitClone vtworker command finishes before the # PlannedReparentShard vtctl command, which we start below, succeeds. # Then the test would fail because vtworker did not have to retry. # # To workaround this, the test takes a parameter to increase the number of # rows that the worker has to copy (with the idea being to slow the worker # down). # You should choose a value for num_insert_rows, such that this test # passes for your environment (trial-and-error...) # Make sure that vtworker got past the point where it picked a master # for each destination shard ("finding targets" state). utils.poll_for_vars( 'vtworker', worker_port, 'WorkerState == cloning the data (online)', condition_fn=lambda v: v.get('WorkerState') == 'cloning the' ' data (online)') logging.debug('Worker is in copy state, starting reparent now') utils.run_vtctl( ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80', '-new_master', shard_0_replica.tablet_alias], auto_log=True) utils.run_vtctl( ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-', '-new_master', shard_1_replica.tablet_alias], auto_log=True) utils.wait_procs([workerclient_proc]) # Verify that we were forced to re-resolve and retry. worker_vars = utils.get_vars(worker_port) self.assertGreater(worker_vars['WorkerRetryCount'], 1, "expected vtworker to retry each of the two reparented" " destination masters at least once, but it didn't") self.assertNotEqual(worker_vars['WorkerRetryCount'], {}, "expected vtworker to retry, but it didn't") utils.kill_sub_process(worker_proc, soft=True) # Wait for the destination RDONLYs to catch up or the following offline # clone will try to insert rows which already exist. # TODO(mberlin): Remove this once SplitClone supports it natively. utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1) utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1) # Run final offline clone to enable filtered replication. _, _ = utils.run_vtworker(['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitClone', '--online=false', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'], auto_log=True) # Make sure that everything is caught up to the same replication point self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets) self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets) self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica) self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)