def test_restart_during_action(self):
  # Start up a master mysql and vttablet
  utils.run_vtctl('CreateKeyspace test_keyspace')

  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  utils.run_vtctl('RebuildShardGraph test_keyspace/0')
  utils.validate_topology()
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62344.start_vttablet()

  utils.run_vtctl('Ping ' + tablet_62344.tablet_alias)

  # schedule long action
  utils.run_vtctl('-no-wait Sleep %s 15s' % tablet_62344.tablet_alias,
                  stdout=utils.devnull)
  # ping blocks until the sleep finishes unless we have a schedule race
  action_path, _ = utils.run_vtctl('-no-wait Ping ' +
                                   tablet_62344.tablet_alias,
                                   trap_output=True)

  # kill agent leaving vtaction running
  tablet_62344.kill_vttablet()

  # restart agent
  tablet_62344.start_vttablet()

  # we expect this action with a short wait time to fail. this isn't the best
  # and has some potential for flakiness.
  utils.run_fail(utils.vtroot + '/bin/vtctl -log_dir ' + utils.tmp_root +
                 ' --alsologtostderr -wait-time 2s WaitForAction ' +
                 action_path)

  # wait until the background sleep action is done, otherwise there will be
  # a leftover vtaction whose result may overwrite running actions
  # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
  # the zombie Sleep clobbers the Clone command in the following tests
  utils.run_vtctl('-wait-time 20s WaitForAction ' + action_path,
                  auto_log=True)

  # extra small test: we ran for a while, get the states we were in,
  # make sure they're accounted for properly
  # first the query engine States
  v = utils.get_vars(tablet_62344.port)
  logging.debug("vars: %s" % str(v))
  if v['Voltron']['States']['DurationSERVING'] < 10e9:
    raise utils.TestError('not enough time in Open state',
                          v['Voltron']['States']['DurationSERVING'])
  # then the Zookeeper connections
  if v['ZkMetaConn']['test_nj']['Current'] != 'Connected':
    raise utils.TestError('invalid zk test_nj state: ',
                          v['ZkMetaConn']['test_nj']['Current'])
  if v['ZkMetaConn']['global']['Current'] != 'Connected':
    raise utils.TestError('invalid zk global state: ',
                          v['ZkMetaConn']['global']['Current'])
  if v['ZkMetaConn']['test_nj']['DurationConnected'] < 10e9:
    raise utils.TestError('not enough time in Connected state',
                          v['ZkMetaConn']['test_nj']['DurationConnected'])
  if v['TabletType'] != 'master':
    raise utils.TestError('TabletType not exported correctly')

  tablet_62344.kill_vttablet()
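
# The debug-vars assertions above repeat one pattern: walk a nested key
# path in the map returned by utils.get_vars() and fail if the value is
# below a floor (durations are exported in nanoseconds, so 10e9 is 10s).
# A minimal sketch of a helper capturing that pattern; _check_var_at_least
# is hypothetical, not part of utils:
def _check_var_at_least(v, path, minimum):
  value = v
  for key in path:
    value = value[key]
  if value < minimum:
    raise utils.TestError('var %s too low' % '.'.join(path), value)

# e.g. _check_var_at_least(v, ['Voltron', 'States', 'DurationSERVING'], 10e9)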
def run_test_rebuild(): utils.run_vtctl("CreateKeyspace -force test_keyspace") tablet_62344.init_tablet("master", "test_keyspace", "0") tablet_62044.init_tablet("replica", "test_keyspace", "0") tablet_31981.init_tablet("experimental", "test_keyspace", "0") # in ny by default utils.run_vtctl("RebuildKeyspaceGraph -cells=test_nj test_keyspace", auto_log=True) utils.run_fail(utils.vtroot + "/bin/zk cat /zk/test_ny/vt/ns/test_keyspace/0/master") utils.run_vtctl("RebuildKeyspaceGraph -cells=test_ny test_keyspace", auto_log=True) real_master = utils.zk_cat("/zk/test_nj/vt/ns/test_keyspace/0/master") master_alias = utils.zk_cat("/zk/test_ny/vt/ns/test_keyspace/0/master") if real_master != master_alias: raise utils.TestError("master serving graph in all cells failed:\n%s!=\n%s" % (real_master, master_alias))
def run_test_rebuild():
  utils.run_vtctl('CreateKeyspace -force test_keyspace')
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  tablet_62044.init_tablet('replica', 'test_keyspace', '0')
  tablet_31981.init_tablet('experimental', 'test_keyspace', '0')  # in ny by default

  utils.run_vtctl('RebuildKeyspaceGraph -cells=test_nj test_keyspace',
                  auto_log=True)
  utils.run_fail(utils.vtroot +
                 '/bin/zk cat /zk/test_ny/vt/ns/test_keyspace/0/master')

  utils.run_vtctl('RebuildKeyspaceGraph -cells=test_ny test_keyspace',
                  auto_log=True)
  real_master = utils.zk_cat('/zk/test_nj/vt/ns/test_keyspace/0/master')
  master_alias = utils.zk_cat('/zk/test_ny/vt/ns/test_keyspace/0/master')
  if real_master != master_alias:
    raise utils.TestError('master serving graph in all cells failed:\n%s!=\n%s' %
                          (real_master, master_alias))
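
# The serving-graph paths used above all follow one layout:
#   /zk/<cell>/vt/ns/<keyspace>/<shard>/<tablet type>
# A hypothetical convenience (not part of utils) shown only to make that
# convention explicit:
def _zk_serving_path(cell, keyspace, shard, tablet_type='master'):
  return '/zk/%s/vt/ns/%s/%s/%s' % (cell, keyspace, shard, tablet_type)

# e.g. utils.zk_cat(_zk_serving_path('test_nj', 'test_keyspace', '0'))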
def run_test_reparent_down_master():
  utils.zk_wipe()

  utils.run_vtctl('CreateKeyspace -force test_keyspace')

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True)
  tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True)
  tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True)

  # Recompute the shard layout node - until you do that, it might not be valid.
  utils.run_vtctl('RebuildShardGraph test_keyspace/0')
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are identical.
  utils.run_vtctl('ReparentShard -force test_keyspace/0 ' +
                  tablet_62344.tablet_alias, auto_log=True)
  utils.validate_topology()

  # Make the master agent and database unavailable.
  tablet_62344.kill_vttablet()
  tablet_62344.shutdown_mysql().wait()

  expected_addr = utils.hostname + ':' + str(tablet_62344.port)
  _check_db_addr('test_keyspace.0.master:_vtocc', expected_addr)

  # Perform a reparent operation - the Validate part will try to ping
  # the master and fail somewhat quickly
  stdout, stderr = utils.run_fail(utils.vtroot +
                                  '/bin/vtctl -logfile=/dev/null -log.level=INFO'
                                  ' -wait-time 5s ReparentShard test_keyspace/0 ' +
                                  tablet_62044.tablet_alias)
  utils.debug("Failed ReparentShard output:\n" + stderr)
  if 'ValidateShard verification failed: timed out during validate' not in stderr:
    raise utils.TestError("didn't find the right error strings in failed ReparentShard: " +
                          stderr)

  # Should timeout and fail
  stdout, stderr = utils.run_fail(utils.vtroot +
                                  '/bin/vtctl -logfile=/dev/null -log.level=INFO'
                                  ' -wait-time 5s ScrapTablet ' +
                                  tablet_62344.tablet_alias)
  utils.debug("Failed ScrapTablet output:\n" + stderr)
  if 'deadline exceeded' not in stderr:
    raise utils.TestError("didn't find the right error strings in failed ScrapTablet: " +
                          stderr)

  # Should interrupt and fail
  sp = utils.run_bg(utils.vtroot +
                    '/bin/vtctl -log.level=INFO -wait-time 10s ScrapTablet ' +
                    tablet_62344.tablet_alias, stdout=PIPE, stderr=PIPE)
  # Need time for the process to start before killing it.
  time.sleep(0.1)
  os.kill(sp.pid, signal.SIGINT)
  stdout, stderr = sp.communicate()
  utils.debug("Failed ScrapTablet output:\n" + stderr)
  if 'interrupted' not in stderr:
    raise utils.TestError("didn't find the right error strings in failed ScrapTablet: " +
                          stderr)

  # Force the scrap action in zk even though tablet is not accessible.
  tablet_62344.scrap(force=True)

  utils.run_fail(utils.vtroot +
                 '/bin/vtctl -logfile=/dev/null -log.level=WARNING'
                 ' ChangeSlaveType -force %s idle' %
                 tablet_62344.tablet_alias)

  # Remove pending locks (make this the force option to ReparentShard?)
  utils.run_vtctl('PurgeActions /zk/global/vt/keyspaces/test_keyspace/shards/0/action')

  # Re-run reparent operation, this should now proceed unimpeded.
  utils.run_vtctl('-wait-time 1m ReparentShard test_keyspace/0 ' +
                  tablet_62044.tablet_alias, auto_log=True)

  utils.validate_topology()
  expected_addr = utils.hostname + ':' + str(tablet_62044.port)
  _check_db_addr('test_keyspace.0.master:_vtocc', expected_addr)

  utils.run_vtctl('ChangeSlaveType -force %s idle' % tablet_62344.tablet_alias)

  idle_tablets, _ = utils.run_vtctl('ListAllTablets test_nj', trap_output=True)
  if '0000062344 <null> <null> idle' not in idle_tablets:
    raise utils.TestError('idle tablet not found', idle_tablets)

  tablet_62044.kill_vttablet()
  tablet_41983.kill_vttablet()
  tablet_31981.kill_vttablet()

  # so the other tests don't have any surprises
  tablet_62344.start_mysql().wait()
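
# The failed-command checks above all share one shape: run vtctl, expect
# a non-zero exit, and assert stderr mentions a known error string. A
# minimal sketch of that pattern as a helper; _expect_vtctl_error is
# hypothetical, not part of utils:
def _expect_vtctl_error(args, expected, wait_time='5s'):
  stdout, stderr = utils.run_fail(
      utils.vtroot +
      '/bin/vtctl -logfile=/dev/null -log.level=INFO -wait-time %s %s' %
      (wait_time, args))
  if expected not in stderr:
    raise utils.TestError("didn't find %r in failed vtctl output: %s" %
                          (expected, stderr))

# e.g. _expect_vtctl_error('ScrapTablet ' + tablet_62344.tablet_alias,
#                          'deadline exceeded')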
def test_resharding(self):
  utils.run_vtctl('CreateKeyspace test_keyspace')

  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')

  utils.run_vtctl(
      'RebuildShardGraph /zk/global/vt/keyspaces/test_keyspace/shards/*',
      auto_log=True)
  utils.run_vtctl(
      'RebuildKeyspaceGraph /zk/global/vt/keyspaces/test_keyspace',
      auto_log=True)

  # create databases so vttablet can start behaving normally
  shard_0_master.create_db('vt_test_keyspace')
  shard_0_replica.create_db('vt_test_keyspace')
  shard_1_master.create_db('vt_test_keyspace')
  shard_1_slave1.create_db('vt_test_keyspace')
  shard_1_slave2.create_db('vt_test_keyspace')

  # start the tablets
  shard_0_master.start_vttablet()
  shard_0_replica.start_vttablet()
  shard_1_master.start_vttablet()
  shard_1_slave1.start_vttablet()
  shard_1_slave2.start_vttablet(wait_for_state='NOT_SERVING')  # spare

  # reparent to make the tablets work
  utils.run_vtctl('ReparentShard -force test_keyspace/-80 ' +
                  shard_0_master.tablet_alias, auto_log=True)
  utils.run_vtctl('ReparentShard -force test_keyspace/80- ' +
                  shard_1_master.tablet_alias, auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-C0')
  shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'C0-')
  shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  shard_2_master.start_vttablet(wait_for_state='CONNECTING')
  shard_2_replica1.start_vttablet(wait_for_state='NOT_SERVING')
  shard_2_replica2.start_vttablet(wait_for_state='NOT_SERVING')
  shard_3_master.start_vttablet(wait_for_state='CONNECTING')
  shard_3_replica.start_vttablet(wait_for_state='NOT_SERVING')

  utils.run_vtctl('ReparentShard -force test_keyspace/80-C0 ' +
                  shard_2_master.tablet_alias, auto_log=True)
  utils.run_vtctl('ReparentShard -force test_keyspace/C0- ' +
                  shard_3_master.tablet_alias, auto_log=True)

  utils.run_vtctl(
      'RebuildShardGraph /zk/global/vt/keyspaces/test_keyspace/shards/*',
      auto_log=True)
  utils.run_vtctl('RebuildKeyspaceGraph -use-served-types test_keyspace',
                  auto_log=True)
  self._check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')

  # take the snapshot for the split
  utils.run_vtctl('MultiSnapshot --spec=80-C0- %s keyspace_id' %
                  (shard_1_slave1.tablet_alias), auto_log=True)

  # wait for tablet's binlog server service to be enabled after snapshot,
  # and check all the others while we're at it
  self._wait_for_binlog_server_state(shard_1_master, "Disabled")
  self._wait_for_binlog_server_state(shard_1_slave1, "Enabled")

  # perform the restore.
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/80-C0', shard_1_slave1.tablet_alias],
                  auto_log=True)
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/C0-', shard_1_slave1.tablet_alias],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl('ValidateSchemaKeyspace test_keyspace', auto_log=True)

  # check the binlog players are running
  self._wait_for_binlog_player_count(shard_2_master, 1)
  self._wait_for_binlog_player_count(shard_3_master, 1)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
  monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('OPEN')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

  # test data goes through again
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000, base=1000)
  logging.debug("Checking 80 percent of data was sent quickly")
  self._check_lots_timeout(1000, 80, 5, base=1000)

  # now serve rdonly from the split shards
  utils.run_fail(utils.vtroot +
                 '/bin/vtctl MigrateServedTypes test_keyspace/80- master')
  utils.run_vtctl('MigrateServedTypes test_keyspace/80- rdonly',
                  auto_log=True)
  self._check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')

  # then serve replica from the split shards
  utils.run_vtctl('MigrateServedTypes test_keyspace/80- replica',
                  auto_log=True)
  self._check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,replica')

  # move replica back and forth
  utils.run_vtctl('MigrateServedTypes -reverse test_keyspace/80- replica',
                  auto_log=True)
  self._check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')
  utils.run_vtctl('MigrateServedTypes test_keyspace/80- replica',
                  auto_log=True)
  self._check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,replica')

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl('ReparentShard test_keyspace/80-C0 %s' %
                  shard_2_replica1.tablet_alias)
  logging.debug("Inserting lots of data on source shard after reparenting")
  self._insert_lots(3000, base=2000)
  logging.debug("Checking 80 percent of data was sent fairly quickly")
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                monitor_thread_1.object_name,
                monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                monitor_thread_2.object_name,
                monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # then serve master from the split shards
  utils.run_vtctl('MigrateServedTypes test_keyspace/80- master',
                  auto_log=True)
  self._check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-C0 C0-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,replica')

  # check the binlog players are gone now
  self._wait_for_binlog_player_count(shard_2_master, 0)
  self._wait_for_binlog_player_count(shard_3_master, 0)

  # kill everything
  for t in [shard_0_master, shard_0_replica,
            shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica]:
    t.kill_vttablet()
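
# For reference, how the sentinel keyspace_ids used by the insert threads
# land in the split shards: shard names are hex prefixes of the 64-bit
# keyspace_id space, so '80-C0' covers [0x80.., 0xC0..) and 'C0-' covers
# the rest. An illustrative sketch, assuming unsigned 64-bit ids; not a
# helper from this test suite:
def _shard_for_keyspace_id(keyspace_id):
  if keyspace_id < 0x8000000000000000:
    return '-80'
  elif keyspace_id < 0xC000000000000000:
    return '80-C0'
  return 'C0-'

assert _shard_for_keyspace_id(0x9000000000000000) == '80-C0'  # insert_low
assert _shard_for_keyspace_id(0xD000000000000000) == 'C0-'    # insert_high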