def verify(self):
  self.assert_shard_data_equal(0, worker.shard_master,
                               worker.shard_0_tablets.replica)
  self.assert_shard_data_equal(1, worker.shard_master,
                               worker.shard_1_tablets.replica)

  # Verify effect of MigrateServedTypes. Dest shards are serving now.
  utils.check_srv_keyspace(
      'test_nj', self.KEYSPACE,
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n')

  # source shard: query service must be disabled after MigrateServedTypes.
  source_shards = [
      worker.shard_rdonly1,
      worker.shard_replica,
      worker.shard_master
  ]
  for shard in source_shards:
    utils.check_tablet_query_service(self, shard, serving=False,
                                     tablet_control_disabled=True)

  # dest shard -80, 80-: query service must be enabled
  # after MigrateServedTypes.
  dest_shards = [
      worker.shard_0_rdonly1,
      worker.shard_0_replica,
      worker.shard_0_master,
      worker.shard_1_rdonly1,
      worker.shard_1_replica,
      worker.shard_1_master
  ]
  for shard in dest_shards:
    utils.check_tablet_query_service(self, shard, serving=True,
                                     tablet_control_disabled=False)
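# Note on the check_srv_keyspace assertions used throughout these tests:
# each 'Partitions(<tablet type>): <ranges>' line lists the shard key ranges
# that currently serve that tablet type in the SrvKeyspace record. The string
# above therefore asserts that the destination shards (-80 and 80-) now serve
# master, rdonly and replica traffic after MigrateServedTypes.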
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', '--split_shard_count', '2', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', '-split_shard_count', '4', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['split_shard_count'], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_rdonly1.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica']) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, 
shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/80-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) self._check_stream_health_equals_binlog_player_vars(shard_2_master) self._check_stream_health_equals_binlog_player_vars(shard_3_master) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. 
logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl( ['RunHealthCheck', shard_3_rdonly1.tablet_alias, 'rdonly']) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br>' '<b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low') monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high') # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl( ['RunHealthCheck', shard_1_slave2.tablet_alias, 'replica']) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). 
utils.run_vtctl( ['RunHealthCheck', shard_3_master.tablet_alias, 'replica'], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. 
utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias ]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug( 'Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug('DELAY 1: %s max_lag=%d avg_lag=%d', monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug('DELAY 2: %s max_lag=%d avg_lag=%d', monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # delete 
the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
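# Illustrative sketch (an assumption added for clarity, not part of the
# original test): how the uint64 keyspace_id constants used above map onto
# the shard ranges. 0x9000000000000000 falls in '80-c0' and
# 0xD000000000000000 falls in 'c0-', which is why 'insert_low' is monitored
# on shard 2 and 'insert_high' on shard 3. The helper name is hypothetical.
def _shard_for_keyspace_id(keyspace_id, shard_ranges=('-80', '80-c0', 'c0-')):
  """Return the first range whose [start, end) interval covers keyspace_id."""
  kid = '%016x' % keyspace_id
  for r in shard_ranges:
    start, _, end = r.partition('-')
    # Pad the hex prefixes so lexicographic comparison matches numeric order.
    if ((not start or kid >= start.ljust(16, '0')) and
        (not end or kid < end.ljust(16, '0'))):
      return r
  return None

# Example: _shard_for_keyspace_id(0x9000000000000000) == '80-c0'
#          _shard_for_keyspace_id(0xD000000000000000) == 'c0-'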
def test_resharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', keyspace_id_type ]) shard_0_master.init_tablet('replica', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving somewhat normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, they won't be healthy) for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_3_master.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [ shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, 
shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'LegacySplitClone', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_throttler_service(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_throttler_service(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. 
logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all self.check_running_binlog_player(shard_2_master, 4000, 2000) self.check_running_binlog_player(shard_3_master, 4000, 2000) # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). 
utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. 
utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly,
shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
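# The test above walks through the full horizontal resharding flow on the
# 80- shard: CopySchemaShard to the new shards, a vtworker LegacySplitClone
# to copy the data, SplitDiff to verify it, then MigrateServedTypes for
# rdonly (per cell first, then everywhere), replica (including a -reverse
# round trip), and finally master, after which the binlog players stop and
# the source shard is torn down and deleted.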
def test_resharding(self): # create the keyspace with just one shard shard_master.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=0) shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=1) shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0', tablet_index=2) for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') # replica is not started, InitShardMaster should timeout shard_master.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) shard_rdonly1.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [shard_master, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work - expect fail # because replica tablet is not up _, stderr = utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True, expect_fail=True) self.assertIn('tablet test_nj-0000062345 ResetReplication failed', stderr) # start replica shard_replica.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) shard_replica.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True) utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica') utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # We must start vtgate after tablets are up, or else wait until 1min refresh # (that is the tablet_refresh_interval parameter for discovery gateway) # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start( cache_ttl='0', tablets=[shard_master, shard_replica, shard_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. 
sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type ]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias]) # create the split shards shard_0_master.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=0) shard_0_replica.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=1) shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2) shard_1_master.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=0) shard_1_replica.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=1) shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) for t in [shard_0_replica, shard_1_replica]: utils.wait_for_tablet_type(t.tablet_alias, 'replica') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.wait_for_tablet_type(t.tablet_alias, 'rdonly') sharded_tablets = [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ] for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.vtgate = None utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1, shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]) var = None # Wait for the endpoints, either local or remote. utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1, var=var) # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). 
# again, we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias]) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( [ '--cell', 'test_nj', '--command_display_interval', '10ms', '--use_v3_resharding_mode=false' ], auto_log=True) # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 3, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 1 (provokes an insert). shard_0_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=1', write=True) # Delete row 2 (provokes an insert). shard_1_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_1_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 (provokes a delete). self._insert_value(shard_1_master, 'resharding1', 4, 'msg4', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 1, 1, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 3) # Terminate worker daemon because it is no longer needed. 
utils.kill_sub_process(worker_proc, soft=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') self.check_destination_master(shard_0_master, ['test_keyspace/0']) self.check_destination_master(shard_1_master, ['test_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) if base_sharding.use_multi_split_diff: logging.debug('Running vtworker MultiSplitDiff for 0') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'MultiSplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) else: logging.debug('Running vtworker SplitDiff for -80') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80' ], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-' ], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_0_master, 2000, 2000) self.check_running_binlog_player(shard_1_master, 6000, 2000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. 
# we ask for 2 splits to only have one per shard sql = 'select id, msg from resharding1' timeout = 10.0 while True: try: s = utils.vtgate.split_query(sql, 'test_keyspace', 2) break except Exception: # pylint: disable=broad-except timeout = utils.wait_step( 'vtgate executes split_query properly', timeout) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # check the binlog players are gone now self.check_no_binlog_player(shard_0_master) self.check_no_binlog_player(shard_1_master) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ])
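# The test above covers the initial sharding case: splitting the unsharded
# shard 0 into -80 and 80-. Note the ordering it relies on: the early
# MigrateServedTypes attempt for 'master' is run with expect_fail=True
# because the migration of the master type is expected to be refused while
# rdonly and replica are still served by the source shard.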
def verify(self):
  self.assert_shard_data_equal(0, worker.shard_master,
                               worker.shard_0_tablets.replica)
  self.assert_shard_data_equal(1, worker.shard_master,
                               worker.shard_1_tablets.replica)

  # Verify effect of MigrateServedTypes. Dest shards are serving now.
  utils.check_srv_keyspace(
      'test_nj', self.KEYSPACE,
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n')

  # Check that query service is disabled (source shard) or enabled (dest).
  # The 'rdonly' tablet requires an explicit healthcheck first because
  # the following sequence of events is happening in this test:
  # - SplitDiff returns 'rdonly' as 'spare' tablet (NOT_SERVING)
  # - MigrateServedTypes runs and does not refresh the 'spare' tablet
  #   (still NOT_SERVING)
  #   Shard_TabletControl.DisableQueryService=true will be set in the topology
  # - explicit or periodic healthcheck runs:
  #   a) tablet seen as caught up, change type from 'spare' to 'rdonly'
  #      (change to SERVING)
  #   b) post-action callback agent.refreshTablet() reads the topology
  #      and finds out that DisableQueryService=true is set.
  #      (change to NOT_SERVING)
  #
  # We must run an explicit healthcheck or we can see one of the two states:
  # - NOT_SERVING, DisableQueryService=false, tablet type 'spare'
  #   (immediately after SplitDiff returned)
  # - SERVING, DisableQueryService=false, tablet type 'rdonly'
  #   (during healthcheck before post-action callback is called)
  utils.run_vtctl(['RunHealthCheck', worker.shard_rdonly1.tablet_alias],
                  auto_log=True)

  # source shard: query service must be disabled after MigrateServedTypes.
  utils.check_tablet_query_service(self, worker.shard_rdonly1,
                                   serving=False,
                                   tablet_control_disabled=True)
  utils.check_tablet_query_service(self, worker.shard_replica,
                                   serving=False,
                                   tablet_control_disabled=True)
  utils.check_tablet_query_service(self, worker.shard_master,
                                   serving=False,
                                   tablet_control_disabled=True)

  # dest shard -80: query service must be enabled after MigrateServedTypes.
  # Run explicit healthcheck because the 'rdonly' tablet may still be 'spare'.
  utils.run_vtctl(
      ['RunHealthCheck', worker.shard_0_rdonly1.tablet_alias],
      auto_log=True)
  utils.check_tablet_query_service(self, worker.shard_0_rdonly1,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_0_replica,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_0_master,
                                   serving=True,
                                   tablet_control_disabled=False)

  # dest shard 80-: query service must be enabled after MigrateServedTypes.
  # Run explicit healthcheck because the 'rdonly' tablet is still 'spare'.
  utils.run_vtctl(
      ['RunHealthCheck', worker.shard_1_rdonly1.tablet_alias],
      auto_log=True)
  utils.check_tablet_query_service(self, worker.shard_1_rdonly1,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_1_replica,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_1_master,
                                   serving=True,
                                   tablet_control_disabled=False)
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = ( base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES) # create databases so vttablet can start behaving somewhat normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, the slaves won't be # healthy) shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('NOT_SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') 
shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [ shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms'], auto_log=True) # Copy the data from the source to the destination shards. # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Test the correct handling of keyspace_id changes which happen after # the first clone. # Let row 2 go to shard 3 instead of shard 2. 
shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set' ' custom_ksid_col=0xD000000000000000 WHERE id=2', write=True) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Row 2 will be deleted from shard 2 and inserted to shard 3. self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 0, 1, 1) self._check_value(shard_2_master, 'resharding1', 2, 'msg2', 0xD000000000000000, should_be_here=False) self._check_value(shard_3_master, 'resharding1', 2, 'msg2', 0xD000000000000000) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Move row 2 back to shard 2 from shard 3 by changing the keyspace_id again. shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set' ' custom_ksid_col=0x9000000000000000 WHERE id=2', write=True) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Row 2 will be deleted from shard 3 and inserted to shard 2. self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 0, 1, 1) self._check_value(shard_2_master, 'resharding1', 2, 'msg2', 0x9000000000000000) self._check_value(shard_3_master, 'resharding1', 2, 'msg2', 0x9000000000000000, should_be_here=False) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 2 (provokes an insert). shard_2_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_3_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 and 5 (provokes a delete). self._insert_value(shard_3_master, 'resharding1', 4, 'msg4', 0xD000000000000000) self._insert_value(shard_3_master, 'resharding1', 5, 'msg5', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Change tablet, which was taken offline, back to rdonly. utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 1, 2, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 2) # Terminate worker daemon because it is no longer needed. 
utils.kill_sub_process(worker_proc, soft=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_throttler_service(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_throttler_service(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all self.check_running_binlog_player(shard_2_master, 4000, 2000) self.check_running_binlog_player(shard_3_master, 4000, 2000) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1) monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2) # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) 
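# (Assumed helper semantics: _check_lots_timeout(count, threshold, timeout, base=...) polls the destination shards until at least `threshold` percent of the `count` rows inserted at offset `base` are visible or `timeout` seconds elapse, and returns the percentage actually observed.)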
logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace( 'test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace( 'test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # 
source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias ]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug( 'Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug( 'DELAY 1: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_1.thread_name, monitor_thread_1.max_lag_ms, monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count) logging.debug( 'DELAY 2: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_2.thread_name, monitor_thread_2.max_lag_ms, monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', 
keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
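# The merge test below covers the inverse direction: source shards -40 and 40-80 are combined into a single destination shard -80 (shard 80- is left untouched), reusing the same SplitClone / filtered replication / MigrateServedTypes flow as the split test above.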
def test_merge_sharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'custom_ksid_col', '--sharding_column_type', base_sharding.keyspace_id_type, 'test_keyspace']) shard_0_master.init_tablet('replica', 'test_keyspace', '-40') shard_0_replica.init_tablet('replica', 'test_keyspace', '-40') shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40') shard_1_master.init_tablet('replica', 'test_keyspace', '40-80') shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80') shard_2_master.init_tablet('replica', 'test_keyspace', '80-') shard_2_replica.init_tablet('replica', 'test_keyspace', '80-') shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') # rebuild and check SrvKeyspace utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) # won't be serving, no replication state for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80', shard_1_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-', shard_2_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_replica]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_rdonly, shard_1_rdonly]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the merge shards shard_dest_master.init_tablet('replica', 'test_keyspace', '-80') shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80') shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') # start vttablet on the destination shard (no db created, # so they're all not serving) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80', shard_dest_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -40 40-80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # copy the schema utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/-80'], auto_log=True) # copy the data (will also start filtered replication), reset source # Run vtworker as daemon 
for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms', '--use_v3_resharding_mode=false'], auto_log=True) # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--offline=false', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 1 (provokes an insert). shard_dest_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=1', write=True) # Update row 2 (provokes an update). shard_dest_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-2' where id=2", write=True) # Insert row 0 (provokes a delete). self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0', 0x5000000000000000) workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Change tablets, which were taken offline, back to rdonly. utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 1, 1, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 2) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check binlog player variables self.check_destination_master(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_0_replica, horizontal=True) self.check_binlog_server_vars(shard_1_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 0 and 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shards') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 10) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. 
logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 30) self.check_binlog_player_vars(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_0_replica, horizontal=True, min_statements=1000, min_transactions=1000) self.check_binlog_server_vars(shard_1_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias]) logging.debug('Running vtworker SplitDiff on first half') utils.run_vtworker(['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '1', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) logging.debug('Running vtworker SplitDiff on second half') utils.run_vtworker(['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '2', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_dest_master, 3000, 1000) # check destination master query service is not running utils.check_tablet_query_service(self, shard_dest_master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', shard_dest_master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_dest_master.get_healthz() # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # now serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # now serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_master, False, True) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now 
self.check_no_binlog_player(shard_dest_master) # kill the original tablets in the original shards tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly]) for t in [shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) for t in [shard_0_master, shard_1_master]: utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias], auto_log=True) # delete the original shards utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True) utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # kill everything else tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly, shard_dest_master, shard_dest_replica, shard_dest_rdonly])
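# A second merge variant follows: the same -40/40-80 to -80 merge, but with the keyspace created around a 'custom_sharding_key' column and SplitClone run as a single synchronous vtworker invocation instead of the daemon-plus-Reset flow used above.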
def test_merge_sharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'custom_sharding_key', '--sharding_column_type', keyspace_id_type, 'test_keyspace' ]) shard_0_master.init_tablet('master', 'test_keyspace', '-40') shard_0_replica.init_tablet('replica', 'test_keyspace', '-40') shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40') shard_1_master.init_tablet('master', 'test_keyspace', '40-80') shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80') shard_2_master.init_tablet('master', 'test_keyspace', '80-') shard_2_replica.init_tablet('replica', 'test_keyspace', '80-') shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') # rebuild and check SrvKeyspace utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_sharding_key') # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) # masters will be serving for t in [shard_0_master, shard_1_master, shard_2_master]: t.wait_for_vttablet_state('SERVING') # slaves won't be, no replication state for t in [ shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly, shard_2_replica, shard_2_rdonly ]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-40', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/40-80', shard_1_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_2_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_replica]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_rdonly, shard_1_rdonly]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the merge shards shard_dest_master.init_tablet('master', 'test_keyspace', '-80') shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80') shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') # start vttablet on the destination shard (no db created, # so they're all not serving) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.start_vttablet(wait_for_state=None) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_dest_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -40 40-80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # copy the schema utils.run_vtctl([ 'CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/-80' ], auto_log=True) # copy the data (will also start filtered replication), reset source utils.run_vtworker([ '--cell', 'test_nj', 
'--command_display_interval', '10ms', 'SplitClone', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check binlog player variables self.check_destination_master( shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_0_replica, horizontal=True) self.check_binlog_server_vars(shard_1_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 0 and 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shards') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 10) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 30) self.check_binlog_player_vars( shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_0_replica, horizontal=True, min_statements=1000, min_transactions=1000) self.check_binlog_server_vars(shard_1_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias]) logging.debug('Running vtworker SplitDiff on first half') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '0', 'test_keyspace/-80' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) logging.debug('Running vtworker SplitDiff on second half') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '1', 'test_keyspace/-80' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_dest_master, 3000, 1000) # check destination master query service is not running utils.check_tablet_query_service(self, shard_dest_master, False, False) stream_health = utils.run_vtctl_json([ 'VtTabletStreamHealth', '-count', '1', shard_dest_master.tablet_alias ]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_dest_master.get_healthz() # now serve rdonly from the split shards 
utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # now serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # now serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') utils.check_tablet_query_service(self, shard_0_master, False, True) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_dest_master) # kill the original tablets in the original shards tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly ]) for t in [ shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) for t in [shard_0_master, shard_1_master]: utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias], auto_log=True) # delete the original shards utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True) utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_2_master, shard_2_replica, shard_2_rdonly, shard_dest_master, shard_dest_replica, shard_dest_rdonly ])
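# The test below starts from an unsharded keyspace (a single shard 0), adds the sharding key to the schema and backfills it, and only then splits horizontally into -80 and 80-, checking the Map Reduce split_query API before and after the keyspace becomes sharded.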
def test_resharding(self): # create the keyspace with just one shard shard_master.init_tablet('master', keyspace='test_keyspace', shard='0', tablet_index=0) shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=1) shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0', tablet_index=2) for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') shard_master.start_vttablet(wait_for_state=None) shard_replica.start_vttablet(wait_for_state=None) shard_rdonly1.start_vttablet(wait_for_state=None) shard_master.wait_for_vttablet_state('SERVING') for t in [shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True) utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica') utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # must start vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start( cache_ttl='0', tablets=[shard_master, shard_replica, shard_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. 
sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias]) # create the split shards shard_0_master.init_tablet('master', keyspace='test_keyspace', shard='-80', tablet_index=0) shard_0_replica.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=1) shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2) shard_1_master.init_tablet('master', keyspace='test_keyspace', shard='80-', tablet_index=0) shard_1_replica.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=1) shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2) for t in [ shard_0_master, shard_0_replica, shard_1_master, shard_1_replica ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_rdonly1, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_1_master]: t.wait_for_vttablet_state('SERVING') for t in [ shard_0_replica, shard_0_rdonly1, shard_1_replica, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) for t in [shard_0_replica, shard_1_replica]: utils.wait_for_tablet_type(t.tablet_alias, 'replica') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.wait_for_tablet_type(t.tablet_alias, 'rdonly') sharded_tablets = [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ] for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1, shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). # again, we have 3 values in the database, asking for 4 splits will get us # a single query. 
sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias]) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') self.check_destination_master(shard_0_master, ['test_keyspace/0']) self.check_destination_master(shard_1_master, ['test_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. 
logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data logging.debug('Running vtworker SplitDiff for -80') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80' ], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-' ], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_0_master, 2000, 2000) self.check_running_binlog_player(shard_1_master, 6000, 2000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. 
# we ask for 2 splits to only have one per shard sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 2) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now self.check_no_binlog_player(shard_0_master) self.check_no_binlog_player(shard_1_master) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ])
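# A second initial-sharding variant follows: the same 0 to -80/80- split, but with SetKeyspaceShardingInfo applied up front, SplitClone run with the legacy '-populate_blp_checkpoint' strategy, and the old tablets retired via ScrapTablet before deletion.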
def test_resharding(self): # create the keyspace with just one shard utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_master.init_tablet('master', 'test_keyspace', '0') shard_replica.init_tablet('replica', 'test_keyspace', '0') shard_rdonly1.init_tablet('rdonly', 'test_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # create databases so vttablet can start behaving normally for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) # wait for the tablets shard_master.wait_for_vttablet_state('SERVING') shard_replica.wait_for_vttablet_state('SERVING') shard_rdonly1.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl( ['InitShardMaster', 'test_keyspace/0', shard_master.tablet_alias], auto_log=True) # create the tables and add startup values self._create_schema() self._insert_startup_values() # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # create the split shards shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_rdonly1.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_replica.init_tablet('replica', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') # start vttablet on the split shards (no db created, # so they're all not serving) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -\n' + 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/0' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug("Waiting for binlog players to start on new masters...") shard_0_master.wait_for_binlog_player_count(1) shard_1_master.wait_for_binlog_player_count(1) # testing filtered replication: insert a bunch of data on shard 1, # 
check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) # use vtworker to compare the data logging.debug("Running vtworker SplitDiff for -80") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly1.tablet_alias, 'rdonly'], auto_log=True) logging.debug("Running vtworker SplitDiff for 80-") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_0_master.wait_for_binlog_player_count(0) 
shard_1_master.wait_for_binlog_player_count(0) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # scrap the original tablets in the original shard for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ])
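# The vertical split test below moves the tables matching /moving/ plus view1 from source_keyspace into destination_keyspace/0 with VerticalSplitClone, then walks MigrateServedFrom through rdonly, replica and master, checking the blacklisted tables on the source tablets at each step.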
def test_vertical_split(self): utils.run_vtctl([ 'CopySchemaShard', '--tables', '/moving/,view1', source_rdonly1.tablet_alias, 'destination_keyspace/0' ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', '--use_v3_resharding_mode=false', 'VerticalSplitClone', '--tables', '/moving/,view1', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_tablets', '1', 'destination_keyspace/0' ], auto_log=True) # test Cancel first utils.run_vtctl(['CancelResharding', 'destination_keyspace/0'], auto_log=True) self.check_no_binlog_player(destination_master) # master should be in serving state after cancel utils.check_tablet_query_service(self, destination_master, True, False) # redo VerticalSplitClone utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', '--use_v3_resharding_mode=false', 'VerticalSplitClone', '--tables', '/moving/,view1', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_tablets', '1', 'destination_keyspace/0' ], auto_log=True) # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', self.moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', self.moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', self.moving1_first, 100) if base_sharding.use_rbr: self._check_values(destination_master, 'vt_destination_keyspace', 'moving3_no_pk', self.moving3_no_pk_first, 100) # Verify vreplication table entries result = destination_master.mquery('_vt', 'select * from vreplication') self.assertEqual(len(result), 1) self.assertEqual(result[0][1], 'SplitClone') self.assertEqual( result[0][2], 'keyspace:"source_keyspace" shard:"0" tables:"/moving/" tables:"view1" ' ) # check the binlog player is running and exporting vars self.check_destination_master(destination_master, ['source_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(source_replica, horizontal=False) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) _ = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) self.check_binlog_player_vars(destination_master, ['source_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(source_replica, horizontal=False, min_statements=100, min_transactions=100) # use vtworker to compare the data logging.debug('Running vtworker VerticalSplitDiff') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'VerticalSplitDiff', '--min_healthy_rdonly_tablets', '1', 'destination_keyspace/0' ], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablet, make sure we have it all self.check_running_binlog_player(destination_master, 700, 300, extra_text='moving') # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. 
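# (Expectation behind the assertion below: while the binlog player is applying filtered replication, the destination master reports TabletStateName as NOT_SERVING even though the tablet itself is healthy.)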
    destination_master_vars = utils.get_vars(destination_master.port)
    self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING')

    # check we can't migrate the master just yet
    utils.run_vtctl(
        ['MigrateServedFrom', 'destination_keyspace/0', 'master'],
        expect_fail=True)

    # migrate rdonly only in test_ny cell, make sure nothing is migrated
    # in test_nj
    utils.run_vtctl([
        'MigrateServedFrom', '--cells=test_ny', 'destination_keyspace/0',
        'rdonly'
    ], auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                             'ServedFrom(rdonly): source_keyspace\n'
                             'ServedFrom(replica): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, None)
    self._check_blacklisted_tables(source_rdonly1, None)
    self._check_blacklisted_tables(source_rdonly2, None)

    # migrate test_nj only, using command line manual fix command,
    # and restore it back.
    keyspace_json = utils.run_vtctl_json(
        ['GetKeyspace', 'destination_keyspace'])
    found = False
    for ksf in keyspace_json['served_froms']:
      if ksf['tablet_type'] == topodata_pb2.RDONLY:
        found = True
        self.assertEqual(sorted(ksf['cells']), ['test_ca', 'test_nj'])
    self.assertTrue(found)
    utils.run_vtctl([
        'SetKeyspaceServedFrom', '-source=source_keyspace', '-remove',
        '-cells=test_nj,test_ca', 'destination_keyspace', 'rdonly'
    ], auto_log=True)
    keyspace_json = utils.run_vtctl_json(
        ['GetKeyspace', 'destination_keyspace'])
    found = False
    for ksf in keyspace_json['served_froms']:
      if ksf['tablet_type'] == topodata_pb2.RDONLY:
        found = True
    self.assertFalse(found)
    utils.run_vtctl([
        'SetKeyspaceServedFrom', '-source=source_keyspace',
        'destination_keyspace', 'rdonly'
    ], auto_log=True)
    keyspace_json = utils.run_vtctl_json(
        ['GetKeyspace', 'destination_keyspace'])
    found = False
    for ksf in keyspace_json['served_froms']:
      if ksf['tablet_type'] == topodata_pb2.RDONLY:
        found = True
        self.assertTrue('cells' not in ksf or not ksf['cells'])
    self.assertTrue(found)

    # now serve rdonly from the destination shards
    utils.run_vtctl(
        ['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'],
        auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                             'ServedFrom(replica): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, None)
    self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])
    self._check_client_conn_redirection('destination_keyspace',
                                        ['master', 'replica'],
                                        ['moving1', 'moving2'])

    # then serve replica from the destination shards
    utils.run_vtctl(
        ['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
        auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])
    self._check_client_conn_redirection('destination_keyspace', ['master'],
                                        ['moving1', 'moving2'])

    # move replica back and forth
    utils.run_vtctl([
        'MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica'
    ], auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                             'ServedFrom(replica): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, None)
    self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])
    utils.run_vtctl(
        ['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
        auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])
    self._check_client_conn_redirection('destination_keyspace', ['master'],
                                        ['moving1', 'moving2'])

    # Cancel should fail now
    utils.run_vtctl(['CancelResharding', 'destination_keyspace/0'],
                    auto_log=True, expect_fail=True)

    # then serve master from the destination shards
    utils.run_vtctl(
        ['MigrateServedFrom', 'destination_keyspace/0', 'master'],
        auto_log=True)
    self._check_srv_keyspace('')
    self._check_blacklisted_tables(source_master, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_replica, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])

    # check the binlog player is gone now
    self.check_no_binlog_player(destination_master)

    # check the stats are correct
    self._check_stats()

    # now remove the tables on the source shard. The blacklisted tables
    # in the source shard won't match any table, make sure that works.
    utils.run_vtctl(
        ['ApplySchema', '-sql=drop view view1', 'source_keyspace'],
        auto_log=True)
    for t in ['moving1', 'moving2']:
      utils.run_vtctl(
          ['ApplySchema', '-sql=drop table %s' % (t), 'source_keyspace'],
          auto_log=True)
    for t in [source_master, source_replica, source_rdonly1, source_rdonly2]:
      utils.run_vtctl(['ReloadSchema', t.tablet_alias])
    qr = source_master.execute('select count(1) from staying1')
    self.assertEqual(len(qr['rows']), 1,
                     'cannot read staying1: got %s' % str(qr))

    # test SetShardTabletControl
    self._verify_vtctl_set_shard_tablet_control()
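  # --- Illustrative sketch (not part of the original test) ---
  # One possible shape for the SetShardTabletControl verification invoked
  # above: remove the blacklist entry for each tablet type on the source shard
  # and confirm via GetShard that no tablet controls remain. The exact flags
  # and the 'tablet_controls' JSON field are assumptions, not taken from the
  # original helper.
  def _verify_vtctl_set_shard_tablet_control_sketch(self):
    for tablet_type in ['rdonly', 'replica', 'master']:
      utils.run_vtctl(['SetShardTabletControl', '--remove',
                       'source_keyspace/0', tablet_type], auto_log=True)
    shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
    self.assertNotIn('tablet_controls', shard_json)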