def setup_sharded_keyspace():
    """Create the sharded keyspace, start its four tablets and init masters.

    Every tablet is registered with type 'replica'; InitShardMaster is then
    run once per shard. The function ends by rebuilding the keyspace graph
    and checking the partitions published for cell 'test_nj'.
    """
    utils.run_vtctl(['CreateKeyspace', SHARDED_KEYSPACE])
    utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', SHARDED_KEYSPACE,
                     'keyspace_id', 'uint64'])

    # (tablet, shard, index inside the shard), in registration order.
    layout = (
        (shard_0_master, '-80', 0),
        (shard_0_replica, '-80', 1),
        (shard_1_master, '80-', 0),
        (shard_1_replica, '80-', 1),
    )
    for member, shard_name, index in layout:
        member.init_tablet('replica', keyspace=SHARDED_KEYSPACE,
                           shard=shard_name, tablet_index=index)

    members = [entry[0] for entry in layout]
    for member in members:
        member.create_db('vt_test_keyspace_sharded')
        member.mquery(shard_0_master.dbname, create_vt_insert_test)
        member.start_vttablet(wait_for_state=None)

    for member in members:
        member.wait_for_vttablet_state('NOT_SERVING')

    for shard_name, master in (('-80', shard_0_master),
                               ('80-', shard_1_master)):
        utils.run_vtctl(['InitShardMaster', '-force',
                         '%s/%s' % (SHARDED_KEYSPACE, shard_name),
                         master.tablet_alias], auto_log=True)

    for replica in (shard_0_replica, shard_1_replica):
        utils.wait_for_tablet_type(replica.tablet_alias, 'replica')

    for member in members:
        member.wait_for_vttablet_state('SERVING')

    # rebuild to be sure we have the latest data
    utils.run_vtctl(['RebuildKeyspaceGraph', SHARDED_KEYSPACE], auto_log=True)
    expected_partitions = ('Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n')
    utils.check_srv_keyspace('test_nj', SHARDED_KEYSPACE, expected_partitions)
def test_service_switch(self):
    """Run the disabled and enabled checks, then restore the replica type."""
    for step in (self._test_service_disabled, self._test_service_enabled):
        step()

    # The above tests leaves the service in disabled state, hence enabling it.
    alias = replica_tablet.tablet_alias
    utils.run_vtctl(['ChangeSlaveType', alias, 'replica'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica', 30)
def _test_service_enabled(self):
    """Check that the replica serves an update stream containing a DML event.

    Switches the replica tablet to type 'replica', starts a background
    writer thread (self.perform_writes with argument 100) and reads the
    update stream from the position captured at the start until the first
    DML event shows up. Any exception raised while streaming fails the test.
    """
    start_position = _get_repl_current_position()
    logging.debug('_test_service_enabled starting @ %s', start_position)
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    logging.debug('sleeping a bit for the replica action to complete')
    utils.wait_for_tablet_type(replica_tablet.tablet_alias,
                               tablet.Tablet.tablet_type_value['REPLICA'], 30)

    # Daemon thread: it must not keep the test process alive on failure.
    thd = threading.Thread(target=self.perform_writes, name='write_thd',
                           args=(100,))
    thd.daemon = True
    thd.start()

    replica_conn = self._get_replica_stream_conn()
    try:
        for stream_event in replica_conn.stream_update(start_position):
            if stream_event.category == update_stream.StreamEvent.DML:
                logging.debug('Test Service Enabled: Pass')
                break
    except Exception as e:
        # format_exc() returns the traceback text; print_exc() returns None,
        # which made the failure message end in "Traceback None".
        self.fail(
            'Exception in getting stream from replica: %s\n Traceback %s' %
            (str(e), traceback.format_exc()))
def _test_service_enabled(self):
    """Check that the replica update stream can be started and read.

    Switches the replica tablet to type 'replica', starts a background
    writer thread (self.perform_writes with argument 100), then starts the
    stream at the position captured up front and reads up to ten events.
    Any exception raised while streaming fails the test.
    """
    start_position = _get_repl_current_position()
    logging.debug('_test_service_enabled starting @ %s', start_position)
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    logging.debug('sleeping a bit for the replica action to complete')
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica', 30)

    # Daemon thread: it must not keep the test process alive on failure.
    thd = threading.Thread(target=self.perform_writes, name='write_thd',
                           args=(100,))
    thd.daemon = True
    thd.start()

    replica_conn = self._get_replica_stream_conn()
    replica_conn.dial()
    try:
        # The value returned by stream_start() was never used.
        replica_conn.stream_start(start_position)
        for _ in range(10):
            data = replica_conn.stream_next()
            # NOTE(review): the early exit only triggers at verbosity 2;
            # kept as in the original - confirm this is intended.
            if data['Category'] == 'DML' and utils.options.verbose == 2:
                logging.debug('Test Service Enabled: Pass')
                break
    except Exception as e:
        # format_exc() returns the traceback text; print_exc() returns None,
        # which made the failure message end in "Traceback None".
        self.fail(
            'Exception in getting stream from replica: %s\n Traceback %s' %
            (str(e), traceback.format_exc()))
def test_service_switch(self):
    """Exercise the service switch: disabled check, then enabled check."""
    for step in (self._test_service_disabled, self._test_service_enabled):
        step()

    # The above tests leaves the service in disabled state, hence enabling it.
    change_type_cmd = ['ChangeSlaveType', replica_tablet.tablet_alias,
                       'replica']
    utils.run_vtctl(change_type_cmd)
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica', 30)
def _test_service_disabled(self):
    """Disabled check that the update stream is refused on a spare tablet.

    The method returns immediately, so everything after the `return` below
    is unreachable and is kept only as a record of the intended check.
    """
    # it looks like update stream would be re-enabled automatically
    # because of vttablet health check
    return
    # --- unreachable from here on (see the early return above) ---
    start_position = _get_repl_current_position()
    logging.debug('_test_service_disabled starting @ %s', start_position)
    self._exec_vt_txn(self._populate_vt_insert_test)
    self._exec_vt_txn(['delete from vt_insert_test'])
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')
    logging.debug('dialing replica update stream service')
    replica_conn = self._get_replica_stream_conn()
    try:
        # Only the first event (or the error raised instead) matters.
        for _ in replica_conn.stream_update(start_position):
            break
    except dbexceptions.DatabaseError as e:
        self.assertIn('update stream service is not enabled', str(e))
    replica_conn.close()

    # The tablet's exported vars must report the service as disabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Disabled':
        self.fail(
            "Update stream service should be 'Disabled' but is '%s'" %
            v['UpdateStreamState'])
def test_service_switch(self):
    """Check the update stream state follows the tablet type.

    Moves the replica to 'spare' and verifies UpdateStreamState is
    'Disabled' and that a new stream request is rejected, then moves it
    back to 'replica' and verifies UpdateStreamState is 'Enabled'.
    """
    # make the replica spare
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "spare"])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, "spare")

    # Check UpdateStreamState is disabled.
    v = utils.get_vars(replica_tablet.port)
    if v["UpdateStreamState"] != "Disabled":
        self.fail("Update stream service should be 'Disabled' but is '%s'" %
                  v["UpdateStreamState"])

    # Make sure we can't start a new request.
    start_position = _get_repl_current_position()
    replica_conn = self._get_replica_stream_conn()
    try:
        for event in replica_conn.stream_update(
                "test_keyspace", "0", topodata_pb2.REPLICA,
                position=start_position):
            # TestCase has no assertFail(); fail() is the method that
            # reports an unconditional failure.
            self.fail("got event: %s" % str(event))
        self.fail("stream_update terminated with no exception")
    except dbexceptions.DatabaseError as e:
        self.assertIn("operation not allowed in state NOT_SERVING", str(e))

    # Go back to replica.
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias,
                     "replica"])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, "replica")

    # Check UpdateStreamState is enabled.
    v = utils.get_vars(replica_tablet.port)
    if v["UpdateStreamState"] != "Enabled":
        self.fail("Update stream service should be 'Enabled' but is '%s'" %
                  v["UpdateStreamState"])
def test_service_switch(self):
    """Exercise the service switch: disabled check, then enabled check."""
    for step in (self._test_service_disabled, self._test_service_enabled):
        step()

    # The above tests leaves the service in disabled state, hence enabling it.
    change_type_cmd = ["ChangeSlaveType", replica_tablet.tablet_alias,
                       "replica"]
    utils.run_vtctl(change_type_cmd)
    replica_type = tablet.Tablet.tablet_type_value["REPLICA"]
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, replica_type, 30)
def _terminated_restore(t):
    """Run RestoreFromBackup on t, stop reading at 'shutdown mysqld'.

    Each event value is logged. Once an event mentions 'shutdown mysqld'
    the event stream is abandoned, and the function then waits (up to 30s)
    for the tablet to be of type 'replica'.
    """
    restore_cmd = ['RestoreFromBackup', t.tablet_alias]
    for event in utils.vtctld_connection.execute_vtctl_command(restore_cmd):
        logging.info('%s', event.value)
        if 'shutdown mysqld' in event.value:
            break

    logging.info('waiting for restore to finish')
    utils.wait_for_tablet_type(t.tablet_alias, 'replica', timeout=30)
def test_terminated_restore(self):
    """Interrupt a restore part-way, then check that a retry completes.

    The 'restore_in_progress' marker file in the tablet directory must
    exist after the interrupted attempt and be gone after the retry.
    """
    # The log line at which we stop reading depends on the backup engine.
    stop_restore_msg = ('Restore: Preparing' if use_xtrabackup
                        else 'Copying file 10')

    def _terminated_restore(t):
        """Start a restore on t and stop reading events at the stop line."""
        restore_cmd = ['RestoreFromBackup', t.tablet_alias]
        for event in utils.vtctld_connection.execute_vtctl_command(
                restore_cmd):
            logging.info('%s', event.value)
            if stop_restore_msg in event.value:
                break

    utils.Vtctld().start()

    # insert data on master, wait for slave to get it
    tablet_master.mquery('vt_test_keyspace', self._create_vt_insert_test)
    self._insert_data(tablet_master, 1)
    self._check_data(tablet_replica1, 1, 'replica1 tablet getting data')

    # backup the slave
    utils.run_vtctl(['Backup', tablet_replica1.tablet_alias], auto_log=True)

    # insert more data on the master
    self._insert_data(tablet_master, 2)

    # reparent to replica1
    reparent_cmd = ['PlannedReparentShard',
                    '-keyspace_shard', 'test_keyspace/0',
                    '-new_master', tablet_replica1.tablet_alias]
    utils.run_vtctl(reparent_cmd)

    # insert more data on new master
    self._insert_data(tablet_replica1, 3)

    # force the old master to restore at the latest backup, and terminate
    # the restore when it is in the middle of copying the files
    _terminated_restore(tablet_master)

    # check that restore_file has been created but not deleted
    restore_file = os.path.join(tablet_master.tablet_dir,
                                'restore_in_progress')
    self.assertTrue(os.path.isfile(restore_file))

    # now retry the restore
    retry_cmd = ['RestoreFromBackup', tablet_master.tablet_alias]
    for event in utils.vtctld_connection.execute_vtctl_command(retry_cmd):
        logging.info('%s', event.value)
    logging.info('waiting for restore to finish')
    utils.wait_for_tablet_type(tablet_master.tablet_alias, 'replica',
                               timeout=30)

    # check that restore_file doesn't exist any more
    self.assertFalse(os.path.isfile(restore_file))

    # wait for it to catch up.
    self._check_data(tablet_master, 3,
                     'former master catches up after restore')
def test_service_switch(self):
    """Exercise the service switch: disabled check, then enabled check."""
    for step in (self._test_service_disabled, self._test_service_enabled):
        step()

    # The above tests leaves the service in disabled state, hence enabling it.
    change_type_cmd = ['ChangeSlaveType', replica_tablet.tablet_alias,
                       'replica']
    utils.run_vtctl(change_type_cmd)
    utils.wait_for_tablet_type(replica_tablet.tablet_alias,
                               topodata_pb2.REPLICA, 30)
def test_health_check_drained_state_does_not_shutdown_query_service(self):
    """Check a 'drained' rdonly tablet keeps serving and stays healthy.

    Similar to test_health_check, except that the second tablet is an
    'rdonly' (not a 'replica') and is switched to 'drained'; the query
    service is expected to keep running.
    """
    master, rdonly = tablet_62344, tablet_62044

    # Setup master and rdonly tablets.
    master.init_tablet('replica', 'test_keyspace', '0')
    for member in (master, rdonly):
        member.create_db('vt_test_keyspace')

    # Note we only have a master and a rdonly. So we can't enable
    # semi-sync in this case, as the rdonly slaves don't semi-sync ack.
    master.start_vttablet(wait_for_state=None, enable_semi_sync=False)
    rdonly.start_vttablet(wait_for_state=None,
                          init_tablet_type='rdonly',
                          init_keyspace='test_keyspace',
                          init_shard='0',
                          enable_semi_sync=False)

    master.wait_for_vttablet_state('NOT_SERVING')
    rdonly.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(rdonly, False)

    # Enable replication.
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     master.tablet_alias])

    # Trigger healthcheck to save time waiting for the next interval.
    utils.run_vtctl(['RunHealthCheck', rdonly.tablet_alias])
    rdonly.wait_for_vttablet_state('SERVING')
    self.check_healthz(rdonly, True)

    # Change from rdonly to drained and stop replication. (These
    # actions are similar to the SplitClone vtworker command
    # implementation.) The tablet will stay healthy, and the
    # query service is still running.
    utils.run_vtctl(['ChangeSlaveType', rdonly.tablet_alias, 'drained'])
    utils.run_vtctl(['StopSlave', rdonly.tablet_alias])
    # Trigger healthcheck explicitly to avoid waiting for the next interval.
    utils.run_vtctl(['RunHealthCheck', rdonly.tablet_alias])
    utils.wait_for_tablet_type(rdonly.tablet_alias, 'drained')
    self.check_healthz(rdonly, True)
    # Query service is still running.
    rdonly.wait_for_vttablet_state('SERVING')

    # Restart replication. Tablet will become healthy again.
    utils.run_vtctl(['ChangeSlaveType', rdonly.tablet_alias, 'rdonly'])
    utils.run_vtctl(['StartSlave', rdonly.tablet_alias])
    utils.run_vtctl(['RunHealthCheck', rdonly.tablet_alias])
    self.check_healthz(rdonly, True)

    # kill the tablets
    tablet.kill_tablets([master, rdonly])
def setup_sharded_keyspace():
    """Create the sharded keyspace, start its four tablets and init masters.

    Tablets are registered as 'master' or 'replica' up front. Masters are
    expected to reach SERVING and replicas NOT_SERVING before
    InitShardMaster is run for each shard. The function ends by rebuilding
    the keyspace graph and checking the partitions for cell 'test_nj'.
    """
    utils.run_vtctl(['CreateKeyspace', SHARDED_KEYSPACE])
    utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', SHARDED_KEYSPACE,
                     'keyspace_id', 'uint64'])

    # (tablet, initial type, shard, index inside the shard).
    layout = (
        (shard_0_master, 'master', '-80', 0),
        (shard_0_replica, 'replica', '-80', 1),
        (shard_1_master, 'master', '80-', 0),
        (shard_1_replica, 'replica', '80-', 1),
    )
    for member, role, shard_name, index in layout:
        member.init_tablet(role, keyspace=SHARDED_KEYSPACE,
                           shard=shard_name, tablet_index=index)

    utils.run_vtctl(['RebuildKeyspaceGraph', SHARDED_KEYSPACE], auto_log=True)

    members = [entry[0] for entry in layout]
    masters = (shard_0_master, shard_1_master)
    replicas = (shard_0_replica, shard_1_replica)

    for member in members:
        member.create_db('vt_test_keyspace_sharded')
        member.mquery(shard_0_master.dbname, create_vt_insert_test)
        member.start_vttablet(wait_for_state=None)

    for master in masters:
        master.wait_for_vttablet_state('SERVING')
    for replica in replicas:
        replica.wait_for_vttablet_state('NOT_SERVING')

    for shard_name, master in zip(('-80', '80-'), masters):
        utils.run_vtctl(['InitShardMaster', '-force',
                         '%s/%s' % (SHARDED_KEYSPACE, shard_name),
                         master.tablet_alias], auto_log=True)

    for replica in replicas:
        utils.wait_for_tablet_type(replica.tablet_alias, 'replica')
    for member in members:
        member.wait_for_vttablet_state('SERVING')

    utils.run_vtctl(['RebuildKeyspaceGraph', SHARDED_KEYSPACE], auto_log=True)

    expected_partitions = ('Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n')
    utils.check_srv_keyspace('test_nj', SHARDED_KEYSPACE, expected_partitions)
def _test_service_enabled(self):
    """Check streaming works on a replica and is cut when it goes spare.

    First part: with the tablet as 'replica' and a writer thread running,
    read the stream until a DML event arrives, then check the exported
    UpdateStreamState / UpdateStreamEvents vars.
    Second part: open a new stream, switch the tablet to 'spare' on the
    first event and expect the stream to end with the "service has been
    disabled" DatabaseError.
    """
    start_position = _get_repl_current_position()
    logging.debug("_test_service_enabled starting @ %s", start_position)
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias,
                     "replica"])
    logging.debug("sleeping a bit for the replica action to complete")
    utils.wait_for_tablet_type(replica_tablet.tablet_alias,
                               topodata_pb2.REPLICA, 30)

    writer = threading.Thread(target=self.perform_writes, name="write_thd",
                              args=(100,))
    writer.daemon = True
    writer.start()

    replica_conn = self._get_replica_stream_conn()
    try:
        for stream_event in replica_conn.stream_update(start_position):
            if stream_event.category != update_stream.StreamEvent.DML:
                continue
            logging.debug("Test Service Enabled: Pass")
            break
    except Exception as e:
        self.fail("Exception in getting stream from replica: %s\n"
                  " Traceback %s" % (str(e), traceback.format_exc()))
    writer.join(timeout=30)
    replica_conn.close()

    v = utils.get_vars(replica_tablet.port)
    if v["UpdateStreamState"] != "Enabled":
        self.fail("Update stream service should be 'Enabled' but is '%s'" %
                  v["UpdateStreamState"])
    for event_kind in ("SE_DML", "SE_POS"):
        self.assertIn(event_kind, v["UpdateStreamEvents"])

    logging.debug("Testing enable -> disable switch starting @ %s",
                  start_position)
    replica_conn = self._get_replica_stream_conn()
    switched_to_spare = False
    txn_count = 0
    try:
        for stream_event in replica_conn.stream_update(start_position):
            if not switched_to_spare:
                # First event: pull the tablet out of serving mid-stream.
                utils.run_vtctl(["ChangeSlaveType",
                                 replica_tablet.tablet_alias, "spare"])
                utils.wait_for_tablet_type(replica_tablet.tablet_alias,
                                           topodata_pb2.SPARE, 30)
                switched_to_spare = True
            elif stream_event.category == update_stream.StreamEvent.POS:
                txn_count += 1
        # FIXME(alainjobart) gasp, the test fails but we don't assert?
        logging.debug("Test Service Switch: FAIL")
        replica_conn.close()
        return
    except dbexceptions.DatabaseError as e:
        expected_error = ("Fatal Service Error: Disconnecting because the "
                          "Update Stream service has been disabled")
        self.assertEqual(expected_error, str(e))
    except Exception as e:
        logging.error("Exception: %s", str(e))
        logging.error("Traceback: %s", traceback.format_exc())
        self.fail("Update stream returned error '%s'" % str(e))
    logging.debug("Streamed %d transactions before exiting", txn_count)
    replica_conn.close()
def setUpModule():
    """Start topo server, mysql, vtctld, both tablets and vtgate.

    Both tablets are registered as 'replica'; InitShardMaster is then run
    for master_tablet. On any failure tearDownModule() is called before the
    error is re-raised.
    """
    try:
        environment.topo_server().setup()

        mysql_procs = [member.init_mysql()
                       for member in (master_tablet, replica_tablet)]
        utils.wait_procs(mysql_procs)

        # start a vtctld so the vtctl insert commands are just RPCs, not forks.
        utils.Vtctld().start()

        # Start up a master mysql and vttablet
        logging.debug('Setting up tablets')
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
        for index, member in enumerate((master_tablet, replica_tablet)):
            member.init_tablet('replica', 'test_keyspace', '0',
                               tablet_index=index)
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        master_tablet.create_db('vt_test_keyspace')
        replica_tablet.create_db('vt_test_keyspace')

        master_tablet.start_vttablet(wait_for_state=None)
        replica_tablet.start_vttablet(wait_for_state=None)
        master_tablet.wait_for_vttablet_state('NOT_SERVING')
        replica_tablet.wait_for_vttablet_state('NOT_SERVING')

        init_master_cmd = ['InitShardMaster', '-force', 'test_keyspace/0',
                           master_tablet.tablet_alias]
        utils.run_vtctl(init_master_cmd, auto_log=True)

        utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')
        master_tablet.wait_for_vttablet_state('SERVING')
        replica_tablet.wait_for_vttablet_state('SERVING')

        # Create the test tables on the master, then reload schema everywhere.
        for create_statement in (_create_vt_a, _create_vt_b):
            master_tablet.mquery('vt_test_keyspace', create_statement)

        utils.run_vtctl(['ReloadSchema', master_tablet.tablet_alias])
        utils.run_vtctl(['ReloadSchema', replica_tablet.tablet_alias])
        utils.run_vtctl(['RebuildVSchemaGraph'])

        utils.VtGate().start(tablets=[master_tablet, replica_tablet])
        for endpoint in ('test_keyspace.0.master', 'test_keyspace.0.replica'):
            utils.vtgate.wait_for_endpoints(endpoint, 1)
    except:
        # Deliberately catches everything: clean up, then re-raise unchanged.
        tearDownModule()
        raise
def test_service_switch(self):
    """Check the update stream state follows the tablet type.

    Moves the replica to 'spare' and verifies UpdateStreamState is
    'Disabled' and that new stream requests are rejected both by vttablet
    and through vtgate, then moves it back to 'replica' and verifies
    UpdateStreamState is 'Enabled'.
    """
    # make the replica spare
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')

    # Check UpdateStreamState is disabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Disabled':
        self.fail(
            "Update stream service should be 'Disabled' but is '%s'" %
            v['UpdateStreamState'])

    start_position = _get_repl_current_position()

    # Make sure we can't start a new request to vttablet directly.
    _, stderr = utils.run_vtctl([
        'VtTabletUpdateStream', '-position', start_position,
        replica_tablet.tablet_alias
    ], expect_fail=True)
    self.assertIn('operation not allowed in state NOT_SERVING', stderr)

    # Make sure we can't start a new request through vtgate.
    replica_conn = self._get_vtgate_stream_conn()
    try:
        for event, resume_timestamp in replica_conn.update_stream(
                'test_keyspace', topodata_pb2.REPLICA,
                event=query_pb2.EventToken(shard='0',
                                           position=start_position),
                shard='0'):
            # TestCase has no assertFail(); fail() is the method that
            # reports an unconditional failure.
            self.fail('got event(%d): %s' % (resume_timestamp, str(event)))
        self.fail('update_stream terminated with no exception')
    except dbexceptions.DatabaseError as e:
        self.assertIn(vtgate_gateway_flavor().no_tablet_found_message(),
                      str(e))

    # Go back to replica.
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')

    # Check UpdateStreamState is enabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Enabled':
        self.fail("Update stream service should be 'Enabled' but is '%s'" %
                  v['UpdateStreamState'])
def _test_service_disabled(self):
    """Check that a spare tablet refuses update stream requests.

    Runs an insert and a delete transaction, switches the replica to
    'spare' and tries to stream from the position captured at the start.
    If streaming raises, the error must mention that the update stream
    service is not enabled.
    """
    start_position = _get_repl_current_position()
    logging.debug("_test_service_disabled starting @ %s", start_position)
    self._exec_vt_txn(self._populate_vt_insert_test)
    self._exec_vt_txn(["delete from vt_insert_test"])
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "spare"])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias,
                               tablet.Tablet.tablet_type_value["SPARE"])
    logging.debug("dialing replica update stream service")
    replica_conn = self._get_replica_stream_conn()
    try:
        # Only the first event (or the error raised instead) matters.
        for _ in replica_conn.stream_update(start_position):
            break
    except Exception as e:
        # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
        logging.debug(str(e))
        self.assertIn("update stream service is not enabled", str(e))
def _test_service_disabled(self):
    """Check that a spare tablet refuses update stream requests.

    Runs an insert and a delete transaction, switches the replica to
    'spare' and tries to stream from the position captured at the start.
    If streaming raises, the error must mention that the update stream
    service is not enabled.
    """
    start_position = _get_repl_current_position()
    logging.debug('_test_service_disabled starting @ %s', start_position)
    self._exec_vt_txn(self._populate_vt_insert_test)
    self._exec_vt_txn(['delete from vt_insert_test'])
    utils.run_vtctl(['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')
    logging.debug('dialing replica update stream service')
    replica_conn = self._get_replica_stream_conn()
    try:
        # Only the first event (or the error raised instead) matters.
        for _ in replica_conn.stream_update(start_position):
            break
    except Exception as e:
        # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
        logging.debug(str(e))
        self.assertIn('update stream service is not enabled', str(e))
def test_health_check_worker_state_does_not_shutdown_query_service(self):
    """Check a 'worker' rdonly tablet goes unhealthy but keeps serving.

    Similar to test_health_check, except that the second tablet is an
    'rdonly' (not a 'replica') and is switched to 'worker'; the query
    service is expected to keep running.
    """
    master, rdonly = tablet_62344, tablet_62044

    # Setup master and rdonly tablets.
    master.init_tablet("master", "test_keyspace", "0")
    for member in (master, rdonly):
        member.create_db("vt_test_keyspace")

    master.start_vttablet(wait_for_state=None, target_tablet_type="replica")
    rdonly.start_vttablet(wait_for_state=None,
                          target_tablet_type="rdonly",
                          init_keyspace="test_keyspace",
                          init_shard="0")

    master.wait_for_vttablet_state("SERVING")
    rdonly.wait_for_vttablet_state("NOT_SERVING")
    self.check_healthz(rdonly, False)

    # Enable replication.
    utils.run_vtctl(["InitShardMaster", "test_keyspace/0",
                     master.tablet_alias])

    # Trigger healthcheck to save time waiting for the next interval.
    utils.run_vtctl(["RunHealthCheck", rdonly.tablet_alias, "rdonly"])
    utils.wait_for_tablet_type(rdonly.tablet_alias, "rdonly")
    self.check_healthz(rdonly, True)
    rdonly.wait_for_vttablet_state("SERVING")

    # Change from rdonly to worker and stop replication. (These
    # actions are similar to the SplitClone vtworker command
    # implementation.) The tablet will become unhealthy, but the
    # query service is still running.
    utils.run_vtctl(["ChangeSlaveType", rdonly.tablet_alias, "worker"])
    utils.run_vtctl(["StopSlave", rdonly.tablet_alias])
    # Trigger healthcheck explicitly to avoid waiting for the next interval.
    utils.run_vtctl(["RunHealthCheck", rdonly.tablet_alias, "rdonly"])
    utils.wait_for_tablet_type(rdonly.tablet_alias, "worker")
    self.check_healthz(rdonly, False)

    # Make sure that replication got disabled.
    unhealthy_marker = (">unhealthy: replication_reporter: "
                        "Replication is not running</span></div>")
    self.assertIn(unhealthy_marker, rdonly.get_status())
    # Query service is still running.
    rdonly.wait_for_vttablet_state("SERVING")

    # Restart replication. Tablet will become healthy again.
    utils.run_vtctl(["ChangeSlaveType", rdonly.tablet_alias, "spare"])
    utils.wait_for_tablet_type(rdonly.tablet_alias, "spare")
    utils.run_vtctl(["StartSlave", rdonly.tablet_alias])
    utils.run_vtctl(["RunHealthCheck", rdonly.tablet_alias, "rdonly"])
    utils.wait_for_tablet_type(rdonly.tablet_alias, "rdonly")
    self.check_healthz(rdonly, True)
    rdonly.wait_for_vttablet_state("SERVING")

    # kill the tablets
    tablet.kill_tablets([master, rdonly])
def _test_service_disabled(self):
    """Check that a spare tablet refuses update stream requests.

    Runs an insert and a delete transaction, switches the replica to
    'spare' and tries to stream from the position captured at the start.
    If streaming raises, the error must mention that the update stream
    service is not enabled.
    """
    start_position = _get_repl_current_position()
    logging.debug('_test_service_disabled starting @ %s', start_position)
    self._exec_vt_txn(self._populate_vt_insert_test)
    self._exec_vt_txn(['delete from vt_insert_test'])
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')
    logging.debug('dialing replica update stream service')
    replica_conn = self._get_replica_stream_conn()
    try:
        # Only the first event (or the error raised instead) matters.
        for _ in replica_conn.stream_update(start_position):
            break
    except Exception as e:
        # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
        logging.debug(str(e))
        self.assertIn('update stream service is not enabled', str(e))
def test_update_stream_interrupt(self):
    """Checks that a running query is terminated on going non-serving.

    Streams from the replica through vtgate; on the first event the tablet
    is switched to 'spare', after which the stream is expected to end with
    a DatabaseError mentioning 'context canceled'.
    """
    # Make sure the replica is replica type.
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    logging.debug('sleeping a bit for the replica action to complete')
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica', 30)

    # Save current position, insert some data.
    start_position = _get_repl_current_position()
    logging.debug('test_update_stream_interrupt starting @ %s',
                  start_position)
    self._exec_vt_txn(self._populate_vt_a(1))
    self._exec_vt_txn(['delete from vt_a'])

    # Start an Update Stream from the slave. When we get the data, go to
    # spare. That should interrupt the streaming RPC.
    replica_conn = self._get_vtgate_stream_conn()
    first = True
    txn_count = 0
    try:
        for event, resume_timestamp in replica_conn.update_stream(
                'test_keyspace', topodata_pb2.REPLICA,
                event=query_pb2.EventToken(shard='0',
                                           position=start_position),
                shard='0'):
            logging.debug('test_update_stream_interrupt got event(%d): %s',
                          resume_timestamp, event)
            if first:
                utils.run_vtctl([
                    'ChangeSlaveType', replica_tablet.tablet_alias, 'spare'
                ])
                utils.wait_for_tablet_type(replica_tablet.tablet_alias,
                                           'spare', 30)
                first = False
            elif event.event_token.position:
                txn_count += 1

        # TestCase has no assertFail(); fail() is the method that reports
        # an unconditional failure.
        self.fail('update_stream terminated with no exception')
    except dbexceptions.DatabaseError as e:
        self.assertIn('context canceled', str(e))
    # At least one event must have arrived, or the switch never happened.
    self.assertFalse(first)

    logging.debug('Streamed %d transactions before exiting', txn_count)
    replica_conn.close()
def setup_tablets():
    """Create the single-shard keyspace and start its master and replica.

    The master is expected to reach SERVING and the replica NOT_SERVING
    before InitShardMaster is run. The function ends by rebuilding the
    keyspace graph and checking the partitions for cell 'test_nj'.
    """
    # Start up a master mysql and vttablet
    logging.debug('Setting up tablets')
    utils.run_vtctl(['CreateKeyspace', KEYSPACE_NAME])
    utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', KEYSPACE_NAME,
                     'keyspace_id', 'uint64'])

    master, replica = shard_0_master, shard_0_replica1
    master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='0',
                       tablet_index=0)
    replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='0',
                        tablet_index=1)
    utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

    for member in (master, replica):
        member.create_db('vt_test_keyspace')
        for ddl in create_tables:
            member.mquery(shard_0_master.dbname, ddl)
        member.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')

    master.wait_for_vttablet_state('SERVING')
    replica.wait_for_vttablet_state('NOT_SERVING')

    init_master_cmd = ['InitShardMaster', KEYSPACE_NAME+'/0',
                       master.tablet_alias]
    utils.run_vtctl(init_master_cmd, auto_log=True)

    utils.wait_for_tablet_type(replica.tablet_alias, 'replica')
    for member in (master, replica):
        member.wait_for_vttablet_state('SERVING')

    utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

    expected_partitions = ('Partitions(master): -\n'
                           'Partitions(rdonly): -\n'
                           'Partitions(replica): -\n')
    utils.check_srv_keyspace('test_nj', KEYSPACE_NAME, expected_partitions)
def setup_tablets():
    """Create the single-shard keyspace and start its master and replica.

    The master is expected to reach SERVING and the replica NOT_SERVING
    before InitShardMaster is run. The function ends by rebuilding the
    keyspace graph and checking the partitions for cell 'test_nj'.
    """
    # Start up a master mysql and vttablet
    logging.debug('Setting up tablets')
    utils.run_vtctl(['CreateKeyspace', KEYSPACE_NAME])
    utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', KEYSPACE_NAME,
                     'keyspace_id', 'uint64'])

    # (tablet, initial type, index inside the shard).
    layout = ((shard_0_master, 'master', 0),
              (shard_0_replica1, 'replica', 1))
    for member, role, index in layout:
        member.init_tablet(role, keyspace=KEYSPACE_NAME, shard='0',
                           tablet_index=index)
    utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

    members = [entry[0] for entry in layout]
    for member in members:
        member.create_db('vt_test_keyspace')
        for ddl in create_tables:
            member.mquery(shard_0_master.dbname, ddl)
        member.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')

    shard_0_master.wait_for_vttablet_state('SERVING')
    shard_0_replica1.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['InitShardMaster', KEYSPACE_NAME + '/0',
                     shard_0_master.tablet_alias], auto_log=True)

    utils.wait_for_tablet_type(shard_0_replica1.tablet_alias, 'replica')
    for member in members:
        member.wait_for_vttablet_state('SERVING')

    utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

    expected_partitions = ('Partitions(master): -\n'
                           'Partitions(rdonly): -\n'
                           'Partitions(replica): -\n')
    utils.check_srv_keyspace('test_nj', KEYSPACE_NAME, expected_partitions)
def setUpModule():
    """Start topo server, mysql, vtctld, both tablets and vtgate.

    The tablets are registered as 'master' and 'replica'; the topology is
    validated before the vttablets are started. On any failure
    tearDownModule() is called before the error is re-raised.
    """
    try:
        environment.topo_server().setup()

        mysql_procs = [member.init_mysql()
                       for member in (master_tablet, replica_tablet)]
        utils.wait_procs(mysql_procs)

        # start a vtctld so the vtctl insert commands are just RPCs, not forks.
        utils.Vtctld().start()

        # Start up a master mysql and vttablet
        logging.debug('Setting up tablets')
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
        master_tablet.init_tablet('master', 'test_keyspace', '0',
                                  tablet_index=0)
        replica_tablet.init_tablet('replica', 'test_keyspace', '0',
                                   tablet_index=1)
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.validate_topology()

        master_tablet.create_db('vt_test_keyspace')
        replica_tablet.create_db('vt_test_keyspace')

        master_tablet.start_vttablet(wait_for_state=None)
        replica_tablet.start_vttablet(wait_for_state=None)
        master_tablet.wait_for_vttablet_state('SERVING')
        replica_tablet.wait_for_vttablet_state('NOT_SERVING')

        init_master_cmd = ['InitShardMaster', 'test_keyspace/0',
                           master_tablet.tablet_alias]
        utils.run_vtctl(init_master_cmd, auto_log=True)

        utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')
        master_tablet.wait_for_vttablet_state('SERVING')
        replica_tablet.wait_for_vttablet_state('SERVING')

        # Create the test tables on the master, then reload schema everywhere.
        for create_statement in (_create_vt_a, _create_vt_b):
            master_tablet.mquery('vt_test_keyspace', create_statement)

        utils.run_vtctl(['ReloadSchema', master_tablet.tablet_alias])
        utils.run_vtctl(['ReloadSchema', replica_tablet.tablet_alias])
        utils.run_vtctl(['RebuildVSchemaGraph'])

        utils.VtGate().start(tablets=[master_tablet, replica_tablet])
        for endpoint in ('test_keyspace.0.master', 'test_keyspace.0.replica'):
            utils.vtgate.wait_for_endpoints(endpoint, 1)
    except:
        # Deliberately catches everything: clean up, then re-raise unchanged.
        tearDownModule()
        raise
def _test_service_disabled(self):
    """Check that a spare tablet refuses to start an update stream.

    Runs an insert and a delete transaction, switches the replica to
    'spare', dials the update stream service and tries to start a stream
    from the position captured at the start. If that raises, the error text
    must be exactly "update stream service is not enabled".
    """
    start_position = _get_repl_current_position()
    logging.debug("_test_service_disabled starting @ %s", start_position)
    self._exec_vt_txn(self._populate_vt_insert_test)
    self._exec_vt_txn(["delete from vt_insert_test"])
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "spare"])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, "spare")
    replica_conn = self._get_replica_stream_conn()
    logging.debug("dialing replica update stream service")
    replica_conn.dial()
    try:
        # Only the raised error matters; the returned value was never used.
        replica_conn.stream_start(start_position)
    except Exception as e:
        # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
        logging.debug(str(e))
        if str(e) == "update stream service is not enabled":
            logging.debug("Test Service Disabled: Pass")
        else:
            self.fail("Test Service Disabled: Fail - did not throw the "
                      "correct exception")
def _test_service_enabled(self):
    """Check that the replica serves an update stream containing a DML event.

    Switches the replica tablet to type 'replica', starts a background
    writer thread (self.perform_writes with argument 100) and reads the
    update stream from the position captured at the start until the first
    DML event shows up. Any exception raised while streaming fails the test.
    """
    start_position = _get_repl_current_position()
    logging.debug("_test_service_enabled starting @ %s", start_position)
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias,
                     "replica"])
    logging.debug("sleeping a bit for the replica action to complete")
    utils.wait_for_tablet_type(replica_tablet.tablet_alias,
                               tablet.Tablet.tablet_type_value["REPLICA"], 30)

    # Daemon thread: it must not keep the test process alive on failure.
    thd = threading.Thread(target=self.perform_writes, name="write_thd",
                           args=(100,))
    thd.daemon = True
    thd.start()

    replica_conn = self._get_replica_stream_conn()
    try:
        for stream_event in replica_conn.stream_update(start_position):
            if stream_event.category == update_stream.StreamEvent.DML:
                logging.debug("Test Service Enabled: Pass")
                break
    except Exception as e:
        # format_exc() returns the traceback text; print_exc() returns None,
        # which made the failure message end in "Traceback None".
        self.fail("Exception in getting stream from replica: %s\n"
                  " Traceback %s" % (str(e), traceback.format_exc()))
def _test_service_disabled(self):
    """Check that a spare tablet refuses to start an update stream.

    Runs an insert and a delete transaction, switches the replica to
    'spare', dials the update stream service and tries to start a stream
    from the position captured at the start. If that raises, the error text
    must be exactly 'update stream service is not enabled'.
    """
    start_position = _get_repl_current_position()
    logging.debug('_test_service_disabled starting @ %s', start_position)
    self._exec_vt_txn(self._populate_vt_insert_test)
    self._exec_vt_txn(['delete from vt_insert_test'])
    utils.run_vtctl(['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')
    replica_conn = self._get_replica_stream_conn()
    logging.debug('dialing replica update stream service')
    replica_conn.dial()
    try:
        # Only the raised error matters; the returned value was never used.
        replica_conn.stream_start(start_position)
    except Exception as e:
        # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
        logging.debug(str(e))
        if str(e) == 'update stream service is not enabled':
            logging.debug('Test Service Disabled: Pass')
        else:
            self.fail(
                'Test Service Disabled: Fail - did not throw the correct '
                'exception')
def test_service_switch(self):
    """Check the update stream state follows the tablet type.

    Moves the replica to 'spare' and verifies UpdateStreamState is
    'Disabled' and that new stream requests are rejected both by vttablet
    and through vtgate, then moves it back to 'replica' and verifies
    UpdateStreamState is 'Enabled'.
    """
    # make the replica spare
    utils.run_vtctl(['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')

    # Check UpdateStreamState is disabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Disabled':
        self.fail("Update stream service should be 'Disabled' but is '%s'" %
                  v['UpdateStreamState'])

    start_position = _get_repl_current_position()

    # Make sure we can't start a new request to vttablet directly.
    _, stderr = utils.run_vtctl(['VtTabletUpdateStream',
                                 '-position', start_position,
                                 replica_tablet.tablet_alias],
                                expect_fail=True)
    self.assertIn('operation not allowed in state NOT_SERVING', stderr)

    # Make sure we can't start a new request through vtgate.
    replica_conn = self._get_vtgate_stream_conn()
    try:
        for event, resume_timestamp in replica_conn.update_stream(
                'test_keyspace', topodata_pb2.REPLICA,
                event=query_pb2.EventToken(shard='0',
                                           position=start_position),
                shard='0'):
            # TestCase has no assertFail(); fail() is the method that
            # reports an unconditional failure.
            self.fail('got event(%d): %s' % (resume_timestamp, str(event)))
        self.fail('update_stream terminated with no exception')
    except dbexceptions.DatabaseError as e:
        self.assertIn(vtgate_gateway_flavor().no_tablet_found_message(),
                      str(e))

    # Go back to replica.
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')

    # Check UpdateStreamState is enabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Enabled':
        self.fail("Update stream service should be 'Enabled' but is '%s'" %
                  v['UpdateStreamState'])
def _test_service_disabled(self):
    """Verify that update streaming is refused once the replica is 'spare'.

    Captures the replication position, runs the populate and delete
    transactions, demotes the replica tablet to spare, and attempts to pull
    one event from the replica stream.  Any exception raised must mention
    that the update stream service is not enabled.  Finally the tablet's
    'UpdateStreamState' var must read 'Disabled'.
    """
    position = _get_repl_current_position()
    logging.debug("_test_service_disabled starting @ %s", position)

    self._exec_vt_txn(self._populate_vt_insert_test)
    self._exec_vt_txn(["delete from vt_insert_test"])

    change_type_cmd = ["ChangeSlaveType", replica_tablet.tablet_alias, "spare"]
    utils.run_vtctl(change_type_cmd)
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, topodata_pb2.SPARE)

    logging.debug("dialing replica update stream service")
    stream_conn = self._get_replica_stream_conn()
    try:
        # Pull at most one event; only a raised error is of interest.
        next(iter(stream_conn.stream_update(position)), None)
    except Exception as err:
        self.assertIn("update stream service is not enabled", str(err))
    stream_conn.close()

    tablet_vars = utils.get_vars(replica_tablet.port)
    if not tablet_vars["UpdateStreamState"] == "Disabled":
        self.fail(
            "Update stream service should be 'Disabled' but is '%s'"
            % tablet_vars["UpdateStreamState"]
        )
def test_update_stream_interrupt(self):
  """Checks that a running query is terminated on going non-serving.

  Makes sure the replica tablet has type 'replica', records the current
  replication position and runs two transactions.  It then opens an update
  stream through vtgate from that position; on the first event received
  the tablet is switched to 'spare'.  The stream is expected to end with a
  DatabaseError containing 'context canceled'.
  """
  # Make sure the replica is replica type.
  utils.run_vtctl(
      ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
  logging.debug('sleeping a bit for the replica action to complete')
  utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica', 30)

  # Save current position, insert some data.
  start_position = _get_repl_current_position()
  logging.debug('test_update_stream_interrupt starting @ %s', start_position)
  self._exec_vt_txn(self._populate_vt_a(1))
  self._exec_vt_txn(['delete from vt_a'])

  # Start an Update Stream from the slave. When we get the data, go to spare.
  # That should interrupt the streaming RPC.
  replica_conn = self._get_vtgate_stream_conn()
  first = True
  txn_count = 0
  try:
    for event, resume_timestamp in replica_conn.update_stream(
        'test_keyspace', topodata_pb2.REPLICA,
        event=query_pb2.EventToken(shard='0', position=start_position),
        shard='0'):
      logging.debug('test_update_stream_interrupt got event(%d): %s',
                    resume_timestamp, event)
      if first:
        utils.run_vtctl(
            ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
        utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare', 30)
        first = False
      else:
        if event.event_token.position:
          txn_count += 1
    # unittest.TestCase has no assertFail(); fail() reports the failure
    # instead of raising AttributeError.
    self.fail('update_stream terminated with no exception')
  except dbexceptions.DatabaseError as e:
    self.assertIn('context canceled', str(e))
  # At least one event must have been seen before the interruption.
  self.assertFalse(first)

  logging.debug('Streamed %d transactions before exiting', txn_count)
  replica_conn.close()
def _test_service_disabled(self):
  """Verify that update streaming is refused once the replica is 'spare'.

  Captures the replication position, runs the populate and delete
  transactions, demotes the replica tablet to spare, and attempts to pull
  one event from the replica stream.  A DatabaseError, if raised, must
  mention that the update stream service is not enabled.  Finally the
  tablet's 'UpdateStreamState' var must read 'Disabled'.
  """
  position = _get_repl_current_position()
  logging.debug('_test_service_disabled starting @ %s', position)

  self._exec_vt_txn(self._populate_vt_insert_test)
  self._exec_vt_txn(['delete from vt_insert_test'])

  change_type_cmd = ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare']
  utils.run_vtctl(change_type_cmd)
  utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')

  logging.debug('dialing replica update stream service')
  stream_conn = self._get_replica_stream_conn()
  try:
    # Pull at most one event; only a raised error is of interest.
    next(iter(stream_conn.stream_update(position)), None)
  except dbexceptions.DatabaseError as err:
    self.assertIn('update stream service is not enabled', str(err))
  stream_conn.close()

  tablet_vars = utils.get_vars(replica_tablet.port)
  if not tablet_vars['UpdateStreamState'] == 'Disabled':
    self.fail("Update stream service should be 'Disabled' but is '%s'" %
              tablet_vars['UpdateStreamState'])
def _test_service_enabled(self):
  """Checks that a 'replica' tablet serves the update stream.

  Records the current replication position, switches the replica tablet to
  'replica', starts a daemon thread running self.perform_writes(100), and
  reads the replica update stream from the recorded position until a DML
  event is seen.  Any exception raised while streaming fails the test.
  """
  start_position = _get_repl_current_position()
  logging.debug('_test_service_enabled starting @ %s', start_position)
  utils.run_vtctl(['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
  logging.debug('sleeping a bit for the replica action to complete')
  utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica', 30)
  thd = threading.Thread(target=self.perform_writes, name='write_thd',
                         args=(100,))
  # Daemon thread so a stuck writer cannot keep the test process alive.
  thd.daemon = True
  thd.start()
  replica_conn = self._get_replica_stream_conn()

  try:
    for stream_event in replica_conn.stream_update(start_position):
      if stream_event.category == update_stream.StreamEvent.DML:
        logging.debug('Test Service Enabled: Pass')
        break
  except Exception as e:  # 'as' form parses on Python 2.6+ and Python 3.
    # format_exc() returns the traceback text; print_exc() returns None and
    # would render as "Traceback None" in the failure message.
    self.fail('Exception in getting stream from replica: %s\n Traceback %s' %
              (str(e), traceback.format_exc()))
def _test_service_disabled(self):
  """Verify that update streaming is refused once the replica is 'spare'.

  Captures the replication position, runs the populate and delete
  transactions, demotes the replica tablet to spare, and attempts to pull
  one event from the replica stream.  Any exception raised must mention
  that the update stream service is not enabled.  Finally the tablet's
  'UpdateStreamState' var must read 'Disabled'.
  """
  position = _get_repl_current_position()
  logging.debug('_test_service_disabled starting @ %s', position)

  self._exec_vt_txn(self._populate_vt_insert_test)
  self._exec_vt_txn(['delete from vt_insert_test'])

  change_type_cmd = ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare']
  utils.run_vtctl(change_type_cmd)
  utils.wait_for_tablet_type(replica_tablet.tablet_alias,
                             tablet.Tablet.tablet_type_value['SPARE'])

  logging.debug('dialing replica update stream service')
  stream_conn = self._get_replica_stream_conn()
  try:
    # Pull at most one event; only a raised error is of interest.
    next(iter(stream_conn.stream_update(position)), None)
  except Exception as err:
    self.assertIn('update stream service is not enabled', str(err))
  stream_conn.close()

  tablet_vars = utils.get_vars(replica_tablet.port)
  if not tablet_vars['UpdateStreamState'] == 'Disabled':
    self.fail("Update stream service should be 'Disabled' but is '%s'" %
              tablet_vars['UpdateStreamState'])
def _test_service_enabled(self):
    """Checks that a 'replica' tablet serves the update stream.

    Records the current replication position, switches the replica tablet
    to 'replica', starts a daemon thread running self.perform_writes(100),
    dials the replica stream connection and starts streaming from the
    recorded position.  Up to 10 further events are read; reading stops
    early on a 'DML' event when utils.options.verbose is 2.  Any exception
    raised while streaming fails the test.
    """
    start_position = _get_repl_current_position()
    logging.debug("_test_service_enabled starting @ %s", start_position)
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "replica"])
    logging.debug("sleeping a bit for the replica action to complete")
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, "replica", 30)
    thd = threading.Thread(target=self.perform_writes, name="write_thd", args=(100,))
    # Daemon thread so a stuck writer cannot keep the test process alive.
    thd.daemon = True
    thd.start()
    replica_conn = self._get_replica_stream_conn()
    replica_conn.dial()

    try:
        data = replica_conn.stream_start(start_position)
        # range() works on both Python 2 and 3; xrange() is Python 2 only.
        for _ in range(10):
            data = replica_conn.stream_next()
            if data["Category"] == "DML" and utils.options.verbose == 2:
                logging.debug("Test Service Enabled: Pass")
                break
    except Exception as e:  # 'as' form parses on Python 2.6+ and Python 3.
        # format_exc() returns the traceback text; print_exc() returns None
        # and would render as "Traceback None" in the failure message.
        self.fail(
            "Exception in getting stream from replica: %s\n Traceback %s"
            % (str(e), traceback.format_exc())
        )
def setup_unsharded_keyspace():
  """Bring up the unsharded keyspace with one master and one replica.

  Creates the keyspace and its sharding info, initializes both tablets in
  shard '0', creates the database and test table on each, starts the
  vttablets, elects the master with InitShardMaster, waits until both
  tablets are SERVING, and finally rebuilds and checks the serving graph.
  """
  keyspace = UNSHARDED_KEYSPACE
  master, replica = unsharded_master, unsharded_replica
  both_tablets = (master, replica)

  utils.run_vtctl(['CreateKeyspace', keyspace])
  sharding_info_cmd = ['SetKeyspaceShardingInfo', '-force', keyspace,
                       'keyspace_id', 'uint64']
  utils.run_vtctl(sharding_info_cmd)

  master.init_tablet('master', keyspace=keyspace, shard='0', tablet_index=0)
  replica.init_tablet('replica', keyspace=keyspace, shard='0', tablet_index=1)

  for current in both_tablets:
    current.create_db('vt_test_keyspace_unsharded')
    current.mquery(master.dbname, create_vt_insert_test)
    current.start_vttablet(wait_for_state=None)

  # Before the master election only the master is serving.
  master.wait_for_vttablet_state('SERVING')
  replica.wait_for_vttablet_state('NOT_SERVING')

  init_master_cmd = ['InitShardMaster', '-force', '%s/0' % keyspace,
                     master.tablet_alias]
  utils.run_vtctl(init_master_cmd, auto_log=True)

  utils.wait_for_tablet_type(replica.tablet_alias, 'replica')

  for current in both_tablets:
    current.wait_for_vttablet_state('SERVING')

  # rebuild to be sure we have the right version
  utils.run_vtctl(['RebuildKeyspaceGraph', keyspace], auto_log=True)
  expected_partitions = ('Partitions(master): -\n'
                         'Partitions(rdonly): -\n'
                         'Partitions(replica): -\n')
  utils.check_srv_keyspace('test_nj', keyspace, expected_partitions)
def test_update_stream_interrupt(self):
    """Checks that a running query is terminated on going non-serving.

    Makes sure the replica tablet has type 'replica', records the current
    replication position and runs two transactions.  It then opens an
    update stream from that position; on the first event received the
    tablet is switched to 'spare'.  The stream is expected to end with a
    DatabaseError containing 'context canceled'.
    """
    # Make sure the replica is replica type.
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "replica"])
    logging.debug("sleeping a bit for the replica action to complete")
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, "replica", 30)

    # Save current position, insert some data.
    start_position = _get_repl_current_position()
    logging.debug("test_update_stream_interrupt starting @ %s", start_position)
    self._exec_vt_txn(self._populate_vt_a(1))
    self._exec_vt_txn(["delete from vt_a"])

    # Start an Update Stream from the slave. When we get the data, go to spare.
    # That should interrupt the streaming RPC.
    replica_conn = self._get_replica_stream_conn()
    first = True
    txn_count = 0
    try:
        for event in replica_conn.stream_update(
            "test_keyspace", "0", topodata_pb2.REPLICA, position=start_position
        ):
            logging.debug("test_update_stream_interrupt got event: %s", event)
            if first:
                utils.run_vtctl(
                    ["ChangeSlaveType", replica_tablet.tablet_alias, "spare"]
                )
                utils.wait_for_tablet_type(replica_tablet.tablet_alias, "spare", 30)
                first = False
            else:
                if event.event_token.position:
                    txn_count += 1
        # unittest.TestCase has no assertFail(); fail() reports the failure
        # instead of raising AttributeError.
        self.fail("stream_update terminated with no exception")
    except dbexceptions.DatabaseError as e:
        self.assertIn("context canceled", str(e))
    # At least one event must have been seen before the interruption.
    self.assertFalse(first)

    logging.debug("Streamed %d transactions before exiting", txn_count)
    replica_conn.close()
def test_service_switch(self):
  """Tests the service switch from disable -> enable -> disable.

  Moves the replica tablet to 'spare' and checks that the tablet's
  'UpdateStreamState' var is 'Disabled' and that a new stream_update
  request raises a DatabaseError mentioning the NOT_SERVING state.  Then
  moves the tablet back to 'replica' and checks that 'UpdateStreamState'
  is 'Enabled'.
  """
  # make the replica spare
  utils.run_vtctl(
      ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
  utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')

  # Check UpdateStreamState is disabled.
  v = utils.get_vars(replica_tablet.port)
  if v['UpdateStreamState'] != 'Disabled':
    self.fail(
        "Update stream service should be 'Disabled' but is '%s'" %
        v['UpdateStreamState'])

  # Make sure we can't start a new request.
  start_position = _get_repl_current_position()
  replica_conn = self._get_replica_stream_conn()
  try:
    for event in replica_conn.stream_update('test_keyspace', '0',
                                            topodata_pb2.REPLICA,
                                            position=start_position):
      # unittest.TestCase has no assertFail(); fail() is the method that
      # reports an unconditional test failure.
      self.fail('got event: %s' % str(event))
    self.fail('stream_update terminated with no exception')
  except dbexceptions.DatabaseError as e:
    self.assertIn('operation not allowed in state NOT_SERVING', str(e))

  # Go back to replica.
  utils.run_vtctl(
      ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
  utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')

  # Check UpdateStreamState is enabled.
  v = utils.get_vars(replica_tablet.port)
  if v['UpdateStreamState'] != 'Enabled':
    self.fail("Update stream service should be 'Enabled' but is '%s'" %
              v['UpdateStreamState'])
def test_resharding(self):
  """End-to-end test of splitting shard '0' of test_keyspace in two.

  The steps, in the order performed below:
    1. Bring up master / replica / rdonly tablets in shard '0', create the
       schema and startup values, start vtgate and check split_query.
    2. Add and backfill the sharding key column, then mark the keyspace as
       sharded on 'custom_ksid_col'.
    3. Bring up master / replica / rdonly tablets for shards '-80' and
       '80-' and restart vtgate with all nine tablets.
    4. Copy the schema, run an online SplitClone through a vtworker daemon,
       modify the destination shards by hand, and run SplitClone again,
       checking the reconciliation counters after each run.
    5. Check filtered replication with a bulk insert, then run SplitDiff
       on both destination shards.
    6. Migrate served types rdonly -> replica (forward, reverse, forward
       again) -> master, checking the serving graph after each step.
    7. Delete the original tablets and shard, then kill the new tablets.
  """
  # create the keyspace with just one shard
  shard_master.init_tablet(
      'master', keyspace='test_keyspace', shard='0', tablet_index=0)
  shard_replica.init_tablet(
      'replica', keyspace='test_keyspace', shard='0', tablet_index=1)
  shard_rdonly1.init_tablet(
      'rdonly', keyspace='test_keyspace', shard='0', tablet_index=2)

  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.create_db('vt_test_keyspace')

  shard_master.start_vttablet(wait_for_state=None)
  shard_replica.start_vttablet(wait_for_state=None)
  shard_rdonly1.start_vttablet(wait_for_state=None)

  shard_master.wait_for_vttablet_state('SERVING')
  for t in [shard_replica, shard_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   shard_master.tablet_alias], auto_log=True)

  utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica')
  utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly')
  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.wait_for_vttablet_state('SERVING')

  # create the tables and add startup values
  self._create_schema()
  self._insert_startup_values()

  # reload schema on all tablets so we can query them
  for t in [shard_master, shard_replica, shard_rdonly1]:
    utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

  # must start vtgate after tablets are up, or else wait until 1min refresh
  # we want cache_ttl at zero so we re-read the topology for every test query.
  utils.VtGate().start(cache_ttl='0', tablets=[
      shard_master, shard_replica, shard_rdonly1])
  utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteShards,
  # as we're not sharded yet.
  # we have 3 values in the database, asking for 4 splits will get us
  # a single query.
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 1)
  self.assertEqual(s[0]['shard_part']['shards'][0], '0')

  # change the schema, backfill keyspace_id, and change schema again
  self._add_sharding_key_to_schema()
  self._backfill_keyspace_id(shard_master)
  self._mark_sharding_key_not_null()

  # now we can be a sharded keyspace (and propagate to SrvKeyspace)
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'custom_ksid_col', base_sharding.keyspace_id_type])
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)

  # run a health check on source replica so it responds to discovery
  utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias])

  # create the split shards
  shard_0_master.init_tablet(
      'master', keyspace='test_keyspace', shard='-80', tablet_index=0)
  shard_0_replica.init_tablet(
      'replica', keyspace='test_keyspace', shard='-80', tablet_index=1)
  shard_0_rdonly1.init_tablet(
      'rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2)
  shard_1_master.init_tablet(
      'master', keyspace='test_keyspace', shard='80-', tablet_index=0)
  shard_1_replica.init_tablet(
      'replica', keyspace='test_keyspace', shard='80-', tablet_index=1)
  shard_1_rdonly1.init_tablet(
      'rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2)

  for t in [shard_0_master, shard_0_replica,
            shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_1_master]:
    t.wait_for_vttablet_state('SERVING')
  for t in [shard_0_replica, shard_0_rdonly1,
            shard_1_replica, shard_1_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  for t in [shard_0_replica, shard_1_replica]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    utils.wait_for_tablet_type(t.tablet_alias, 'rdonly')

  sharded_tablets = [shard_0_master, shard_0_replica, shard_0_rdonly1,
                     shard_1_master, shard_1_replica, shard_1_rdonly1]
  for t in sharded_tablets:
    t.wait_for_vttablet_state('SERVING')

  # must restart vtgate after tablets are up, or else wait until 1min refresh
  # we want cache_ttl at zero so we re-read the topology for every test query.
  utils.vtgate.kill()
  utils.VtGate().start(cache_ttl='0', tablets=[
      shard_master, shard_replica, shard_rdonly1,
      shard_0_master, shard_0_replica, shard_0_rdonly1,
      shard_1_master, shard_1_replica, shard_1_rdonly1])
  utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteKeyRanges now,
  # as we are sharded (with just one shard).
  # again, we have 3 values in the database, asking for 4 splits will get us
  # a single query.
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 1)
  self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
  # There must be one empty KeyRange which represents the full keyspace.
  self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
  self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {})

  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
    utils.run_vtctl(['CopySchemaShard',
                     '--exclude_tables', 'unrelated',
                     shard_rdonly1.tablet_alias,
                     keyspace_shard],
                    auto_log=True)
  utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias])

  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      ['--cell', 'test_nj', '--command_display_interval', '10ms'],
      auto_log=True)

  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--exclude_tables', 'unrelated',
       '--min_table_size_for_split', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/0'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      3, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 1 (provokes an insert).
  shard_0_master.mquery('vt_test_keyspace',
                        'delete from resharding1 where id=1', write=True)
  # Delete row 2 (provokes an insert).
  shard_1_master.mquery('vt_test_keyspace',
                        'delete from resharding1 where id=2', write=True)
  # Update row 3 (provokes an update).
  shard_1_master.mquery('vt_test_keyspace',
                        "update resharding1 set msg='msg-not-3' where id=3",
                        write=True)
  # Insert row 4 (provokes a delete).
  self._insert_value(shard_1_master, 'resharding1', 4, 'msg4',
                     0xD000000000000000)

  # Second SplitClone run: no '--offline=false' this time, so both the
  # 'Online' and 'Offline' counters are verified afterwards.
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--exclude_tables', 'unrelated',
       '--min_table_size_for_split', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/0'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      2, 1, 1)
  self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                      0, 0, 0)
  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  logging.debug('Waiting for binlog players to start on new masters...')
  self.check_destination_master(shard_0_master, ['test_keyspace/0'])
  self.check_destination_master(shard_1_master, ['test_keyspace/0'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  if v != 100:
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data
  logging.debug('Running vtworker SplitDiff for -80')
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/-80'],
                     auto_log=True)

  logging.debug('Running vtworker SplitDiff for 80-')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/80-'],
                     auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_0_master, 2000, 2000)
  self.check_running_binlog_player(shard_1_master, 6000, 2000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # make sure rdonly tablets are back to serving before hitting vtgate.
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    t.wait_for_vttablet_state('SERVING')
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteKeyRanges
  # on both destination shards now.
  # we ask for 2 splits to only have one per shard
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 2)
  self.assertEqual(len(s), 2)
  self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
  self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace')
  self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
  self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1)

  # then serve replica from the split shards
  source_tablet = shard_replica
  destination_tablets = [shard_0_replica, shard_1_replica]

  utils.run_vtctl(
      ['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # move replica back and forth
  utils.run_vtctl(
      ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'],
      auto_log=True)
  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, source_tablet, True, False)
  utils.check_tablet_query_services(self, destination_tablets, False, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, source_tablet, False, True)
  utils.check_tablet_query_services(self, destination_tablets, True, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_0_master)
  self.check_no_binlog_player(shard_1_master)

  # make sure we can't delete a shard with tablets
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

  # remove the original tablets in the original shard
  tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
  for t in [shard_replica, shard_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(['DeleteTablet', '-allow_master',
                   shard_master.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1,
                       shard_1_master, shard_1_replica, shard_1_rdonly1])
def test_sharded_recovery(self):
  """Test recovery from backup flow.

  test_recovery will:
  - create a shard with master and replica1 only
  - run InitShardMaster
  - insert some data
  - perform a resharding
  - take a backup of both new shards
  - insert more data on the masters of both shards
  - create a recovery keyspace
  - bring up tablet_replica2 and tablet_replica3 in the new keyspace
  - check that new tablets do not have data created after backup
  - check that vtgate queries work correctly

  Only the steps from "insert some data" onwards are performed in this
  method; the original shard is presumably brought up by the test fixture
  (NOTE(review): confirm against setUp).
  """

  # insert data on master, wait for replica to get it
  utils.run_vtctl([
      'ApplySchema', '-sql', self._create_vt_insert_test, 'test_keyspace'
  ], auto_log=True)
  self._insert_data(tablet_master, 1)
  self._check_data(tablet_replica1, 1, 'replica1 tablet getting data')
  # insert more data on the master
  self._insert_data(tablet_master, 4)

  utils.run_vtctl(
      ['ApplyVSchema', '-vschema', self._vschema_json, 'test_keyspace'],
      auto_log=True)

  # create the split shards
  shard_0_master.init_tablet('replica', keyspace='test_keyspace',
                             shard='-80', tablet_index=0)
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='-80', tablet_index=1)
  shard_0_rdonly.init_tablet('rdonly', keyspace='test_keyspace',
                             shard='-80', tablet_index=2)
  shard_1_master.init_tablet('replica', keyspace='test_keyspace',
                             shard='80-', tablet_index=0)
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='80-', tablet_index=1)
  shard_1_rdonly.init_tablet('rdonly', keyspace='test_keyspace',
                             shard='80-', tablet_index=2)

  for t in [
      shard_0_master, shard_0_replica, shard_0_rdonly,
      shard_1_master, shard_1_replica, shard_1_rdonly
  ]:
    t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=True)

  for t in [
      shard_0_master, shard_0_replica, shard_0_rdonly,
      shard_1_master, shard_1_replica, shard_1_rdonly
  ]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/-80',
      shard_0_master.tablet_alias
  ], auto_log=True)
  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/80-',
      shard_1_master.tablet_alias
  ], auto_log=True)

  for t in [shard_0_replica, shard_1_replica]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')

  sharded_tablets = [
      shard_0_master, shard_0_replica, shard_0_rdonly,
      shard_1_master, shard_1_replica, shard_1_rdonly
  ]
  for t in sharded_tablets:
    t.wait_for_vttablet_state('SERVING')

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
    utils.run_vtctl(
        ['CopySchemaShard', 'test_keyspace/0', keyspace_shard],
        auto_log=True)

  utils.run_vtctl(['SplitClone', 'test_keyspace', '0', '-80,80-'],
                  auto_log=True)

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)

  # remove the original tablets in the original shard
  tablet.kill_tablets([tablet_master, tablet_replica1, tablet_rdonly])
  for t in [tablet_replica1, tablet_rdonly]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(
      ['DeleteTablet', '-allow_master', tablet_master.tablet_alias],
      auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # Record per-shard row counts (and one id per shard) before the backup;
  # the restored tablets are compared against these counts later.
  result = shard_0_master.mquery('vt_test_keyspace',
                                 "select count(*) from vt_insert_test")
  shard_0_count = result[0][0]
  logging.debug("Shard -80 has %d rows", shard_0_count)
  # The *_test_id values are only consumed by the disabled 'use ks:shard'
  # checks in the TODO block at the end of this method.
  shard_0_test_id = 0
  if shard_0_count > 0:
    result = shard_0_master.mquery('vt_test_keyspace',
                                   "select id from vt_insert_test")
    shard_0_test_id = result[0][0]

  result = shard_1_master.mquery('vt_test_keyspace',
                                 "select count(*) from vt_insert_test")
  shard_1_count = result[0][0]
  logging.debug("Shard 80- has %d rows", shard_1_count)
  shard_1_test_id = 0
  if shard_1_count > 0:
    result = shard_1_master.mquery('vt_test_keyspace',
                                   "select id from vt_insert_test")
    shard_1_test_id = result[0][0]

  # backup the new shards
  utils.run_vtctl(['Backup', shard_0_replica.tablet_alias], auto_log=True)
  utils.run_vtctl(['Backup', shard_1_replica.tablet_alias], auto_log=True)

  # check that the backup shows up in the listing
  backups = self._list_backups('-80')
  logging.debug('list of backups: %s', backups)
  self.assertEqual(len(backups), 1)
  self.assertTrue(backups[0].endswith(shard_0_replica.tablet_alias))

  backups = self._list_backups('80-')
  logging.debug('list of backups: %s', backups)
  self.assertEqual(len(backups), 1)
  self.assertTrue(backups[0].endswith(shard_1_replica.tablet_alias))

  # start vtgate
  vtgate = utils.VtGate()
  vtgate.start(tablets=[shard_0_master, shard_1_master],
               tablet_types_to_wait='MASTER')
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1)

  vtgate_conn = get_connection()
  cursor = vtgate_conn.cursor(tablet_type='master', keyspace=None,
                              writable=True)
  # insert more data on the masters
  for i in [2, 3]:
    cursor.execute(
        'insert into vt_insert_test (id, msg) values (:id, :msg)',
        {'id': i, 'msg': 'test %s' % i})

  vtgate_conn.close()
  vtgate.kill()

  # now bring up the recovery keyspace and 2 tablets, letting it restore
  # from backup.
  self._restore(tablet_replica2, 'recovery_keyspace', '-80')
  self._restore(tablet_replica3, 'recovery_keyspace', '80-')

  # check the new replicas have the correct number of rows
  self._check_data(tablet_replica2, shard_0_count,
                   'replica2 tablet should not have new data')
  self._check_data(tablet_replica3, shard_1_count,
                   'replica3 tablet should not have new data')

  # start vtgate
  vtgate = utils.VtGate()
  vtgate.start(tablets=[
      shard_0_master, shard_0_replica, shard_1_master, shard_1_replica,
      tablet_replica2, tablet_replica3
  ], tablet_types_to_wait='REPLICA')
  # Wait for every master/replica endpoint of both shards.  The previous
  # code waited for '-80.master' and '80-.replica' twice each, so
  # '-80.replica' and '80-.master' were never waited for before the
  # replica queries below.
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1)
  utils.vtgate.wait_for_endpoints('recovery_keyspace.-80.replica', 1)
  utils.vtgate.wait_for_endpoints('recovery_keyspace.80-.replica', 1)

  # check that vtgate doesn't route queries to new tablet
  vtgate_conn = get_connection()
  cursor = vtgate_conn.cursor(tablet_type='replica', keyspace=None,
                              writable=True)

  cursor.execute('select count(*) from vt_insert_test', {})
  result = cursor.fetchall()
  if not result:
    self.fail('Result cannot be null')
  else:
    self.assertEqual(result[0][0], 4)

  # check that new keyspace is accessible by using ks.table
  cursor.execute('select count(*) from recovery_keyspace.vt_insert_test', {})
  result = cursor.fetchall()
  if not result:
    self.fail('Result cannot be null')
  else:
    self.assertEqual(result[0][0], 2)

  # check that new keyspace is accessible with 'use ks'
  cursor.execute('use recovery_keyspace@replica', {})
  cursor.execute('select count(*) from vt_insert_test', {})
  result = cursor.fetchall()
  if not result:
    self.fail('Result cannot be null')
  else:
    self.assertEqual(result[0][0], 2)

  # TODO check that new tablet is accessible with 'use ks:shard'
  # this currently does not work through the python client, though it works
  # from mysql client
  #cursor.execute('use recovery_keyspace:-80@replica', {})
  #cursor.execute('select count(*) from vt_insert_test', {})
  #result = cursor.fetchall()
  #if not result:
  #  self.fail('Result cannot be null')
  #else:
  #  self.assertEqual(result[0][0], shard_0_count)
  #cursor.execute('select id from vt_insert_test', {})
  #result = cursor.fetchall()
  #if not result:
  #  self.fail('Result cannot be null')
  #else:
  #  self.assertEqual(result[0][0], shard_0_test_id)

  #cursor.execute('use recovery_keyspace:80-@replica', {})
  #cursor.execute('select count(*) from vt_insert_test', {})
  #result = cursor.fetchall()
  #if not result:
  #  self.fail('Result cannot be null')
  #else:
  #  self.assertEqual(result[0][0], shard_1_count)
  #cursor.execute('use recovery_keyspace:80-@replica', {})
  #cursor.execute('select id from vt_insert_test', {})
  #result = cursor.fetchall()
  #if not result:
  #  self.fail('Result cannot be null')
  #else:
  #  self.assertEqual(result[0][0], shard_1_test_id)

  vtgate_conn.close()
  tablet_replica2.kill_vttablet()
  tablet_replica3.kill_vttablet()
  vtgate.kill()
v = utils.get_vars(replica_tablet.port) if v["UpdateStreamState"] != "Enabled": self.fail("Update stream service should be 'Enabled' but is '%s'" % v["UpdateStreamState"]) self.assertIn("DML", v["UpdateStreamEvents"]) self.assertIn("POS", v["UpdateStreamEvents"]) logging.debug("Testing enable -> disable switch starting @ %s", start_position) replica_conn = self._get_replica_stream_conn() first = True txn_count = 0 try: for stream_event in replica_conn.stream_update(start_position): if first: utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "spare"]) utils.wait_for_tablet_type( replica_tablet.tablet_alias, tablet.Tablet.tablet_type_value["SPARE"], 30 ) first = False else: if stream_event.category == update_stream.StreamEvent.POS: txn_count += 1 logging.debug("Test Service Switch: FAIL") return except dbexceptions.DatabaseError, e: self.assertEqual( "Fatal Service Error: Disconnecting because the Update Stream " "service has been disabled", str(e) ) except Exception, e: logging.error("Exception: %s", str(e)) logging.error("Traceback: %s", traceback.print_exc()) self.fail("Update stream returned error '%s'" % str(e))
def test_health_check(self):
    """Walk the replica tablet through healthy -> unhealthy -> healthy -> lameduck.

    Steps performed, in order:
      1. Start tablet_62344 (initialized as master) and tablet_62044, then
         run InitShardMaster and wait for tablet_62044 to become 'replica'.
      2. Stop replication on tablet_62044 and check it turns 'spare', leaves
         the serving graph, and reports the replication error on its status
         page and health stream.
      3. Restart replication and check it is reported healthy again.
      4. Issue reads and poll the health stream until a QPS > 0.0 is seen.
      5. Kill both tablets and check tablet_62044 ended up as SPARE and is
         absent from the serving graph.
    """
    # one master, one replica that starts in spare
    # (for the replica, we let vttablet do the InitTablet)
    tablet_62344.init_tablet('master', 'test_keyspace', '0')

    for t in tablet_62344, tablet_62044:
        t.create_db('vt_test_keyspace')

    tablet_62344.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica')
    tablet_62044.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica',
                                lameduck_period='5s',
                                init_keyspace='test_keyspace',
                                init_shard='0')

    tablet_62344.wait_for_vttablet_state('SERVING')
    tablet_62044.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(tablet_62044, False)

    utils.run_vtctl(
        ['InitShardMaster', 'test_keyspace/0', tablet_62344.tablet_alias])

    # make sure the 'spare' slave goes to 'replica'
    utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'replica')
    self.check_healthz(tablet_62044, True)

    # make sure the master is still master
    ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
    self.assertEqual(ti['type'], topodata_pb2.MASTER,
                     'unexpected master type: %s' % ti['type'])

    # stop replication, make sure we go unhealthy.
    utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
    utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'spare')
    self.check_healthz(tablet_62044, False)

    # make sure the serving graph was updated
    # Poll until GetEndPoints fails with the client error type, which is
    # taken to mean the replica endpoint is gone; utils.wait_step is handed
    # the remaining timeout and returns the updated value.
    timeout = 10
    while True:
        try:
            utils.run_vtctl_json(
                ['GetEndPoints', 'test_nj', 'test_keyspace/0', 'replica'])
        except protocols_flavor().client_error_exception_type():
            logging.debug('Tablet is gone from serving graph, good')
            break
        timeout = utils.wait_step(
            'Stopped replication didn\'t trigger removal from serving graph',
            timeout)

    # make sure status web page is unhappy
    self.assertIn(
        '>unhealthy: replication_reporter: '
        'Replication is not running</span></div>',
        tablet_62044.get_status())

    # make sure the health stream is updated
    health = utils.run_vtctl_json(
        ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
    self.assertIn('replication_reporter: Replication is not running',
                  health['realtime_stats']['health_error'])
    # An unhealthy tablet's report is expected to carry no 'serving' key.
    self.assertNotIn('serving', health)

    # then restart replication, make sure we go back to healthy
    utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
    utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'replica')

    # make sure status web page is healthy
    self.assertIn('>healthy</span></div>', tablet_62044.get_status())

    # make sure the vars is updated
    v = utils.get_vars(tablet_62044.port)
    self.assertEqual(v['LastHealthMapCount'], 0)

    # now test VtTabletStreamHealth returns the right thing
    # '-count 2' is expected to yield exactly two lines, each a JSON document.
    stdout, _ = utils.run_vtctl(
        ['VtTabletStreamHealth', '-count', '2', tablet_62044.tablet_alias],
        trap_output=True, auto_log=True)
    lines = stdout.splitlines()
    self.assertEqual(len(lines), 2)
    for line in lines:
        logging.debug('Got health: %s', line)
        data = json.loads(line)
        self.assertIn('realtime_stats', data)
        self.assertIn('serving', data)
        self.assertTrue(data['serving'])
        self.assertNotIn('health_error', data['realtime_stats'])
        self.assertNotIn('tablet_externally_reparented_timestamp', data)
        self.assertEqual('test_keyspace', data['target']['keyspace'])
        self.assertEqual('0', data['target']['shard'])
        self.assertEqual(topodata_pb2.REPLICA, data['target']['tablet_type'])

    # Test that VtTabletStreamHealth reports a QPS >0.0.
    # Therefore, issue several reads first.
    # NOTE: This may be potentially flaky because we'll observe a QPS >0.0
    # exactly "once" for the duration of one sampling interval (5s) and
    # after that we'll see 0.0 QPS rates again. If this becomes actually
    # flaky, we need to read continuously in a separate thread.
    for _ in range(10):
        tablet_62044.execute('select 1 from dual')
    # This may take up to 5 seconds to become true because we sample the query
    # counts for the rates only every 5 seconds (see query_service_stats.go).
    timeout = 10
    while True:
        health = utils.run_vtctl_json([
            'VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias
        ])
        # A missing 'qps' key is treated the same as a rate of 0.0.
        if health['realtime_stats'].get('qps', 0.0) > 0.0:
            break
        timeout = utils.wait_step('QPS >0.0 seen', timeout)

    # kill the tablets
    tablet.kill_tablets([tablet_62344, tablet_62044])

    # the replica was in lameduck for 5 seconds, should have been enough
    # to reset its state to spare
    ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
    self.assertEqual(
        ti['type'], topodata_pb2.SPARE,
        "tablet didn't go to spare while in lameduck mode: %s" % str(ti))

    # Also the replica should be gone from the serving graph.
    utils.run_vtctl(
        ['GetEndPoints', 'test_nj', 'test_keyspace/0', 'replica'],
        expect_fail=True)
def _test_service_enabled(self):
    """Currently a no-op: returns immediately, see the comment below.

    Everything after the early ``return`` is unreachable and is retained
    only so the check can be re-enabled later. When active it would stream
    updates from the replica until a DML event shows up, then switch the
    tablet to 'spare' and expect the stream to end with a DatabaseError.
    """
    # it looks like update stream would be re-enabled automatically
    # because of vttablet health check
    return
    # ---- unreachable from here on (kept for reference) ----
    start_position = _get_repl_current_position()
    logging.debug('_test_service_enabled starting @ %s', start_position)
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    logging.debug('sleeping a bit for the replica action to complete')
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica', 30)
    # Background writer so the stream has events to deliver.
    thd = threading.Thread(target=self.perform_writes, name='write_thd',
                           args=(100,))
    thd.daemon = True
    thd.start()
    replica_conn = self._get_replica_stream_conn()

    try:
        for stream_event in replica_conn.stream_update(start_position):
            if stream_event.category == update_stream.StreamEvent.DML:
                logging.debug('Test Service Enabled: Pass')
                break
    except dbexceptions.DatabaseError as e:
        self.fail('Exception in getting stream from replica: %s\n Traceback %s' %
                  (str(e), traceback.format_exc()))
    thd.join(timeout=30)
    replica_conn.close()

    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Enabled':
        self.fail("Update stream service should be 'Enabled' but is '%s'" %
                  v['UpdateStreamState'])
    self.assertIn('SE_DML', v['UpdateStreamEvents'])
    self.assertIn('SE_POS', v['UpdateStreamEvents'])

    logging.debug('Testing enable -> disable switch starting @ %s',
                  start_position)
    replica_conn = self._get_replica_stream_conn()
    first = True
    txn_count = 0
    try:
        for stream_event in replica_conn.stream_update(start_position):
            if first:
                # Flip the tablet to 'spare' once the stream is established.
                utils.run_vtctl(
                    ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
                utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare', 30)
                first = False
            else:
                if stream_event.category == update_stream.StreamEvent.POS:
                    txn_count += 1
        # NOTE(review): the source layout was flattened; placing these three
        # statements after the loop (stream ended without an error) is the
        # inferred structure -- confirm against the original file.
        # FIXME(alainjobart) gasp, the test fails but we don't assert?
        logging.debug('Test Service Switch: FAIL')
        replica_conn.close()
        return
    except dbexceptions.DatabaseError as e:
        self.assertEqual(
            'Fatal Service Error: Disconnecting because the Update Stream '
            'service has been disabled',
            str(e))
    except Exception as e:
        logging.error('Exception: %s', str(e))
        logging.error('Traceback: %s', traceback.format_exc())
        self.fail("Update stream returned error '%s'" % str(e))
    logging.debug('Streamed %d transactions before exiting', txn_count)
    replica_conn.close()
def _test_service_enabled(self):
    """Check the update stream serves a replica and stops once it goes spare.

    Phase 1: switch the replica tablet to 'replica', start a background
    writer thread, and stream from the replica until a DML event is seen.
    Then verify the tablet vars report the stream as 'Enabled'.

    Phase 2: open a new stream, switch the tablet to 'spare' on the first
    event, and expect the stream to terminate with a DatabaseError carrying
    the "service has been disabled" message.

    Both stream connections are closed in ``finally`` blocks so they are
    released even when a phase fails (the original leaked the connection
    whenever ``self.fail`` or a failed assertion raised).
    """
    start_position = _get_repl_current_position()
    logging.debug('_test_service_enabled starting @ %s', start_position)
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    logging.debug('sleeping a bit for the replica action to complete')
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica', 30)

    # Background writer so the stream has events to deliver.
    thd = threading.Thread(target=self.perform_writes, name='write_thd',
                           args=(100,))
    thd.daemon = True
    thd.start()

    replica_conn = self._get_replica_stream_conn()
    try:
        for stream_event in replica_conn.stream_update(start_position):
            if stream_event.category == update_stream.StreamEvent.DML:
                logging.debug('Test Service Enabled: Pass')
                break
    except dbexceptions.DatabaseError as e:
        self.fail(
            'Exception in getting stream from replica: %s\n Traceback %s' %
            (str(e), traceback.format_exc()))
    else:
        # Same order as before: join the writer first, then close.
        thd.join(timeout=30)
    finally:
        replica_conn.close()

    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Enabled':
        self.fail("Update stream service should be 'Enabled' but is '%s'" %
                  v['UpdateStreamState'])
    self.assertIn('SE_DML', v['UpdateStreamEvents'])
    self.assertIn('SE_POS', v['UpdateStreamEvents'])

    logging.debug('Testing enable -> disable switch starting @ %s',
                  start_position)
    replica_conn = self._get_replica_stream_conn()
    first = True
    txn_count = 0
    try:
        for stream_event in replica_conn.stream_update(start_position):
            if first:
                # Flip the tablet to 'spare' once the stream is established.
                utils.run_vtctl([
                    'ChangeSlaveType', replica_tablet.tablet_alias, 'spare'
                ])
                utils.wait_for_tablet_type(
                    replica_tablet.tablet_alias, 'spare', 30)
                first = False
            elif stream_event.category == update_stream.StreamEvent.POS:
                txn_count += 1
        # Reaching this point means the stream ended without the expected
        # "service disabled" error.
        # FIXME(alainjobart) gasp, the test fails but we don't assert?
        logging.debug('Test Service Switch: FAIL')
        return
    except dbexceptions.DatabaseError as e:
        self.assertEqual(
            'Fatal Service Error: Disconnecting because the Update Stream '
            'service has been disabled',
            str(e))
        logging.debug('Streamed %d transactions before exiting', txn_count)
    except Exception as e:
        logging.error('Exception: %s', str(e))
        logging.error('Traceback: %s', traceback.format_exc())
        self.fail("Update stream returned error '%s'" % str(e))
    finally:
        replica_conn.close()
def launch(
        self, keyspace, shards=None, replica_count=1, rdonly_count=0,
        ddls=None):
    """Launch test environment.

    Creates the keyspace, then for every shard creates, initializes and
    starts one master, ``replica_count`` replicas and ``rdonly_count``
    rdonly tablets, elects the masters with InitShardMaster, waits for
    every tablet to be SERVING and finally applies the given DDLs.

    Args:
      keyspace: name of the keyspace to create.
      shards: list of shard names; empty/None (or a list starting with
        '0') means a single unsharded shard '0'.
      replica_count: replicas per shard, must be >= 1.
      rdonly_count: rdonly tablets per shard.
      ddls: optional iterable of SQL strings applied with ApplySchema;
        None (the default) applies nothing.

    Raises:
      Exception: if replica_count < 1.
    """
    if replica_count < 1:
        raise Exception('replica_count=%d < 1; tests now use semi-sync'
                        ' and must have at least one replica' % replica_count)
    self.tablets = []
    self.master_tablets = []
    utils.run_vtctl(['CreateKeyspace', keyspace])
    if not shards or shards[0] == '0':
        shards = ['0']

    # (tablet_type, per-type index) for every tablet of a shard, in the
    # order they are created, initialized and started.
    roles = [('master', None)]
    roles.extend(('replica', i) for i in xrange(replica_count))
    roles.extend(('rdonly', i) for i in xrange(rdonly_count))

    # Create tablets and start mysqld.
    procs = []
    for shard in shards:
        for tablet_type, index in roles:
            procs.append(self._new_tablet(keyspace, shard, tablet_type, index))
    utils.wait_procs(procs)

    # init tablets. The position within the shard is the tablet_index.
    for shard in shards:
        for tablet_index, (tablet_type, index) in enumerate(roles):
            self._init_tablet(
                keyspace, shard, tablet_type, index, tablet_index)

    utils.run_vtctl(['RebuildKeyspaceGraph', keyspace], auto_log=True)

    # Start tablets.
    for shard in shards:
        for tablet_type, index in roles:
            self._start_tablet(keyspace, shard, tablet_type, index)

    # self.tablets / self.master_tablets are presumably filled in by the
    # helpers called above -- they are only reset, never appended to, here.
    for t in self.master_tablets:
        t.wait_for_vttablet_state('SERVING')
    for t in self.tablets:
        if t not in self.master_tablets:
            t.wait_for_vttablet_state('NOT_SERVING')

    for t in self.master_tablets:
        utils.run_vtctl(['InitShardMaster', '-force', keyspace+'/'+t.shard,
                         t.tablet_alias], auto_log=True)
        t.tablet_type = 'master'

    for t in self.tablets:
        if t.tablet_type in ('replica', 'rdonly'):
            utils.wait_for_tablet_type(t.tablet_alias, t.tablet_type)

    for t in self.tablets:
        t.wait_for_vttablet_state('SERVING')

    utils.run_vtctl(['RebuildKeyspaceGraph', keyspace], auto_log=True)

    # ddls defaults to None: treat that as "no schema to apply" instead of
    # crashing with a TypeError when iterating.
    for ddl in ddls or []:
        fname = os.path.join(environment.tmproot, 'ddl.sql')
        with open(fname, 'w') as f:
            f.write(ddl)
        utils.run_vtctl(['ApplySchema', '-sql-file', fname, keyspace])
def setUpModule():
    """Bring up topology, MySQL, both vttablets and vtgate for this module.

    Records the master's replication position in the module-level
    ``master_start_position`` before the test tables are created. On any
    failure, ``tearDownModule()`` is run and the error is re-raised.
    """
    global master_start_position

    try:
        environment.topo_server().setup()

        # The MySQL instances are external to the test: launch both, then wait.
        mysql_procs = [
            tab.init_mysql() for tab in (master_tablet, replica_tablet)
        ]
        utils.wait_procs(mysql_procs)

        # With vtctld running, the vtctl insert commands are RPCs, not forks.
        utils.Vtctld().start()

        # Register a master and a replica in shard 0 of test_keyspace.
        logging.debug('Setting up tablets')
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
        initial_roles = ((master_tablet, 'master'), (replica_tablet, 'replica'))
        for position, (tab, role) in enumerate(initial_roles):
            tab.init_tablet(role, 'test_keyspace', '0', tablet_index=position)
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.validate_topology()

        for tab in (master_tablet, replica_tablet):
            for db_name in ('vt_test_keyspace', 'other_database'):
                tab.create_db(db_name)

        for tab in (master_tablet, replica_tablet):
            tab.start_vttablet(wait_for_state=None)
        master_tablet.wait_for_vttablet_state('SERVING')
        replica_tablet.wait_for_vttablet_state('NOT_SERVING')

        for tab in (master_tablet, replica_tablet):
            tab.reset_replication()
        init_master_cmd = [
            'InitShardMaster', 'test_keyspace/0', master_tablet.tablet_alias
        ]
        utils.run_vtctl(init_master_cmd, auto_log=True)

        utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')
        for tab in (master_tablet, replica_tablet):
            tab.wait_for_vttablet_state('SERVING')

        # Zero the counter so the tests don't assert on it.
        tablet.Tablet.tablets_running = 0

        master_start_position = _get_master_current_position()
        for create_stmt in (_create_vt_insert_test, _create_vt_a, _create_vt_b):
            master_tablet.mquery('vt_test_keyspace', create_stmt)

        for tab in (master_tablet, replica_tablet):
            utils.run_vtctl(['ReloadSchema', tab.tablet_alias])
        utils.run_vtctl(['RebuildVSchemaGraph'])

        utils.VtGate().start(tablets=[master_tablet, replica_tablet])
        for endpoint in ('test_keyspace.0.master', 'test_keyspace.0.replica'):
            utils.vtgate.wait_for_endpoints(endpoint, 1)

        # Wait until ReloadSchema has taken effect on master and slave.
        # No keyspace name is given: there is only one, so vschema uses it.
        remaining = 10
        while True:
            try:
                for tablet_type in ('master', 'replica'):
                    utils.vtgate.execute('select count(1) from vt_insert_test',
                                         tablet_type=tablet_type)
            except protocols_flavor().client_error_exception_type():
                logging.exception('query failed')
            else:
                break
            remaining = utils.wait_step('slave tablet having correct schema',
                                        remaining)
            # Re-run ReloadSchema on the slave in case the first attempt ran
            # before the table had replicated.
            utils.run_vtctl(['ReloadSchema', replica_tablet.tablet_alias])
    except:
        tearDownModule()
        raise
self.fail("Update stream service should be 'Enabled' but is '%s'" % v['UpdateStreamState']) self.assertTrue('DML' in v['UpdateStreamEvents']) self.assertTrue('POS' in v['UpdateStreamEvents']) logging.debug('Testing enable -> disable switch starting @ %s', start_position) replica_conn = self._get_replica_stream_conn() replica_conn.dial() disabled_err = False txn_count = 0 try: data = replica_conn.stream_start(start_position) utils.run_vtctl( ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare']) utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare', 30) while data: data = replica_conn.stream_next() if data is not None and data['Category'] == 'POS': txn_count += 1 logging.debug('Test Service Switch: FAIL') return except dbexceptions.DatabaseError, e: self.assertEqual( 'Fatal Service Error: Disconnecting because the Update Stream ' 'service has been disabled', str(e)) except Exception, e: logging.error('Exception: %s', str(e)) logging.error('Traceback: %s', traceback.print_exc()) self.fail("Update stream returned error '%s'" % str(e)) logging.debug('Streamed %d transactions before exiting', txn_count)
def test_repeated_init_shard_master(self):
    """Test that using InitShardMaster can go back and forth between 2 hosts.

    Mastership is forced onto tablet_62344, then tablet_62044, then back to
    tablet_62344; after each election both tablets must be healthy and
    report the expected type.
    """
    pair = (tablet_62344, tablet_62044)

    for tab in pair:
        tab.create_db('vt_test_keyspace')
        tab.start_vttablet(wait_for_state=None,
                           lameduck_period='5s',
                           init_tablet_type='replica',
                           init_keyspace='test_keyspace',
                           init_shard='0')

    # Nothing replicates yet, so neither tablet can be healthy.
    for tab in pair:
        tab.wait_for_vttablet_state('NOT_SERVING')
        self.check_healthz(tab, False)

    # Pick one master, then the other one, then come back to the original.
    for elected in (tablet_62344, tablet_62044, tablet_62344):
        utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                         elected.tablet_alias])

        # Health-check both tablets; both must be healthy.
        for tab in pair:
            utils.run_vtctl(['RunHealthCheck', tab.tablet_alias],
                            auto_log=True)
            self.check_healthz(tab, True)

        # The elected tablet must be 'master', its peer 'replica'.
        for tab in pair:
            expected_type = 'master' if tab is elected else 'replica'
            utils.wait_for_tablet_type(tab.tablet_alias, expected_type,
                                       timeout=0)

    # And done.
    tablet.kill_tablets([tablet_62344, tablet_62044])
if v['UpdateStreamState'] != 'Enabled': self.fail("Update stream service should be 'Enabled' but is '%s'" % v['UpdateStreamState']) self.assertTrue('DML' in v['UpdateStreamEvents']) self.assertTrue('POS' in v['UpdateStreamEvents']) logging.debug('Testing enable -> disable switch starting @ %s', start_position) replica_conn = self._get_replica_stream_conn() replica_conn.dial() disabled_err = False txn_count = 0 try: data = replica_conn.stream_start(start_position) utils.run_vtctl(['ChangeSlaveType', replica_tablet.tablet_alias, 'spare']) utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare', 30) while data: data = replica_conn.stream_next() if data is not None and data['Category'] == 'POS': txn_count += 1 logging.debug('Test Service Switch: FAIL') return except dbexceptions.DatabaseError, e: self.assertEqual( 'Fatal Service Error: Disconnecting because the Update Stream ' 'service has been disabled', str(e)) except Exception, e: logging.error('Exception: %s', str(e)) logging.error('Traceback: %s', traceback.print_exc()) self.fail("Update stream returned error '%s'" % str(e))
def test_health_check(self):
    """Drive the replica tablet through healthy and unhealthy states.

    The replica (tablet_62044) must become 'replica' after InitShardMaster,
    drop to 'spare' and out of the serving graph when replication stops,
    recover when replication restarts, and end up as SPARE after being
    killed with a 5s lameduck period.
    """
    master_tab = tablet_62344
    replica_tab = tablet_62044
    endpoints_query = ('GetEndPoints', 'test_nj', 'test_keyspace/0', 'replica')

    # One master plus one replica that starts as spare; for the replica,
    # vttablet itself performs the InitTablet.
    master_tab.init_tablet('master', 'test_keyspace', '0')
    for tab in (master_tab, replica_tab):
        tab.create_db('vt_test_keyspace')

    master_tab.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')
    replica_start_options = dict(
        wait_for_state=None,
        target_tablet_type='replica',
        lameduck_period='5s',
        init_keyspace='test_keyspace',
        init_shard='0',
    )
    replica_tab.start_vttablet(**replica_start_options)

    master_tab.wait_for_vttablet_state('SERVING')
    replica_tab.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(replica_tab, False)

    init_master_cmd = ['InitShardMaster', 'test_keyspace/0',
                       master_tab.tablet_alias]
    utils.run_vtctl(init_master_cmd)

    # The 'spare' slave has to turn into a 'replica'.
    utils.wait_for_tablet_type(replica_tab.tablet_alias, 'replica')
    self.check_healthz(replica_tab, True)

    # The master has to remain a master.
    master_info = utils.run_vtctl_json(['GetTablet', master_tab.tablet_alias])
    master_type = master_info['type']
    self.assertEqual(master_type, topodata_pb2.MASTER,
                     'unexpected master type: %s' % master_type)

    # Stopping replication has to make the replica unhealthy.
    utils.run_vtctl(['StopSlave', replica_tab.tablet_alias])
    utils.wait_for_tablet_type(replica_tab.tablet_alias, 'spare')
    self.check_healthz(replica_tab, False)

    # The serving graph has to lose the replica: poll until the lookup fails.
    remaining = 10
    while True:
        try:
            utils.run_vtctl_json(list(endpoints_query))
        except protocols_flavor().client_error_exception_type():
            logging.debug('Tablet is gone from serving graph, good')
            break
        remaining = utils.wait_step(
            "Stopped replication didn't trigger removal from serving graph",
            remaining)

    # The status web page has to show the failure.
    unhealthy_marker = ('>unhealthy: replication_reporter: '
                        'Replication is not running</span></div>')
    self.assertIn(unhealthy_marker, replica_tab.get_status())

    # The health stream has to show the failure too.
    unhealthy_report = utils.run_vtctl_json(
        ['VtTabletStreamHealth', '-count', '1', replica_tab.tablet_alias])
    self.assertIn('replication_reporter: Replication is not running',
                  unhealthy_report['realtime_stats']['health_error'])
    self.assertNotIn('serving', unhealthy_report)

    # Restarting replication has to bring the replica back to healthy.
    utils.run_vtctl(['StartSlave', replica_tab.tablet_alias])
    utils.wait_for_tablet_type(replica_tab.tablet_alias, 'replica')

    # The status web page has to be healthy again.
    self.assertIn('>healthy</span></div>', replica_tab.get_status())

    # The exported vars have to be updated as well.
    replica_vars = utils.get_vars(replica_tab.port)
    self.assertEqual(replica_vars['LastHealthMapCount'], 0)

    # VtTabletStreamHealth has to return two healthy records.
    stream_output, _ = utils.run_vtctl(
        ['VtTabletStreamHealth', '-count', '2', replica_tab.tablet_alias],
        trap_output=True,
        auto_log=True)
    health_lines = stream_output.splitlines()
    self.assertEqual(len(health_lines), 2)
    for health_line in health_lines:
        logging.debug('Got health: %s', health_line)
        record = json.loads(health_line)
        self.assertIn('realtime_stats', record)
        self.assertIn('serving', record)
        self.assertTrue(record['serving'])
        self.assertNotIn('health_error', record['realtime_stats'])
        self.assertNotIn('tablet_externally_reparented_timestamp', record)
        target = record['target']
        self.assertEqual('test_keyspace', target['keyspace'])
        self.assertEqual('0', target['shard'])
        self.assertEqual(topodata_pb2.REPLICA, target['tablet_type'])

    # Kill the tablets.
    tablet.kill_tablets([master_tab, replica_tab])

    # Five seconds of lameduck should have been enough for the replica to
    # reset its state to spare.
    replica_info = utils.run_vtctl_json(
        ['GetTablet', replica_tab.tablet_alias])
    self.assertEqual(
        replica_info['type'], topodata_pb2.SPARE,
        "tablet didn't go to spare while in lameduck mode: %s"
        % str(replica_info))

    # The replica also has to be gone from the serving graph.
    utils.run_vtctl(list(endpoints_query), expect_fail=True)
def test_service_switch(self):
    """Run the disabled and enabled service checks, then restore 'replica'."""
    self._test_service_disabled()
    self._test_service_enabled()

    # The checks above leave the service disabled, so switch the tablet
    # back to 'replica' to enable it again.
    alias = replica_tablet.tablet_alias
    restore_cmd = ["ChangeSlaveType", alias, "replica"]
    utils.run_vtctl(restore_cmd)
    utils.wait_for_tablet_type(alias, "replica", 30)
def _init_keyspaces_and_tablets(self):
    """Create the source/destination keyspaces and bring up their tablets.

    Each keyspace gets one shard '0' with a master candidate, a replica and
    two rdonly tablets. All of them start as replica/rdonly; the masters
    are then elected with InitShardMaster and every tablet is awaited until
    it is SERVING.
    """
    utils.run_vtctl(['CreateKeyspace', 'source_keyspace'])
    utils.run_vtctl([
        'CreateKeyspace',
        '--served_from',
        'master:source_keyspace,replica:source_keyspace,rdonly:'
        'source_keyspace',
        'destination_keyspace',
    ])

    # Keyspace members, listed in tablet_index order.
    keyspace_layout = (
        ('source_keyspace',
         (source_master, source_replica, source_rdonly1, source_rdonly2)),
        ('destination_keyspace',
         (destination_master, destination_replica,
          destination_rdonly1, destination_rdonly2)),
    )
    initial_types = ('replica', 'replica', 'rdonly', 'rdonly')
    for keyspace_name, members in keyspace_layout:
        for position, (tab, initial_type) in enumerate(
                zip(members, initial_types)):
            tab.init_tablet(initial_type, keyspace=keyspace_name, shard='0',
                            tablet_index=position)

    for keyspace_name, _ in keyspace_layout:
        utils.run_vtctl(['RebuildKeyspaceGraph', keyspace_name],
                        auto_log=True)

    self._create_source_schema()

    # Start the master candidates and replicas first, the rdonly tablets after.
    start_order = (
        source_master, source_replica,
        destination_master, destination_replica,
        source_rdonly1, source_rdonly2,
        destination_rdonly1, destination_rdonly2,
    )
    for tab in start_order:
        tab.start_vttablet(wait_for_state=None)

    # wait for the tablets
    master_tablets = [source_master, destination_master]
    replica_tablets = [
        source_replica, source_rdonly1, source_rdonly2,
        destination_replica, destination_rdonly1, destination_rdonly2,
    ]
    every_tablet = master_tablets + replica_tablets
    for tab in every_tablet:
        tab.wait_for_vttablet_state('NOT_SERVING')

    # check SrvKeyspace
    expected_srv_keyspace = ('ServedFrom(master): source_keyspace\n'
                             'ServedFrom(rdonly): source_keyspace\n'
                             'ServedFrom(replica): source_keyspace\n')
    self._check_srv_keyspace(expected_srv_keyspace)

    # Reparent so the tablets work (health check is used, so fix their types).
    elections = (
        ('source_keyspace', source_master),
        ('destination_keyspace', destination_master),
    )
    for keyspace_name, elected in elections:
        utils.run_vtctl(
            ['InitShardMaster', '-force', '%s/0' % keyspace_name,
             elected.tablet_alias],
            auto_log=True)
        elected.tablet_type = 'master'

    expected_types = (
        (source_replica, 'replica'),
        (destination_replica, 'replica'),
        (source_rdonly1, 'rdonly'),
        (source_rdonly2, 'rdonly'),
        (destination_rdonly1, 'rdonly'),
        (destination_rdonly2, 'rdonly'),
    )
    for tab, wanted_type in expected_types:
        utils.wait_for_tablet_type(tab.tablet_alias, wanted_type)

    for tab in every_tablet:
        tab.wait_for_vttablet_state('SERVING')
def test_resharding(self):
  """Split test_keyspace shard '0' into '-80' and '80-' and migrate serving.

  The steps run strictly in this order:
    1. Bring up shard '0' (master, replica, rdonly), create the schema,
       insert the startup values and start vtgate.
    2. Add and backfill the sharding key, then declare the keyspace sharded
       with SetKeyspaceShardingInfo.
    3. Bring up the '-80' and '80-' shards and restart vtgate with all nine
       tablets.
    4. Copy the schema, run vtworker SplitClone, exercise filtered
       replication with bulk inserts, and run vtworker SplitDiff.
    5. MigrateServedTypes for rdonly, replica (forward, reverse, forward)
       and master, checking SrvKeyspace after every step.
    6. Delete the original tablets and shard, then kill the split tablets.
  """
  # create the keyspace with just one shard
  shard_master.init_tablet('master', keyspace='test_keyspace', shard='0',
                           tablet_index=0)
  shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0',
                            tablet_index=1)
  shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0',
                            tablet_index=2)

  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.create_db('vt_test_keyspace')

  # start all three tablets without blocking, then wait on each state below
  shard_master.start_vttablet(wait_for_state=None)
  shard_replica.start_vttablet(wait_for_state=None)
  shard_rdonly1.start_vttablet(wait_for_state=None)

  shard_master.wait_for_vttablet_state('SERVING')
  for t in [shard_replica, shard_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/0',
      shard_master.tablet_alias
  ], auto_log=True)

  utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica')
  utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly')
  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.wait_for_vttablet_state('SERVING')

  # create the tables and add startup values
  self._create_schema()
  self._insert_startup_values()

  # reload schema on all tablets so we can query them
  for t in [shard_master, shard_replica, shard_rdonly1]:
    utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

  # must start vtgate after tablets are up, or else wait until 1min refresh
  # we want cache_ttl at zero so we re-read the topology for every test query.
  utils.VtGate().start(
      cache_ttl='0', tablets=[shard_master, shard_replica, shard_rdonly1])
  utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteShards,
  # as we're not sharded yet.
  # we have 3 values in the database, asking for 4 splits will get us
  # a single query.
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 1)
  self.assertEqual(s[0]['shard_part']['shards'][0], '0')

  # change the schema, backfill keyspace_id, and change schema again
  self._add_sharding_key_to_schema()
  self._backfill_keyspace_id(shard_master)
  self._mark_sharding_key_not_null()

  # now we can be a sharded keyspace (and propagate to SrvKeyspace)
  utils.run_vtctl([
      'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id',
      keyspace_id_type
  ])
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # run a health check on source replica so it responds to discovery
  utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias])

  # create the split shards
  shard_0_master.init_tablet('master', keyspace='test_keyspace', shard='-80',
                             tablet_index=0)
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='-80', tablet_index=1)
  shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace',
                              shard='-80', tablet_index=2)
  shard_1_master.init_tablet('master', keyspace='test_keyspace', shard='80-',
                             tablet_index=0)
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='80-', tablet_index=1)
  shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace',
                              shard='80-', tablet_index=2)

  # masters and replicas are created and started first, rdonly tablets after
  for t in [
      shard_0_master, shard_0_replica, shard_1_master, shard_1_replica
  ]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_1_master]:
    t.wait_for_vttablet_state('SERVING')
  for t in [
      shard_0_replica, shard_0_rdonly1, shard_1_replica, shard_1_rdonly1
  ]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # elect a master in each of the two new shards
  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/-80',
      shard_0_master.tablet_alias
  ], auto_log=True)
  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/80-',
      shard_1_master.tablet_alias
  ], auto_log=True)

  for t in [shard_0_replica, shard_1_replica]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    utils.wait_for_tablet_type(t.tablet_alias, 'rdonly')

  sharded_tablets = [
      shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
      shard_1_replica, shard_1_rdonly1
  ]
  for t in sharded_tablets:
    t.wait_for_vttablet_state('SERVING')

  # must restart vtgate after tablets are up, or else wait until 1min refresh
  # we want cache_ttl at zero so we re-read the topology for every test query.
  utils.vtgate.kill()
  utils.VtGate().start(cache_ttl='0', tablets=[
      shard_master, shard_replica, shard_rdonly1, shard_0_master,
      shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica,
      shard_1_rdonly1
  ])
  utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteKeyRanges now,
  # as we are sharded (with just one shard).
  # again, we have 3 values in the database, asking for 4 splits will get us
  # a single query.
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 1)
  self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
  # There must be one empty KeyRange which represents the full keyspace.
  self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
  self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {})

  # the original shard still serves every tablet type at this point
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
    utils.run_vtctl([
        'CopySchemaShard', '--exclude_tables', 'unrelated',
        shard_rdonly1.tablet_alias, keyspace_shard
    ], auto_log=True)
  utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias])

  utils.run_vtworker([
      '--cell', 'test_nj', '--command_display_interval', '10ms',
      'SplitClone', '--exclude_tables', 'unrelated',
      '--source_reader_count', '10', '--min_table_size_for_split', '1',
      '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'
  ], auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  logging.debug('Waiting for binlog players to start on new masters...')
  self.check_destination_master(shard_0_master, ['test_keyspace/0'])
  self.check_destination_master(shard_1_master, ['test_keyspace/0'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  # only run the second, stricter check when the first one did not report 100
  if v != 100:
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data
  logging.debug('Running vtworker SplitDiff for -80')
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  utils.run_vtworker([
      '-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1',
      'test_keyspace/-80'
  ], auto_log=True)

  logging.debug('Running vtworker SplitDiff for 80-')
  utils.run_vtworker([
      '-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1',
      'test_keyspace/80-'
  ], auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_0_master, 2000, 2000)
  self.check_running_binlog_player(shard_1_master, 6000, 2000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  # make sure rdonly tablets are back to serving before hitting vtgate.
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    t.wait_for_vttablet_state('SERVING')
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteKeyRanges
  # on both destination shards now.
  # we ask for 2 splits to only have one per shard
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 2)
  self.assertEqual(len(s), 2)
  self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
  self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace')
  self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
  self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1)

  # then serve replica from the split shards
  source_tablet = shard_replica
  destination_tablets = [shard_0_replica, shard_1_replica]

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(
      ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'],
      auto_log=True)
  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, source_tablet, True, False)
  utils.check_tablet_query_services(self, destination_tablets, False, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, source_tablet, False, True)
  utils.check_tablet_query_services(self, destination_tablets, True, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_0_master)
  self.check_no_binlog_player(shard_1_master)

  # make sure we can't delete a shard with tablets
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

  # remove the original tablets in the original shard
  tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
  for t in [shard_replica, shard_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  # the master tablet is deleted last and needs the -allow_master flag
  utils.run_vtctl(
      ['DeleteTablet', '-allow_master', shard_master.tablet_alias],
      auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([
      shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
      shard_1_replica, shard_1_rdonly1
  ])
self.assertIn('DML', v['UpdateStreamEvents']) self.assertIn('POS', v['UpdateStreamEvents']) logging.debug('Testing enable -> disable switch starting @ %s', start_position) replica_conn = self._get_replica_stream_conn() first = True txn_count = 0 try: for stream_event in replica_conn.stream_update(start_position): if first: utils.run_vtctl([ 'ChangeSlaveType', replica_tablet.tablet_alias, 'spare' ]) utils.wait_for_tablet_type( replica_tablet.tablet_alias, tablet.Tablet.tablet_type_value['SPARE'], 30) first = False else: if stream_event.category == update_stream.StreamEvent.POS: txn_count += 1 logging.debug('Test Service Switch: FAIL') return except dbexceptions.DatabaseError, e: self.assertEqual( 'Fatal Service Error: Disconnecting because the Update Stream ' 'service has been disabled', str(e)) except Exception, e: logging.error('Exception: %s', str(e)) logging.error('Traceback: %s', traceback.print_exc()) self.fail("Update stream returned error '%s'" % str(e))
def test_health_check_worker_state_does_not_shutdown_query_service(self):
  """Verify a 'worker' tablet keeps its query service while unhealthy.

  Similar to test_health_check, except that the second tablet is an
  'rdonly' (not a 'replica') and is switched to 'worker'; the query
  service is expected to stay up through that change.
  """
  master, rdonly = tablet_62344, tablet_62044

  def trigger_health_check():
    # Run the health check right away rather than waiting for the next
    # interval.
    utils.run_vtctl(['RunHealthCheck', rdonly.tablet_alias, 'rdonly'])

  def change_rdonly_type(new_type):
    utils.run_vtctl(['ChangeSlaveType', rdonly.tablet_alias, new_type])

  # Set up the master and rdonly tablets. Only the master is registered
  # through init_tablet; the rdonly gets its keyspace/shard from the
  # init_keyspace/init_shard arguments passed to start_vttablet.
  master.init_tablet('master', 'test_keyspace', '0')
  for member in (master, rdonly):
    member.create_db('vt_test_keyspace')

  master.start_vttablet(wait_for_state=None, target_tablet_type='replica')
  rdonly.start_vttablet(wait_for_state=None,
                        target_tablet_type='rdonly',
                        init_keyspace='test_keyspace',
                        init_shard='0')

  master.wait_for_vttablet_state('SERVING')
  rdonly.wait_for_vttablet_state('NOT_SERVING')
  self.check_healthz(rdonly, False)

  # Enable replication.
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/0', master.tablet_alias])

  trigger_health_check()
  utils.wait_for_tablet_type(rdonly.tablet_alias, 'rdonly')
  self.check_healthz(rdonly, True)
  rdonly.wait_for_vttablet_state('SERVING')

  # Switch rdonly -> worker and stop replication (similar to what the
  # SplitClone vtworker command does). The tablet turns unhealthy while
  # the query service keeps running.
  change_rdonly_type('worker')
  utils.run_vtctl(['StopSlave', rdonly.tablet_alias])
  trigger_health_check()
  utils.wait_for_tablet_type(rdonly.tablet_alias, 'worker')
  self.check_healthz(rdonly, False)

  # The status page must report that replication is not running.
  expected_status = ('>unhealthy: replication_reporter: '
                     'Replication is not running</span></div>')
  self.assertIn(expected_status, rdonly.get_status())

  # Query service is still running.
  rdonly.wait_for_vttablet_state('SERVING')

  # Restart replication; the tablet becomes healthy again.
  change_rdonly_type('spare')
  utils.wait_for_tablet_type(rdonly.tablet_alias, 'spare')
  utils.run_vtctl(['StartSlave', rdonly.tablet_alias])
  trigger_health_check()
  utils.wait_for_tablet_type(rdonly.tablet_alias, 'rdonly')
  self.check_healthz(rdonly, True)
  rdonly.wait_for_vttablet_state('SERVING')

  # kill the tablets
  tablet.kill_tablets([master, rdonly])
def test_resharding(self):
  """Split test_keyspace shard '0' into '-80' and '80-' and migrate serving.

  Every vttablet is started with binlog_use_v3_resharding_mode=False and
  every vtworker with --use_v3_resharding_mode=false. The sharding column
  is named 'custom_ksid_col'.

  The steps run strictly in this order:
    1. Bring up shard '0'; the first InitShardMaster is expected to fail
       because the replica has not been started yet.
    2. Create the schema and startup values, start vtgate, add and
       backfill the sharding key, declare the keyspace sharded.
    3. Bring up the '-80' and '80-' shards and restart vtgate.
    4. Copy the schema and run SplitClone twice through a background
       vtworker (online only, then a full run after the destination data
       has been altered), checking the reconciliation counters.
    5. Check filtered replication, then run MultiSplitDiff or SplitDiff.
    6. MigrateServedTypes for rdonly, replica (forward, reverse, forward)
       and master, checking SrvKeyspace after every step.
    7. Delete the original tablets and shard, then kill the split tablets.
  """
  # create the keyspace with just one shard
  # (every tablet, including the future master, is registered as a
  # non-master type; InitShardMaster promotes shard_master below)
  shard_master.init_tablet('replica', keyspace='test_keyspace', shard='0',
                           tablet_index=0)
  shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0',
                            tablet_index=1)
  shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0',
                            tablet_index=2)

  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.create_db('vt_test_keyspace')

  # replica is not started, InitShardMaster should timeout
  shard_master.start_vttablet(wait_for_state=None,
                              binlog_use_v3_resharding_mode=False)
  shard_rdonly1.start_vttablet(wait_for_state=None,
                               binlog_use_v3_resharding_mode=False)

  for t in [shard_master, shard_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work - expect fail
  # because replica tablet is not up
  _, stderr = utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/0',
      shard_master.tablet_alias
  ], auto_log=True, expect_fail=True)

  self.assertIn('tablet test_nj-0000062345 ResetReplication failed', stderr)

  # start replica
  shard_replica.start_vttablet(wait_for_state=None,
                               binlog_use_v3_resharding_mode=False)

  shard_replica.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/0',
      shard_master.tablet_alias
  ], auto_log=True)

  utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica')
  utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly')
  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.wait_for_vttablet_state('SERVING')

  # create the tables and add startup values
  self._create_schema()
  self._insert_startup_values()

  # reload schema on all tablets so we can query them
  for t in [shard_master, shard_replica, shard_rdonly1]:
    utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

  # We must start vtgate after tablets are up, or else wait until 1min refresh
  # (that is the tablet_refresh_interval parameter for discovery gateway)
  # we want cache_ttl at zero so we re-read the topology for every test query.
  utils.VtGate().start(
      cache_ttl='0', tablets=[shard_master, shard_replica, shard_rdonly1])
  utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteShards,
  # as we're not sharded yet.
  # we have 3 values in the database, asking for 4 splits will get us
  # a single query.
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 1)
  self.assertEqual(s[0]['shard_part']['shards'][0], '0')

  # change the schema, backfill keyspace_id, and change schema again
  self._add_sharding_key_to_schema()
  self._backfill_keyspace_id(shard_master)
  self._mark_sharding_key_not_null()

  # now we can be a sharded keyspace (and propagate to SrvKeyspace)
  utils.run_vtctl([
      'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col',
      base_sharding.keyspace_id_type
  ])
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # run a health check on source replica so it responds to discovery
  utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias])

  # create the split shards
  shard_0_master.init_tablet('replica', keyspace='test_keyspace',
                             shard='-80', tablet_index=0)
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='-80', tablet_index=1)
  shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace',
                              shard='-80', tablet_index=2)
  shard_1_master.init_tablet('replica', keyspace='test_keyspace',
                             shard='80-', tablet_index=0)
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='80-', tablet_index=1)
  shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace',
                              shard='80-', tablet_index=2)

  for t in [
      shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
      shard_1_replica, shard_1_rdonly1
  ]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     binlog_use_v3_resharding_mode=False)

  for t in [
      shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
      shard_1_replica, shard_1_rdonly1
  ]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # elect a master in each of the two new shards
  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/-80',
      shard_0_master.tablet_alias
  ], auto_log=True)
  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/80-',
      shard_1_master.tablet_alias
  ], auto_log=True)

  for t in [shard_0_replica, shard_1_replica]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    utils.wait_for_tablet_type(t.tablet_alias, 'rdonly')

  sharded_tablets = [
      shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
      shard_1_replica, shard_1_rdonly1
  ]
  for t in sharded_tablets:
    t.wait_for_vttablet_state('SERVING')

  # must restart vtgate after tablets are up, or else wait until 1min refresh
  # we want cache_ttl at zero so we re-read the topology for every test query.
  utils.vtgate.kill()

  # NOTE(review): the module-level handle is cleared before a new VtGate is
  # started; presumably VtGate() re-registers itself as utils.vtgate - confirm.
  utils.vtgate = None
  utils.VtGate().start(cache_ttl='0', tablets=[
      shard_master, shard_replica, shard_rdonly1, shard_0_master,
      shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica,
      shard_1_rdonly1
  ])
  var = None

  # Wait for the endpoints, either local or remote.
  utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1, var=var)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1, var=var)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1, var=var)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1, var=var)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1, var=var)
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1, var=var)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1, var=var)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1, var=var)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1, var=var)

  # check the Map Reduce API works correctly, should use ExecuteKeyRanges now,
  # as we are sharded (with just one shard).
  # again, we have 3 values in the database, asking for 4 splits will get us
  # a single query.
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 1)
  self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
  # There must be one empty KeyRange which represents the full keyspace.
  self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
  self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {})

  # the original shard still serves every tablet type at this point
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -\n'
      'Partitions(rdonly): -\n'
      'Partitions(replica): -\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
    utils.run_vtctl([
        'CopySchemaShard', '--exclude_tables', 'unrelated',
        shard_rdonly1.tablet_alias, keyspace_shard
    ], auto_log=True)
  utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias])

  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      [
          '--cell', 'test_nj', '--command_display_interval', '10ms',
          '--use_v3_resharding_mode=false'
      ],
      auto_log=True)

  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg([
      'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
      '--chunk_count', '10', '--min_rows_per_chunk', '1',
      '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'
  ], worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      3, 0, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                   worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 1 (provokes an insert).
  shard_0_master.mquery('vt_test_keyspace',
                        'delete from resharding1 where id=1', write=True)
  # Delete row 2 (provokes an insert).
  shard_1_master.mquery('vt_test_keyspace',
                        'delete from resharding1 where id=2', write=True)
  # Update row 3 (provokes an update).
  shard_1_master.mquery(
      'vt_test_keyspace',
      "update resharding1 set msg='msg-not-3' where id=3", write=True)
  # Insert row 4 (provokes a delete).
  self._insert_value(shard_1_master, 'resharding1', 4, 'msg4',
                     0xD000000000000000)

  # Second clone: no --offline=false this time, so both the 'Online' and
  # the 'Offline' counters are verified afterwards.
  workerclient_proc = utils.run_vtworker_client_bg([
      'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count', '10',
      '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1',
      'test_keyspace/0'
  ], worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      2, 1, 1, 0)
  self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                      0, 0, 0, 3)
  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  logging.debug('Waiting for binlog players to start on new masters...')
  self.check_destination_master(shard_0_master, ['test_keyspace/0'])
  self.check_destination_master(shard_1_master, ['test_keyspace/0'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  # only run the second, stricter check when the first one did not report 100
  if v != 100:
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # either one MultiSplitDiff on the source shard, or one SplitDiff per
  # destination shard
  if base_sharding.use_multi_split_diff:
    logging.debug('Running vtworker MultiSplitDiff for 0')
    utils.run_vtworker([
        '-cell', 'test_nj', '--use_v3_resharding_mode=false',
        'MultiSplitDiff', '--min_healthy_rdonly_tablets', '1',
        'test_keyspace/0'
    ], auto_log=True)
  else:
    logging.debug('Running vtworker SplitDiff for -80')
    utils.run_vtworker([
        '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff',
        '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'
    ], auto_log=True)
    logging.debug('Running vtworker SplitDiff for 80-')
    utils.run_vtworker([
        '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff',
        '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-'
    ], auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_0_master, 2000, 2000)
  self.check_running_binlog_player(shard_1_master, 6000, 2000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # make sure rdonly tablets are back to serving before hitting vtgate.
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    t.wait_for_vttablet_state('SERVING')
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteKeyRanges
  # on both destination shards now.
  # we ask for 2 splits to only have one per shard
  sql = 'select id, msg from resharding1'
  timeout = 10.0
  # retry split_query until it stops raising; utils.wait_step returns the
  # updated timeout on each failed attempt
  while True:
    try:
      s = utils.vtgate.split_query(sql, 'test_keyspace', 2)
      break
    except Exception:  # pylint: disable=broad-except
      timeout = utils.wait_step(
          'vtgate executes split_query properly', timeout)
  self.assertEqual(len(s), 2)
  self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
  self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace')
  self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
  self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1)

  # then serve replica from the split shards
  source_tablet = shard_replica
  destination_tablets = [shard_0_replica, shard_1_replica]

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # move replica back and forth
  utils.run_vtctl(
      ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'],
      auto_log=True)
  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, source_tablet, True, False)
  utils.check_tablet_query_services(self, destination_tablets, False, True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, source_tablet, False, True)
  utils.check_tablet_query_services(self, destination_tablets, True, False)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_0_master)
  self.check_no_binlog_player(shard_1_master)

  # make sure we can't delete a shard with tablets
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

  # remove the original tablets in the original shard
  tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
  for t in [shard_replica, shard_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  # the master tablet is deleted last and needs the -allow_master flag
  utils.run_vtctl(
      ['DeleteTablet', '-allow_master', shard_master.tablet_alias],
      auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([
      shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
      shard_1_replica, shard_1_rdonly1
  ])
def setUpModule():
  """Builds the single-shard master/replica environment for this module.

  In order: sets up the topology server, launches both MySQL instances,
  starts vtctld, registers test_keyspace/0 with its two tablets, starts the
  vttablets, runs InitShardMaster, runs the table-creation statements on the
  master, starts vtgate, and finally polls until a count query succeeds on
  both the master and the replica.

  The value returned by _get_master_current_position() is stored in the
  module-level master_start_position before any table is created.

  Raises:
    Whatever any step raises; tearDownModule() is called first so that a
    half-built environment is cleaned up.
  """
  global master_start_position

  try:
    environment.topo_server().setup()
    tablet_pair = (master_tablet, replica_tablet)

    # The mysql instances are external to the test: launch both, then wait.
    utils.wait_procs([member.init_mysql() for member in tablet_pair])

    # With a vtctld running, the vtctl commands below are RPCs, not forks.
    utils.Vtctld().start()

    # Register the keyspace and its two tablets.
    logging.debug('Setting up tablets')
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
    for index, role in enumerate(('master', 'replica')):
      tablet_pair[index].init_tablet(
          role, 'test_keyspace', '0', tablet_index=index)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
    utils.validate_topology()

    for member in tablet_pair:
      for db_name in ('vt_test_keyspace', 'other_database'):
        member.create_db(db_name)

    # Start both vttablets without blocking, then wait for each one's
    # expected pre-election state.
    for member in tablet_pair:
      member.start_vttablet(wait_for_state=None)
    master_tablet.wait_for_vttablet_state('SERVING')
    replica_tablet.wait_for_vttablet_state('NOT_SERVING')

    for member in tablet_pair:
      member.reset_replication()
    utils.run_vtctl(
        ['InitShardMaster', 'test_keyspace/0', master_tablet.tablet_alias],
        auto_log=True)

    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')
    for member in tablet_pair:
      member.wait_for_vttablet_state('SERVING')

    # Reset the counter so tests don't assert.
    tablet.Tablet.tablets_running = 0

    master_start_position = _get_master_current_position()
    for statement in (_create_vt_insert_test, _create_vt_a, _create_vt_b):
      master_tablet.mquery('vt_test_keyspace', statement)

    for member in tablet_pair:
      utils.run_vtctl(['ReloadSchema', member.tablet_alias])

    utils.VtGate().start(tablets=list(tablet_pair))
    for tablet_type in ('master', 'replica'):
      utils.vtgate.wait_for_endpoints('test_keyspace.0.%s' % tablet_type, 1)

    # Wait for the master and slave tablets' ReloadSchema to have worked:
    # keep querying both through vtgate until neither query fails.
    remaining = 10
    while True:
      try:
        for tablet_type in ('master', 'replica'):
          utils.vtgate.execute('select count(1) from vt_insert_test',
                               tablet_type=tablet_type)
      except protocols_flavor().client_error_exception_type():
        logging.exception('query failed')
        # NOTE(review): wait_step presumably sleeps and gives up once the
        # budget is exhausted -- confirm against utils.
        remaining = utils.wait_step('slave tablet having correct schema',
                                    remaining)
        # Re-run ReloadSchema on the slave in case the first one ran before
        # the table had been replicated.
        utils.run_vtctl(['ReloadSchema', replica_tablet.tablet_alias])
      else:
        break

  except:
    # Deliberately catches everything: tear down, then re-raise unchanged.
    tearDownModule()
    raise
def test_custom_end_to_end(self):
  """Exercises a custom-sharded keyspace from creation to SplitQuery.

  The steps run in this order: bring up shard '0' by itself and apply a
  schema to it, bring up shard '1' and run CopySchemaShard for it, rebuild
  the keyspace graph, start vtgate, write and read rows on both shards, apply
  a second schema and reload it on every tablet in all_tablets, write and
  read rows again, and finally execute every SplitQuery split and compare the
  aggregated rows with the expected id -> 'row <id>' mapping.
  """
  shard_0_tablets = (shard_0_master, shard_0_replica, shard_0_rdonly)
  shard_1_tablets = (shard_1_master, shard_1_replica, shard_1_rdonly)

  def launch_shard(shard, members):
    # Registers and starts the (master, replica, rdonly) triple of one shard,
    # then waits until each vttablet reports NOT_SERVING.
    initial_types = ('replica', 'replica', 'rdonly')
    for index, member in enumerate(members):
      member.init_tablet(initial_types[index], keyspace='test_keyspace',
                         shard=shard, tablet_index=index)
    for member in members:
      member.create_db('vt_test_keyspace')
      member.start_vttablet(wait_for_state=None)
    for member in members:
      member.wait_for_vttablet_state('NOT_SERVING')

  def elect_master(shard, members):
    # Runs InitShardMaster on the first member, then waits for the other two
    # to reach their tablet types and for all three to be SERVING.
    master, replica, rdonly = members
    utils.run_vtctl(
        ['InitShardMaster', '-force', 'test_keyspace/' + shard,
         master.tablet_alias],
        auto_log=True)
    utils.wait_for_tablet_type(replica.tablet_alias, 'replica')
    utils.wait_for_tablet_type(rdonly.tablet_alias, 'rdonly')
    for member in members:
      member.wait_for_vttablet_state('SERVING')

  def check_served_types(shard):
    # The shard record must list exactly three served types.
    shard_record = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard])
    self.assertEqual(len(shard_record['served_types']), 3)

  def apply_schema_and_reload(statement, members):
    # Applies the DDL to the keyspace, then reloads the schema on each member
    # so the QueryService knows about the tables.
    utils.run_vtctl(['ApplySchema', '-sql=' + statement, 'test_keyspace'],
                    auto_log=True)
    for member in members:
      utils.run_vtctl(['ReloadSchema', member.tablet_alias], auto_log=True)

  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # Start the first shard only for now.
  launch_shard('0', shard_0_tablets)
  elect_master('0', shard_0_tablets)
  self._check_shards_count_in_srv_keyspace(1)
  check_served_types('0')

  # Create a table on shard 0.
  apply_schema_and_reload(
      '''create table data( id bigint auto_increment, name varchar(64), primary key (id) ) Engine=InnoDB''',
      shard_0_tablets)

  # Create shard 1. Its served types are checked before the master election,
  # unlike shard 0 where the check came after it.
  launch_shard('1', shard_1_tablets)
  check_served_types('1')
  elect_master('1', shard_1_tablets)
  utils.run_vtctl(
      ['CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/1'],
      auto_log=True)

  # SrvKeyspace has to be rebuilt to account for the new shard.
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  self._check_shards_count_in_srv_keyspace(2)

  # vtgate must start after the tablets are up, or else we would have to wait
  # for its 1min refresh.
  utils.VtGate().start(tablets=list(shard_0_tablets + shard_1_tablets))
  for shard in ('0', '1'):
    for tablet_type in ('master', 'replica', 'rdonly'):
      utils.vtgate.wait_for_endpoints(
          'test_keyspace.%s.%s' % (shard, tablet_type), 1)

  # Insert and check data, one shard at a time.
  for shard, first_id in (('0', 100), ('1', 200)):
    self._insert_data(shard, first_id, 10)
    self._check_data(shard, first_id, 10)

  # Create a second table on all shards.
  apply_schema_and_reload(
      '''create table data2( id bigint auto_increment, name varchar(64), primary key (id) ) Engine=InnoDB''',
      all_tablets)

  # Insert into both shards first, then read back from both.
  second_table_ranges = (('0', 300), ('1', 400))
  for shard, first_id in second_table_ranges:
    self._insert_data(shard, first_id, 10, table='data2')
  for shard, first_id in second_table_ranges:
    self._check_data(shard, first_id, 10, table='data2')

  # Now test that the SplitQuery API works (used in MapReduce usually, but
  # bringing up a full MR-capable cluster is too much for this test
  # environment).
  splits = utils.vtgate.split_query(
      'select id, name from data', 'test_keyspace', 4)
  self.assertEqual(len(splits), 4)
  first_shards = [split['shard_part']['shards'][0] for split in splits]
  self.assertEqual(first_shards.count('0'), 2)
  self.assertEqual(first_shards.count('1'), 2)

  # Run the queries, aggregate the results, make sure we have all rows.
  rows = {}
  for split in splits:
    query = split['query']
    # vtctl encodes bytes as base64.
    bindvars = {
        name: int(base64.standard_b64decode(value['value']))
        for name, value in query['bind_variables'].iteritems()
    }
    result = utils.vtgate.execute_shards(
        query['sql'],
        'test_keyspace',
        ','.join(split['shard_part']['shards']),
        tablet_type='master',
        bindvars=bindvars)
    rows.update((int(row[0]), row[1]) for row in result['rows'])
  self.assertEqual(len(rows), 20)

  expected = {}
  for first_id in (100, 200):
    for offset in xrange(10):
      expected[first_id + offset] = 'row %d' % (first_id + offset)
  self.assertEqual(rows, expected)

  self._test_vtclient_execute_shards_fallback()