def test_service_switch(self):
    """Tests the update stream service switch when the tablet type changes.

    Moves the replica tablet to 'spare' and checks UpdateStreamState is
    'Disabled' and that a new stream_update request is refused with a
    NOT_SERVING error, then moves it back to 'replica' and checks the
    state is 'Enabled'.
    """
    # Make the replica spare.
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "spare"])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, "spare")

    # Check UpdateStreamState is disabled.
    v = utils.get_vars(replica_tablet.port)
    if v["UpdateStreamState"] != "Disabled":
        self.fail("Update stream service should be 'Disabled' but is '%s'"
                  % v["UpdateStreamState"])

    # Make sure we can't start a new request.
    start_position = _get_repl_current_position()
    replica_conn = self._get_replica_stream_conn()
    try:
        for event in replica_conn.stream_update(
                "test_keyspace", "0", topodata_pb2.REPLICA,
                position=start_position):
            # unittest.TestCase has no assertFail(); fail() is the call that
            # reports a test failure instead of an AttributeError.
            self.fail("got event: %s" % str(event))
        self.fail("stream_update terminated with no exception")
    except dbexceptions.DatabaseError as e:
        self.assertIn("operation not allowed in state NOT_SERVING", str(e))

    # Go back to replica.
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "replica"])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, "replica")

    # Check UpdateStreamState is enabled.
    v = utils.get_vars(replica_tablet.port)
    if v["UpdateStreamState"] != "Enabled":
        self.fail("Update stream service should be 'Enabled' but is '%s'"
                  % v["UpdateStreamState"])
def _check_query_service(self, tablet, serving, tablet_control_disabled):
    """Verify the query service state exported by a tablet.

    Checks TabletStateName from the tablet's vars against the expected
    serving state, checks whether the tablet status page attributes the
    state to TabletControl, and for rdonly tablets runs a remote
    RunHealthCheck and verifies the serving state is unchanged.

    Args:
      tablet: the tablet to check.
      serving: truthy if the tablet is expected to be SERVING.
      tablet_control_disabled: truthy if the status page is expected to say
        the query service was disabled by TabletControl.
    """
    current = utils.get_vars(tablet.port)
    wanted = 'SERVING' if serving else 'NOT_SERVING'
    self.assertEqual(
        current['TabletStateName'], wanted,
        'tablet %s is not in the right serving state: got %s expected %s' %
        (tablet.tablet_alias, current['TabletStateName'], wanted))

    status = tablet.get_status()
    check_marker = self.assertIn if tablet_control_disabled else self.assertNotIn
    check_marker("Query Service disabled by TabletControl", status)

    # Only rdonly tablets get the extra health check round trip.
    if tablet.tablet_type != 'rdonly':
        return

    utils.run_vtctl(['RunHealthCheck', tablet.tablet_alias, 'rdonly'],
                    auto_log=True)
    current = utils.get_vars(tablet.port)
    wanted = 'SERVING' if serving else 'NOT_SERVING'
    self.assertEqual(
        current['TabletStateName'], wanted,
        'tablet %s is not in the right serving state after health check: got %s expected %s' %
        (tablet.tablet_alias, current['TabletStateName'], wanted))
def wait_for_vttablet_state(self, expected, timeout=60.0, port=None):
    """Poll the tablet's vars until TabletStateName matches `expected`.

    Args:
      expected: regular expression source; it is anchored with ^ and $
        before being matched against the exported state.
      timeout: remaining time budget, handed to utils.wait_step after every
        unsuccessful poll.
      port: port to query; defaults to self.port when falsy.

    Raises:
      utils.TestError: if the vars are unavailable and the vttablet process
        has exited.
    """
    pattern = re.compile('^' + expected + '$')
    while True:
        snapshot = utils.get_vars(port or self.port)
        last_seen_state = '?'
        if snapshot is None:
            # No answer: either still starting up, or the process is gone.
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for state %s' % expected)
            logging.debug(
                ' vttablet %s not answering at /debug/vars, waiting...',
                self.tablet_alias)
        elif 'TabletStateName' not in snapshot:
            logging.debug(
                ' vttablet %s not exporting TabletStateName, waiting...',
                self.tablet_alias)
        else:
            state = snapshot['TabletStateName']
            last_seen_state = state
            if pattern.match(state):
                break
            logging.debug(
                ' vttablet %s in state %s != %s',
                self.tablet_alias, state, expected)
        timeout = utils.wait_step(
            'waiting for %s state %s (last seen state: %s)' %
            (self.tablet_alias, expected, last_seen_state),
            timeout, sleep_time=0.1)
def _check_stats(self):
    """Check the counters exported by vtgate against the expected values.

    Reads the vars from self.vtgate_port and asserts on the VttabletCall and
    VtgateApi histogram counts, the absence of VtgateApiErrorCounts entries,
    and the endpoint / degraded endpoint counts for the source master.
    """
    v = utils.get_vars(self.vtgate_port)
    dump = str(v)
    master_key = 'test_nj.source_keyspace.0.master'

    tablet_calls = v['VttabletCall']['Histograms']
    self.assertEqual(
        tablet_calls['Execute.source_keyspace.0.replica']['Count'], 2,
        "unexpected value for VttabletCall(Execute.source_keyspace.0.replica) inside %s" % dump)

    api_calls = v['VtgateApi']['Histograms']
    self.assertEqual(
        api_calls['ExecuteKeyRanges.destination_keyspace.master']['Count'], 6,
        "unexpected value for VtgateApi(ExecuteKeyRanges.destination_keyspace.master) inside %s" % dump)

    self.assertEqual(
        len(v['VtgateApiErrorCounts']), 0,
        "unexpected errors for VtgateApiErrorCounts inside %s" % dump)
    self.assertEqual(
        v['EndpointCount'][master_key], 1,
        "unexpected EndpointCount inside %s" % dump)
    self.assertEqual(
        v['DegradedEndpointCount'][master_key], 0,
        "unexpected DegradedEndpointCount inside %s" % dump)
def wait_for_vtocc_state(self, expected, timeout=60.0, port=None):
    """Poll the server's vars until TabletStateName equals `expected`.

    Args:
      expected: exact state name to wait for.
      timeout: remaining time budget, handed to utils.wait_step after every
        unsuccessful poll.
      port: port to query; defaults to self.port when falsy.
    """
    while True:
        v = utils.get_vars(port or self.port)
        last_seen_state = "?"
        # Identity test: None is a singleton, and `==` could be overridden by
        # whatever object get_vars returns.
        if v is None:
            logging.debug(
                ' vttablet %s not answering at /debug/vars, waiting...',
                self.tablet_alias)
        elif 'TabletStateName' not in v:
            logging.debug(
                ' vttablet %s not exporting TabletStateName, waiting...',
                self.tablet_alias)
        else:
            s = v['TabletStateName']
            last_seen_state = s
            if s == expected:
                break
            logging.debug(
                ' vttablet %s in state %s != %s',
                self.tablet_alias, s, expected)
        timeout = utils.wait_step(
            'waiting for state %s (last seen state: %s)' %
            (expected, last_seen_state),
            timeout, sleep_time=0.1)
def test_zkocc_qps(self):
    """Run the zkclient2 qps client against zkocc and check its stats.

    Starts zkocc, lets the qps client run for 10 seconds, then checks the
    ZkReader vars: connection state, RPC and cache read counts, rollup
    consistency and the absence of unknown-cell errors.

    Raises:
      utils.TestError: if the test_nj reader is not in the Connected state.
    """
    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start()
    # try/finally so a failed check does not leave zkocc running behind us.
    try:
        qpser = utils.run_bg(
            utils.vtroot +
            '/bin/zkclient2 -server localhost:%u -mode qps /zk/test_nj/vt/zkocc1/data1 /zk/test_nj/vt/zkocc1/data2' % utils.zkocc_port_base)
        try:
            time.sleep(10)
        finally:
            utils.kill_sub_process(qpser)

        # get the zkocc vars, make sure we have what we need
        v = utils.get_vars(utils.zkocc_port_base)
        if v['ZkReader']['test_nj']['State'] != 'Connected':
            raise utils.TestError('invalid zk global state: ',
                                  v['ZkReader']['test_nj']['State'])

        # some checks on performance / stats
        # a typical workstation will do 45-47k QPS, check we have more than
        # 15k (counters cover the 10 second run, hence the factor of 10)
        rpcCalls = v['ZkReader']['RpcCalls']
        if rpcCalls < 150000:
            self.fail('QPS is too low: %u < 15000' % (rpcCalls / 10))
        else:
            logging.debug("Recorded qps: %u", rpcCalls / 10)
        cacheReads = v['ZkReader']['test_nj']['CacheReads']
        if cacheReads < 150000:
            self.fail('Cache QPS is too low: %u < 15000' % (cacheReads / 10))
        totalCacheReads = v['ZkReader']['total']['CacheReads']
        self.assertEqual(cacheReads, totalCacheReads, 'Rollup stats are wrong')
        self.assertEqual(v['ZkReader']['UnknownCellErrors'], 0,
                         'unexpected UnknownCellErrors')
    finally:
        utils.zkocc_kill(zkocc_14850)
def test_vtgate_qps(self):
    """Run the zkclient2 qps2 client against vtgate and check the QPS.

    Creates a one-tablet topology, starts vtgate, lets the client run for
    10 seconds and checks TopoReaderRpcQueryCount for test_nj.
    """
    # create the topology
    utils.run_vtctl('CreateKeyspace test_keyspace')
    t = tablet.Tablet(tablet_uid=1, cell="nj")
    t.init_tablet("master", "test_keyspace", "0")
    t.update_addrs()
    utils.run_vtctl('RebuildShardGraph test_keyspace/0', auto_log=True)
    utils.run_vtctl('RebuildKeyspaceGraph test_keyspace', auto_log=True)

    # start vtgate and the qps-er
    vtgate_proc, vtgate_port = utils.vtgate_start()
    # try/finally so a failed check does not leave vtgate running behind us.
    try:
        qpser = utils.run_bg(
            utils.vtroot +
            '/bin/zkclient2 -server localhost:%u -mode qps2 test_nj test_keyspace' % vtgate_port)
        try:
            time.sleep(10)
        finally:
            utils.kill_sub_process(qpser)

        # get the vtgate vars, make sure we have what we need
        v = utils.get_vars(vtgate_port)

        # some checks on performance / stats
        # a typical workstation will do 38-40k QPS, check we have more than
        # 15k (the counter covers the 10 second run, hence the factor of 10)
        rpcCalls = v['TopoReaderRpcQueryCount']['test_nj']
        if rpcCalls < 150000:
            self.fail('QPS is too low: %u < 15000' % (rpcCalls / 10))
        else:
            logging.debug("Recorded qps: %u", rpcCalls / 10)
    finally:
        utils.vtgate_kill(vtgate_proc)
def check_binlog_player_vars(self, tablet_obj, source_shards,
                             seconds_behind_master_max=0):
    """Checks the binlog player variables are correctly exported.

    Args:
      tablet_obj: the tablet to check.
      source_shards: the shards to check we are replicating from.
      seconds_behind_master_max: if non-zero, the lag should be smaller than
        this value.
    """
    v = utils.get_vars(tablet_obj.port)
    self.assertIn('BinlogPlayerMapSize', v)
    self.assertEqual(v['BinlogPlayerMapSize'], len(source_shards))
    self.assertIn('BinlogPlayerSecondsBehindMaster', v)
    self.assertIn('BinlogPlayerSecondsBehindMasterMap', v)
    self.assertIn('BinlogPlayerSourceShardNameMap', v)
    shards = v['BinlogPlayerSourceShardNameMap'].values()
    self.assertEqual(sorted(shards), sorted(source_shards))
    self.assertIn('BinlogPlayerSourceTabletAliasMap', v)

    # The per-player maps are keyed by the player index rendered as a string.
    # range() rather than xrange() so this runs on Python 2 and 3 alike.
    player_keys = ['%d' % i for i in range(len(source_shards))]
    for key in player_keys:
        self.assertIn(key, v['BinlogPlayerSourceTabletAliasMap'])

    if seconds_behind_master_max != 0:
        self.assertTrue(
            v['BinlogPlayerSecondsBehindMaster'] < seconds_behind_master_max,
            'BinlogPlayerSecondsBehindMaster is too high: %d > %d' % (
                v['BinlogPlayerSecondsBehindMaster'],
                seconds_behind_master_max))
        for key in player_keys:
            self.assertTrue(
                v['BinlogPlayerSecondsBehindMasterMap'][key] <
                seconds_behind_master_max,
                'BinlogPlayerSecondsBehindMasterMap is too high: %d > %d' % (
                    v['BinlogPlayerSecondsBehindMasterMap'][key],
                    seconds_behind_master_max))
def check_binlog_server_vars(self, tablet_obj, horizontal=True,
                             min_statements=0, min_transactions=0):
    """Verify the binlog server counters exported by a tablet.

    Args:
      tablet_obj: the tablet whose vars are read.
      horizontal: truthy selects the UpdateStreamKeyRange* counters,
        otherwise the UpdateStreamTables* counters are checked.
      min_statements: when positive, lower bound for the statement counter.
      min_transactions: when positive, lower bound for the transaction
        counter.
    """
    v = utils.get_vars(tablet_obj.port)
    prefix = 'UpdateStreamKeyRange' if horizontal else 'UpdateStreamTables'
    skey = prefix + 'Statements'
    tkey = prefix + 'Transactions'

    # Both counters must be exported before any threshold is checked.
    for name in (skey, tkey):
        self.assertIn(name, v)

    if min_statements > 0:
        self.assertTrue(
            v[skey] >= min_statements,
            'only got %d < %d statements' % (v[skey], min_statements))
    if min_transactions > 0:
        self.assertTrue(
            v[tkey] >= min_transactions,
            'only got %d < %d transactions' % (v[tkey], min_transactions))
def wait_for_vttablet_state(self, expected, timeout=60.0, port=None):
    """Poll the tablet's vars until TabletStateName equals `expected`.

    With the zookeeper topology implementation, first waits once per object
    for the node at self.zk_pid using the `zk wait -e` command.

    Args:
      expected: exact state name to wait for.
      timeout: remaining time budget, handed to utils.wait_step after every
        unsuccessful poll.
      port: port to query; defaults to self.port when falsy.
    """
    # wait for zookeeper PID just to be sure we have it
    if environment.topo_server_implementation == 'zookeeper':
        if not self.checked_zk_pid:
            utils.run(environment.binary_args('zk') +
                      ['wait', '-e', self.zk_pid],
                      stdout=utils.devnull)
            self.checked_zk_pid = True

    while True:
        v = utils.get_vars(port or self.port)
        if v is None:
            logging.debug(
                ' vttablet %s not answering at /debug/vars, waiting...',
                self.tablet_alias)
        elif 'Voltron' not in v:
            logging.debug(
                ' vttablet %s not exporting Voltron, waiting...',
                self.tablet_alias)
        elif 'TabletStateName' not in v:
            # The state is read from TabletStateName, so keep polling rather
            # than raising KeyError if only Voltron is exported so far.
            logging.debug(
                ' vttablet %s not exporting TabletStateName, waiting...',
                self.tablet_alias)
        else:
            s = v['TabletStateName']
            if s == expected:
                break
            logging.debug(
                ' vttablet %s in state %s != %s',
                self.tablet_alias, s, expected)
        timeout = utils.wait_step('waiting for state %s' % expected,
                                  timeout, sleep_time=0.1)
def wait_for_binlog_server_state(self, expected, timeout=30.0):
    """Wait for the tablet's binlog server to be in the provided state.

    The state is read from the UpdateStreamState variable.

    Args:
      expected: the state to wait for.
      timeout: how long to wait before error.

    Raises:
      utils.TestError: if the vars are unavailable and the vttablet process
        has exited.
    """
    while True:
        v = utils.get_vars(self.port)
        if v is None:
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for binlog state %s' %
                    expected)
            logging.debug(' vttablet not answering at /debug/vars, waiting...')
        elif 'UpdateStreamState' not in v:
            # Name the variable that is actually checked.
            logging.debug(
                ' vttablet not exporting UpdateStreamState, waiting...')
        else:
            s = v['UpdateStreamState']
            if s == expected:
                break
            logging.debug(" vttablet's binlog server in state %s != %s", s,
                          expected)
        timeout = utils.wait_step(
            'waiting for binlog server state %s' % expected,
            timeout, sleep_time=0.5)
    logging.debug('tablet %s binlog service is in state %s',
                  self.tablet_alias, expected)
def test_vtgate_qps(self):
    """Run the zkclient2 qps client against vtgate and check the QPS.

    Creates a one-tablet topology, starts vtgate with CPU profiling, waits
    for the client to finish and checks TopoReaderRpcQueryCount for test_nj
    against MIN_QPS.
    """
    # create the topology
    utils.run_vtctl('CreateKeyspace test_keyspace')
    t = tablet.Tablet(tablet_uid=1, cell="nj")
    t.init_tablet("master", "test_keyspace", "0")
    t.update_addrs()
    utils.run_vtctl('RebuildKeyspaceGraph test_keyspace', auto_log=True)

    # start vtgate and the qps-er
    vtgate_proc, vtgate_port = utils.vtgate_start(
        extra_args=['-cpu_profile',
                    os.path.join(environment.tmproot, 'vtgate.pprof')])
    # try/finally so a failed check does not leave vtgate running behind us.
    try:
        qpser = utils.run_bg(environment.binary_args('zkclient2') + [
            '-server', 'localhost:%u' % vtgate_port,
            '-mode', 'qps',
            '-zkclient_cpu_profile',
            os.path.join(environment.tmproot, 'zkclient2.pprof'),
            'test_nj', 'test_keyspace'])
        qpser.wait()

        # get the vtgate vars, make sure we have what we need
        v = utils.get_vars(vtgate_port)

        # some checks on performance / stats
        rpcCalls = v['TopoReaderRpcQueryCount']['test_nj']
        if rpcCalls < MIN_QPS * 10:
            self.fail('QPS is too low: %u < %u' % (rpcCalls / 10, MIN_QPS))
        else:
            logging.debug("Recorded qps: %u", rpcCalls / 10)
    finally:
        utils.vtgate_kill(vtgate_proc)
def verify_reconciliation_counters(self, worker_port, online_or_offline,
                                   table, inserts, updates, deletes, equal):
    """Check the worker's reconciliation counters for one table.

    For each of the Inserts, Updates, Deletes and EqualRows counter maps
    (named 'Worker' + online_or_offline + '<kind>Counters'), an expected
    value of 0 means the table must be absent from the map; any other value
    must match the map entry exactly.

    Args:
      worker_port: port the worker's vars are read from.
      online_or_offline: infix used to build the counter variable names.
      table: table name used as the key in each counter map.
      inserts: expected inserts count.
      updates: expected updates count.
      deletes: expected deletes count.
      equal: expected equal rows count.
    """
    worker_vars = utils.get_vars(worker_port)
    expectations = (
        ('InsertsCounters', inserts),
        ('UpdatesCounters', updates),
        ('DeletesCounters', deletes),
        ('EqualRowsCounters', equal),
    )
    for suffix, wanted in expectations:
        counters = worker_vars['Worker' + online_or_offline + suffix]
        if wanted == 0:
            self.assertNotIn(table, counters)
        else:
            self.assertEqual(counters[table], wanted)
def check_stream_health_equals_binlog_player_vars(self, tablet_obj, count):
    """Compare streaming health output with the tablet's exported vars.

    Args:
      tablet_obj: the tablet to check.
      count: number of binlog players the tablet is expected to report.
    """
    blp_stats = utils.get_vars(tablet_obj.port)
    self.assertEqual(blp_stats['BinlogPlayerMapSize'], count)

    # Force a health check first: it does not run by default, since tablets
    # may be started without it or may not have run it yet.
    utils.run_vtctl(['RunHealthCheck', tablet_obj.tablet_alias])

    health_cmd = ['VtTabletStreamHealth', '-count', '1',
                  tablet_obj.tablet_alias]
    stream_health = utils.run_vtctl_json(health_cmd)
    logging.debug('Got health: %s', str(stream_health))

    self.assertNotIn('serving', stream_health)
    self.assertIn('realtime_stats', stream_health)
    realtime = stream_health['realtime_stats']
    self.assertNotIn('health_error', realtime)
    self.assertIn('binlog_players_count', realtime)
    self.assertEqual(blp_stats['BinlogPlayerMapSize'],
                     realtime['binlog_players_count'])
    # A missing lag field in the health output is compared as 0.
    self.assertEqual(
        blp_stats['BinlogPlayerSecondsBehindMaster'],
        realtime.get('seconds_behind_master_filtered_replication', 0))
def test_zkocc_qps(self):
    """Run the zkclient2 qps client against zkocc and check its stats.

    Starts zkocc, waits for the qps client to finish, then checks the
    ZkReader vars: connection state, RPC and cache read rates against
    MIN_QPS, rollup consistency and the absence of unknown-cell errors.
    """
    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start()
    # try/finally so a failed check does not leave zkocc running behind us.
    try:
        qpser = utils.run_bg(
            environment.binary_argstr('zkclient2') +
            ' -server localhost:%u -mode qps /zk/test_nj/vt/zkocc1/data1 /zk/test_nj/vt/zkocc1/data2' % environment.topo_server().zkocc_port_base)
        qpser.wait()

        # get the zkocc vars, make sure we have what we need
        v = utils.get_vars(environment.topo_server().zkocc_port_base)
        if v['ZkReader']['test_nj']['State'] != 'Connected':
            self.fail('invalid zk global state: ' +
                      v['ZkReader']['test_nj']['State'])

        # some checks on performance / stats
        rpcCalls = v['ZkReader']['RpcCalls']
        if rpcCalls < MIN_QPS * 10:
            self.fail('QPS is too low: %u < %u' % (rpcCalls / 10, MIN_QPS))
        else:
            logging.debug("Recorded qps: %u", rpcCalls / 10)
        cacheReads = v['ZkReader']['test_nj']['CacheReads']
        if cacheReads < MIN_QPS * 10:
            # Report a rate, in the same units as the RPC message above.
            self.fail('Cache QPS is too low: %u < %u' %
                      (cacheReads / 10, MIN_QPS))
        totalCacheReads = v['ZkReader']['total']['CacheReads']
        self.assertEqual(cacheReads, totalCacheReads, 'Rollup stats are wrong')
        self.assertEqual(v['ZkReader']['UnknownCellErrors'], 0,
                         'unexpected UnknownCellErrors')
    finally:
        utils.zkocc_kill(zkocc_14850)
def wait_for_binlog_player_count(self, expected, timeout=30.0):
    """Block until the tablet reports `expected` binlog players.

    The count is read from the BinlogPlayerMapSize variable.

    Args:
      expected: number of binlog players to wait for.
      timeout: remaining time budget, handed to utils.wait_step after every
        unsuccessful poll.

    Raises:
      utils.TestError: if the vars are unavailable and the vttablet process
        has exited.
    """
    while True:
        snapshot = utils.get_vars(self.port)
        if snapshot is None:
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for binlog count %s' %
                    expected)
            logging.debug(' vttablet not answering at /debug/vars, waiting...')
        elif 'BinlogPlayerMapSize' not in snapshot:
            logging.debug(
                ' vttablet not exporting BinlogPlayerMapSize, waiting...')
        else:
            player_count = snapshot['BinlogPlayerMapSize']
            if player_count == expected:
                break
            logging.debug(" vttablet's binlog player map has count %d != %d",
                          player_count, expected)
        timeout = utils.wait_step(
            'waiting for binlog player count %d' % expected,
            timeout, sleep_time=0.5)
    logging.debug('tablet %s binlog player has %d players',
                  self.tablet_alias, expected)
def run_test_zkocc_qps():
    """Run the zkclient2 qps client against zkocc and check its stats.

    Populates zk, starts zkocc, lets the qps client run for 10 seconds, then
    checks the ZkReader vars: connection state and time spent connected, RPC
    and cache read counts, rollup consistency and unknown-cell errors.

    Raises:
      utils.TestError: if any of the checks fails.
    """
    _populate_zk()

    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start()
    # try/finally so a failed check does not leave zkocc running behind us.
    try:
        qpser = utils.run_bg(
            utils.vtroot +
            '/bin/zkclient2 -server localhost:%u -mode qps /zk/test_nj/zkocc1/data1 /zk/test_nj/zkocc1/data2' % utils.zkocc_port_base)
        try:
            time.sleep(10)
        finally:
            utils.kill_sub_process(qpser)

        # get the zkocc vars, make sure we have what we need
        v = utils.get_vars(utils.zkocc_port_base)
        state = v['ZkReader']['test_nj']['State']
        if state['Current'] != 'Connected':
            raise utils.TestError('invalid zk global state: ',
                                  state['Current'])
        if state['DurationConnected'] < 9e9:
            raise utils.TestError('not enough time in Connected state',
                                  state['DurationConnected'])

        # some checks on performance / stats
        # a typical workstation will do 15k QPS, check we have more than 3k
        # (counters cover the 10 second run, hence 30000 calls == 3000 QPS)
        rpcCalls = v['ZkReader']['RpcCalls']
        if rpcCalls < 30000:
            raise utils.TestError(
                'QPS is too low: %u < 3000' % (rpcCalls / 10))
        cacheReads = v['ZkReader']['test_nj']['CacheReads']
        if cacheReads < 30000:
            raise utils.TestError(
                'Cache QPS is too low: %u < 3000' % (cacheReads / 10))
        totalCacheReads = v['ZkReader']['total']['CacheReads']
        if cacheReads != totalCacheReads:
            raise utils.TestError(
                'Rollup stats are wrong: %u != %u' %
                (cacheReads, totalCacheReads))
        if v['ZkReader']['UnknownCellErrors'] != 0:
            raise utils.TestError('unexpected UnknownCellErrors',
                                  v['ZkReader']['UnknownCellErrors'])
    finally:
        utils.zkocc_kill(zkocc_14850)
def test_restart_during_action(self):
    """Restart vttablet while an action is queued and check the aftermath.

    Schedules a 15s Sleep and a Ping with -no-wait, kills and restarts
    vttablet, expects a short WaitForAction on the Ping to fail and a longer
    one to succeed. With the zookeeper topology implementation, also checks
    the exported ZkMetaConn states and TabletType.
    """
    # Start up a master mysql and vttablet
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0'])
    utils.validate_topology()
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                     'test_keyspace/0'])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.start_vttablet()

    utils.run_vtctl(['Ping', tablet_62344.tablet_alias])

    # schedule long action
    utils.run_vtctl(['-no-wait', 'Sleep', tablet_62344.tablet_alias, '15s'],
                    stdout=utils.devnull)
    # ping blocks until the sleep finishes unless we have a schedule race
    action_path, _ = utils.run_vtctl(
        ['-no-wait', 'Ping', tablet_62344.tablet_alias], trap_output=True)
    action_path = action_path.strip()

    # kill agent leaving vtaction running
    tablet_62344.kill_vttablet()

    # restart agent
    tablet_62344.start_vttablet()

    # we expect this action with a short wait time to fail. this isn't the
    # best and has some potential for flakiness.
    utils.run_vtctl(['-wait-time', '2s', 'WaitForAction', action_path],
                    expect_fail=True)

    # wait until the background sleep action is done, otherwise there will be
    # a leftover vtaction whose result may overwrite running actions
    # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
    # the zombie Sleep clobbers the Clone command in the following tests
    utils.run_vtctl(['-wait-time', '20s', 'WaitForAction', action_path],
                    auto_log=True)

    if environment.topo_server_implementation == 'zookeeper':
        # extra small test: we ran for a while, get the states we were in,
        # make sure they're accounted for properly
        # first the query engine States
        v = utils.get_vars(tablet_62344.port)
        logging.debug("vars: %s", v)

        # then the Zookeeper connections
        if v['ZkMetaConn']['test_nj']['Current'] != 'Connected':
            self.fail('invalid zk test_nj state: %s' %
                      v['ZkMetaConn']['test_nj']['Current'])
        if v['ZkMetaConn']['global']['Current'] != 'Connected':
            self.fail('invalid zk global state: %s' %
                      v['ZkMetaConn']['global']['Current'])
        if v['ZkMetaConn']['test_nj']['DurationConnected'] < 10e9:
            # fail() accepts a single message, so format the value into it
            # instead of passing it as a second argument.
            self.fail('not enough time in Connected state: %u' %
                      v['ZkMetaConn']['test_nj']['DurationConnected'])
        if v['TabletType'] != 'master':
            self.fail('TabletType not exported correctly')

    tablet_62344.kill_vttablet()
def test_actions_and_timeouts(self):
    """Check that an RpcPing times out while the tablet runs a long action.

    Starts a 10s Sleep through a background vtctl, then expects a 3s RpcPing
    to fail with a timeout message. With the zookeeper topology
    implementation, also checks the exported ZkMetaConn states and
    TabletType.
    """
    # Start up a master mysql and vttablet
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0'])
    utils.validate_topology()
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                     'test_keyspace/0'])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.start_vttablet()

    utils.run_vtctl(['RpcPing', tablet_62344.tablet_alias])

    # schedule long action in the background, sleep a little bit to make sure
    # it started to run
    args = (environment.binary_args('vtctl') +
            environment.topo_server_flags() +
            environment.tablet_manager_protocol_flags() +
            environment.tabletconn_protocol_flags() +
            ['-log_dir', environment.vtlogroot,
             'Sleep', tablet_62344.tablet_alias, '10s'])
    bg = utils.run_bg(args)
    time.sleep(3)

    # try a frontend RpcPing that should timeout as the tablet is busy
    # running the other one
    _, stderr = utils.run_vtctl(
        ['-wait-time', '3s', 'RpcPing', tablet_62344.tablet_alias],
        expect_fail=True)
    if 'Timeout waiting for' not in stderr:
        self.fail("didn't find the right error strings in failed RpcPing: " +
                  stderr)

    # wait for the background vtctl
    bg.wait()

    if environment.topo_server_implementation == 'zookeeper':
        # extra small test: we ran for a while, get the states we were in,
        # make sure they're accounted for properly
        # first the query engine States
        v = utils.get_vars(tablet_62344.port)
        logging.debug("vars: %s", v)

        # then the Zookeeper connections
        if v['ZkMetaConn']['test_nj']['Current'] != 'Connected':
            self.fail('invalid zk test_nj state: %s' %
                      v['ZkMetaConn']['test_nj']['Current'])
        if v['ZkMetaConn']['global']['Current'] != 'Connected':
            self.fail('invalid zk global state: %s' %
                      v['ZkMetaConn']['global']['Current'])
        if v['ZkMetaConn']['test_nj']['DurationConnected'] < 10e9:
            # fail() accepts a single message, so format the value into it
            # instead of passing it as a second argument.
            self.fail('not enough time in Connected state: %u' %
                      v['ZkMetaConn']['test_nj']['DurationConnected'])
        if v['TabletType'] != 'master':
            self.fail('TabletType not exported correctly')

    tablet_62344.kill_vttablet()
def test_purge_cache(self):
    """Read the replica's current PurgeCache counter, defaulting to 0.

    NOTE(review): the visible body stops after the counter is read and
    nothing uses purge_cache_counter afterwards; confirm whether the rest of
    the test lives elsewhere.
    """
    utils.debug("===========test_purge_cache=========")
    cache_counters = framework.MultiDict(
        utils.get_vars(replica_tablet.port))['CacheCounters']
    utils.debug("cache counters %s" % cache_counters)
    try:
        purge_cache_counter = cache_counters['PurgeCache']
    except KeyError:
        # No bound name: `except KeyError, e` is Python-2-only syntax and the
        # exception object was never used.
        purge_cache_counter = 0
def wait_for_vars(var, key, value):
    """Poll vtgate's vars until v[var][key] equals `value`.

    Starts with a 20 second budget and hands it to utils.wait_step after
    every unsuccessful poll.

    Args:
      var: name of the top-level variable to look up.
      key: key expected inside that variable.
      value: value to wait for.
    """
    remaining = 20.0
    while True:
        snapshot = utils.get_vars(utils.vtgate.port)
        # An empty or missing snapshot counts as "not there yet".
        if snapshot and var in snapshot:
            if key in snapshot[var] and snapshot[var][key] == value:
                return
        remaining = utils.wait_step(
            'waiting for /debug/vars of %s/%s' % (var, key), remaining)
def test_actions_and_timeouts(self):
    """Check that a RefreshState times out while the tablet runs a Sleep.

    Starts a 10s Sleep through a background vtctl, then expects a 3s
    RefreshState to fail with the protocol flavor's RPC timeout message.
    With the zookeeper topology flavor, also checks the exported
    ZkCachedConn states and TabletType.
    """
    alias = tablet_62344.tablet_alias

    # Bring up a master mysql and vttablet.
    utils.run_vtctl(["CreateKeyspace", "test_keyspace"])
    tablet_62344.init_tablet("master", "test_keyspace", "0")
    utils.run_vtctl(["RebuildShardGraph", "test_keyspace/0"])
    utils.validate_topology()
    self._check_srv_shard()
    tablet_62344.create_db("vt_test_keyspace")
    tablet_62344.start_vttablet()
    utils.run_vtctl(["Ping", alias])

    # Launch the long action in the background and give it a moment to start.
    sleep_cmd = environment.binary_args("vtctl")
    sleep_cmd = sleep_cmd + environment.topo_server().flags()
    sleep_cmd = sleep_cmd + [
        "-tablet_manager_protocol",
        protocols_flavor().tablet_manager_protocol(),
        "-tablet_protocol",
        protocols_flavor().tabletconn_protocol(),
        "-log_dir",
        environment.vtlogroot,
        "Sleep",
        alias,
        "10s",
    ]
    bg = utils.run_bg(sleep_cmd)
    time.sleep(3)

    # A frontend RefreshState should time out: the tablet is busy with the
    # Sleep.
    result = utils.run_vtctl(["-wait-time", "3s", "RefreshState", alias],
                             expect_fail=True)
    _, stderr = result
    self.assertIn(protocols_flavor().rpc_timeout_message(), stderr)

    # Let the background vtctl finish.
    bg.wait()

    if environment.topo_server().flavor() == "zookeeper":
        # Extra small test: after running for a while, the exported states
        # should be accounted for properly.
        v = utils.get_vars(tablet_62344.port)
        logging.debug("vars: %s", v)

        # Zookeeper connections, one per cell.
        for cell in ("test_nj", "global"):
            if v["ZkCachedConn"][cell] != "Connected":
                self.fail("invalid zk %s state: %s" %
                          (cell, v["ZkCachedConn"][cell]))
        if v["TabletType"] != "master":
            self.fail("TabletType not exported correctly")

    tablet_62344.kill_vttablet()
def test_actions_and_timeouts(self):
    """Check that a RefreshState times out while the tablet runs a Sleep.

    Starts a 10s Sleep through a background vtctl, then expects a 3s
    RefreshState to fail with the protocol flavor's RPC timeout message.
    With the zookeeper topology flavor, also checks the exported ZkMetaConn
    states and TabletType.
    """
    # Start up a master mysql and vttablet
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0'])
    utils.validate_topology()
    self._check_srv_shard()
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.start_vttablet()

    utils.run_vtctl(['Ping', tablet_62344.tablet_alias])

    # schedule long action in the background, sleep a little bit to make sure
    # it started to run
    args = (environment.binary_args('vtctl') +
            environment.topo_server().flags() +
            ['-tablet_manager_protocol',
             protocols_flavor().tablet_manager_protocol(),
             '-tablet_protocol', protocols_flavor().tabletconn_protocol(),
             '-log_dir', environment.vtlogroot,
             'Sleep', tablet_62344.tablet_alias, '10s'])
    bg = utils.run_bg(args)
    time.sleep(3)

    # try a frontend RefreshState that should timeout as the tablet is busy
    # running the other one
    _, stderr = utils.run_vtctl(
        ['-wait-time', '3s', 'RefreshState', tablet_62344.tablet_alias],
        expect_fail=True)
    self.assertIn(protocols_flavor().rpc_timeout_message(), stderr)

    # wait for the background vtctl
    bg.wait()

    if environment.topo_server().flavor() == 'zookeeper':
        # extra small test: we ran for a while, get the states we were in,
        # make sure they're accounted for properly
        # first the query engine States
        v = utils.get_vars(tablet_62344.port)
        logging.debug('vars: %s', v)

        # then the Zookeeper connections
        if v['ZkMetaConn']['test_nj']['Current'] != 'Connected':
            self.fail('invalid zk test_nj state: %s' %
                      v['ZkMetaConn']['test_nj']['Current'])
        if v['ZkMetaConn']['global']['Current'] != 'Connected':
            self.fail('invalid zk global state: %s' %
                      v['ZkMetaConn']['global']['Current'])
        if v['ZkMetaConn']['test_nj']['DurationConnected'] < 10e9:
            # fail() accepts a single message, so format the value into it
            # instead of passing it as a second argument.
            self.fail('not enough time in Connected state: %d' %
                      v['ZkMetaConn']['test_nj']['DurationConnected'])
        if v['TabletType'] != 'master':
            self.fail('TabletType not exported correctly')

    tablet_62344.kill_vttablet()
def _check_binlog_player_vars(self, tablet, seconds_behind_master_max=0):
    """Check the binlog player variables are exported by the tablet.

    Args:
      tablet: the tablet whose vars are read.
      seconds_behind_master_max: if non-zero, BinlogPlayerSecondsBehindMaster
        must be strictly smaller than this value.
    """
    v = utils.get_vars(tablet.port)
    # assertIn reports the missing key and the container on failure, which a
    # bare assertTrue(... in v) does not.
    self.assertIn("BinlogPlayerMapSize", v)
    self.assertIn("BinlogPlayerSecondsBehindMaster", v)
    if seconds_behind_master_max != 0:
        self.assertTrue(
            v["BinlogPlayerSecondsBehindMaster"] < seconds_behind_master_max,
            "BinlogPlayerSecondsBehindMaster is too high: %u > %u" % (
                v["BinlogPlayerSecondsBehindMaster"],
                seconds_behind_master_max))
def test_service_switch(self):
    """Tests the update stream service switch when the tablet type changes.

    Moves the replica tablet to 'spare' and checks UpdateStreamState is
    'Disabled', that a direct VtTabletUpdateStream request is refused with a
    NOT_SERVING error and that a request through vtgate raises a
    DatabaseError. Then moves the tablet back to 'replica' and checks the
    state is 'Enabled'.
    """
    # make the replica spare
    utils.run_vtctl(['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')

    # Check UpdateStreamState is disabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Disabled':
        self.fail("Update stream service should be 'Disabled' but is '%s'" %
                  v['UpdateStreamState'])

    start_position = _get_repl_current_position()

    # Make sure we can't start a new request to vttablet directly.
    _, stderr = utils.run_vtctl(['VtTabletUpdateStream',
                                 '-position', start_position,
                                 replica_tablet.tablet_alias],
                                expect_fail=True)
    self.assertIn('operation not allowed in state NOT_SERVING', stderr)

    # Make sure we can't start a new request through vtgate.
    replica_conn = self._get_vtgate_stream_conn()
    try:
        for event, resume_timestamp in replica_conn.update_stream(
                'test_keyspace', topodata_pb2.REPLICA,
                event=query_pb2.EventToken(shard='0', position=start_position),
                shard='0'):
            # unittest.TestCase has no assertFail(); fail() is the call that
            # reports a test failure instead of an AttributeError.
            self.fail('got event(%d): %s' % (resume_timestamp, str(event)))
        self.fail('update_stream terminated with no exception')
    except dbexceptions.DatabaseError as e:
        self.assertIn(vtgate_gateway_flavor().no_tablet_found_message(),
                      str(e))

    # Go back to replica.
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')

    # Check UpdateStreamState is enabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Enabled':
        self.fail("Update stream service should be 'Enabled' but is '%s'" %
                  v['UpdateStreamState'])
def _test_service_enabled(self):
    """Exercise the update stream while enabled, then across a disable.

    Phase 1: switches the tablet to 'replica', starts a background writer
    thread, streams from the current replication position until the first
    DML event, then checks UpdateStreamState is 'Enabled' and that SE_DML
    and SE_POS appear in UpdateStreamEvents.

    Phase 2: opens a new stream from the same position, switches the tablet
    to 'spare' on the first event, and expects the stream to end with a
    DatabaseError saying the service has been disabled.

    NOTE(review): the nesting below was reconstructed from a source with no
    indentation; in particular the "FAIL" log / close / return sequence is
    assumed to run when the second stream ends without raising -- confirm
    against the original file.
    """
    start_position = _get_repl_current_position()
    logging.debug("_test_service_enabled starting @ %s", start_position)
    utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "replica"])
    logging.debug("sleeping a bit for the replica action to complete")
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, topodata_pb2.REPLICA, 30)
    # Generate traffic in the background so the stream has events to deliver.
    thd = threading.Thread(target=self.perform_writes, name="write_thd", args=(100,))
    thd.daemon = True
    thd.start()
    replica_conn = self._get_replica_stream_conn()

    try:
        for stream_event in replica_conn.stream_update(start_position):
            # The first DML event is enough to show the service is streaming.
            if stream_event.category == update_stream.StreamEvent.DML:
                logging.debug("Test Service Enabled: Pass")
                break
    except Exception as e:
        self.fail("Exception in getting stream from replica: %s\n Traceback %s" % (str(e), traceback.format_exc()))
    thd.join(timeout=30)
    replica_conn.close()

    v = utils.get_vars(replica_tablet.port)
    if v["UpdateStreamState"] != "Enabled":
        self.fail("Update stream service should be 'Enabled' but is '%s'" % v["UpdateStreamState"])
    self.assertIn("SE_DML", v["UpdateStreamEvents"])
    self.assertIn("SE_POS", v["UpdateStreamEvents"])

    logging.debug("Testing enable -> disable switch starting @ %s", start_position)
    replica_conn = self._get_replica_stream_conn()
    first = True
    txn_count = 0
    try:
        for stream_event in replica_conn.stream_update(start_position):
            if first:
                # Disable the service only once the stream is known to be
                # open, i.e. after the first event arrived.
                utils.run_vtctl(["ChangeSlaveType", replica_tablet.tablet_alias, "spare"])
                utils.wait_for_tablet_type(replica_tablet.tablet_alias, topodata_pb2.SPARE, 30)
                first = False
            else:
                if stream_event.category == update_stream.StreamEvent.POS:
                    txn_count += 1
        # FIXME(alainjobart) gasp, the test fails but we don't assert?
        logging.debug("Test Service Switch: FAIL")
        replica_conn.close()
        return
    except dbexceptions.DatabaseError as e:
        # Expected outcome: the server drops the stream with this message.
        self.assertEqual(
            "Fatal Service Error: Disconnecting because the Update Stream "
            "service has been disabled", str(e) )
    except Exception as e:
        logging.error("Exception: %s", str(e))
        logging.error("Traceback: %s", traceback.format_exc())
        self.fail("Update stream returned error '%s'" % str(e))
    logging.debug("Streamed %d transactions before exiting", txn_count)
    replica_conn.close()
def _check_stats(self):
    """Check the counters exported by vtgate against the expected values.

    Reads the vars from self.vtgate_port and asserts on the VttabletCall and
    VtgateApi histogram counts, the absence of VtgateApiErrorCounts entries,
    the ratio of end points returned to end point queries for the source
    master, and the absence of degraded results for it.
    """
    v = utils.get_vars(self.vtgate_port)
    dump = str(v)
    master_key = 'test_nj.source_keyspace.0.master'

    tablet_calls = v['VttabletCall']['Histograms']
    self.assertEqual(
        tablet_calls['Execute.source_keyspace.0.replica']['Count'], 2,
        "unexpected value for VttabletCall(Execute.source_keyspace.0.replica) inside %s" % dump)

    api_calls = v['VtgateApi']['Histograms']
    self.assertEqual(
        api_calls['ExecuteKeyRanges.destination_keyspace.master']['Count'], 6,
        "unexpected value for VtgateApi(ExecuteKeyRanges.destination_keyspace.master) inside %s" % dump)

    self.assertEqual(
        len(v['VtgateApiErrorCounts']), 0,
        "unexpected errors for VtgateApiErrorCounts inside %s" % dump)

    # Returned count divided by query count is expected to be exactly 1.
    self.assertEqual(
        v['ResilientSrvTopoServerEndPointsReturnedCount'][master_key] /
        v['ResilientSrvTopoServerEndPointQueryCount'][master_key],
        1,
        "unexpected EndPointsReturnedCount inside %s" % dump)
    self.assertNotIn(
        master_key,
        v['ResilientSrvTopoServerEndPointDegradedResultCount'],
        "unexpected EndPointDegradedResultCount inside %s" % dump)
def test_restart_during_action(self):
    """Restart vttablet while actions are queued, then check exported vars."""
    # Bring up a master mysql and vttablet.
    utils.run_vtctl('CreateKeyspace test_keyspace')
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    utils.run_vtctl('RebuildShardGraph test_keyspace/0')
    utils.validate_topology()
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.start_vttablet()
    alias = tablet_62344.tablet_alias
    utils.run_vtctl('Ping ' + alias)

    # Queue a long-running action.
    utils.run_vtctl('-no-wait Sleep %s 15s' % alias, stdout=utils.devnull)
    # The ping blocks until the sleep finishes unless we have a schedule race.
    action_path, _ = utils.run_vtctl('-no-wait Ping ' + alias,
                                     trap_output=True)

    # Kill the agent (leaving vtaction running), then restart it.
    tablet_62344.kill_vttablet()
    tablet_62344.start_vttablet()

    # We expect this action with a short wait time to fail. This isn't the
    # best and has some potential for flakiness.
    wait_for_action = 'WaitForAction ' + action_path
    utils.run_vtctl('-wait-time 2s ' + wait_for_action, expect_fail=True)

    # Wait until the background sleep action is done, otherwise there will be
    # a leftover vtaction whose result may overwrite running actions.
    # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
    # the zombie Sleep clobbers the Clone command in the following tests
    utils.run_vtctl('-wait-time 20s ' + wait_for_action, auto_log=True)

    # Extra small test: we ran for a while, so the time spent in each state
    # should be accounted for.  Query engine states first.
    v = utils.get_vars(tablet_62344.port)
    logging.debug("vars: %s" % str(v))
    min_duration = 10e9
    serving_duration = v['Voltron']['States']['DurationSERVING']
    if serving_duration < min_duration:
        raise utils.TestError('not enough time in Open state',
                              serving_duration)

    # Then the Zookeeper connections.
    for cell in ('test_nj', 'global'):
        current = v['ZkMetaConn'][cell]['Current']
        if current != 'Connected':
            raise utils.TestError('invalid zk %s state: ' % cell, current)
    connected_duration = v['ZkMetaConn']['test_nj']['DurationConnected']
    if connected_duration < min_duration:
        raise utils.TestError('not enough time in Connected state',
                              connected_duration)

    if v['TabletType'] != 'master':
        raise utils.TestError('TabletType not exported correctly')

    tablet_62344.kill_vttablet()
def run_test_restart_during_action():
    """Restart vttablet while actions are queued, then check exported vars."""
    # Bring up a master mysql and vttablet.
    utils.run_vtctl("CreateKeyspace -force test_keyspace")
    tablet_62344.init_tablet("master", "test_keyspace", "0")
    utils.run_vtctl("RebuildShardGraph test_keyspace/0")
    utils.validate_topology()
    tablet_62344.create_db("vt_test_keyspace")
    tablet_62344.start_vttablet()
    alias = tablet_62344.tablet_alias
    utils.run_vtctl("Ping " + alias)

    # Queue a long-running action.
    utils.run_vtctl("-no-wait Sleep %s 15s" % alias, stdout=devnull)
    # The ping blocks until the sleep finishes unless we have a schedule race.
    action_path, _ = utils.run_vtctl("-no-wait Ping " + alias,
                                     trap_output=True)

    # Kill the agent (leaving vtaction running), then restart it.
    tablet_62344.kill_vttablet()
    tablet_62344.start_vttablet()

    # We expect this action with a short wait time to fail. This isn't the
    # best and has some potential for flakiness.
    utils.run_fail(
        utils.vtroot
        + "/bin/vtctl --alsologtostderr -wait-time 2s WaitForAction "
        + action_path)

    # Wait until the background sleep action is done, otherwise there will be
    # a leftover vtaction whose result may overwrite running actions.
    # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
    # the zombie Sleep clobbers the Clone command in the following tests
    utils.run(
        utils.vtroot
        + "/bin/vtctl --alsologtostderr -wait-time 20s WaitForAction "
        + action_path)

    # Extra small test: we ran for a while, so the time spent in each state
    # should be accounted for.  Query engine states first.
    v = utils.get_vars(tablet_62344.port)
    utils.debug("vars: %s" % str(v))
    min_duration = 10e9
    open_duration = v["Voltron"]["States"]["DurationOPEN"]
    if open_duration < min_duration:
        raise utils.TestError("not enough time in Open state", open_duration)

    # Then the Zookeeper connections.
    for cell in ("test_nj", "global"):
        current = v["ZkMetaConn"][cell]["Current"]
        if current != "Connected":
            raise utils.TestError("invalid zk %s state: " % cell, current)
    connected_duration = v["ZkMetaConn"]["test_nj"]["DurationConnected"]
    if connected_duration < min_duration:
        raise utils.TestError("not enough time in Connected state",
                              connected_duration)

    if v["tablet-type"] != "master":
        raise utils.TestError("tablet-type not exported correctly")

    tablet_62344.kill_vttablet()
def test_actions_and_timeouts(self):
    """Check that an RPC times out while the tablet is busy with a Sleep."""
    # Bring up a master mysql and vttablet.
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    utils.validate_topology()
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.start_vttablet()
    utils.run_vtctl(['Ping', tablet_62344.tablet_alias])

    # Start a long action in the background, then sleep a little bit to make
    # sure it started to run.
    sleep_command = (
        environment.binary_args('vtctl') +
        environment.topo_server().flags() +
        ['-tablet_manager_protocol',
         protocols_flavor().tablet_manager_protocol(),
         '-tablet_protocol',
         protocols_flavor().tabletconn_protocol(),
         '-log_dir',
         environment.vtlogroot,
         'Sleep', tablet_62344.tablet_alias, '10s'])
    bg = utils.run_bg(sleep_command)
    time.sleep(3)

    # A frontend RefreshState should time out as the tablet is busy running
    # the other action.
    refresh_command = [
        '-wait-time', '3s', 'RefreshState', tablet_62344.tablet_alias]
    _, stderr = utils.run_vtctl(refresh_command, expect_fail=True)
    self.assertIn(protocols_flavor().rpc_timeout_message(), stderr)

    # Wait for the background vtctl.
    bg.wait()

    if environment.topo_server().flavor() == 'zookeeper':
        # Extra small test: we ran for a while, so check the exported
        # states are accounted for properly.
        v = utils.get_vars(tablet_62344.port)
        logging.debug('vars: %s', v)

        # The Zookeeper connections.
        for cell in ('test_nj', 'global'):
            conn_state = v['ZkCachedConn'][cell]
            if conn_state != 'Connected':
                self.fail('invalid zk %s state: %s' % (cell, conn_state))
        if v['TabletType'] != 'master':
            self.fail('TabletType not exported correctly')

    tablet_62344.kill_vttablet()
def test_service_switch(self):
    """tests the service switch from disable -> enable -> disable.

    While the tablet is a spare, the exported UpdateStreamState must be
    'Disabled' and opening a stream must raise a DatabaseError mentioning
    NOT_SERVING.  After switching back to replica the state must read
    'Enabled'.
    """
    # Make the replica spare.
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')

    # Check UpdateStreamState is disabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Disabled':
        self.fail(
            "Update stream service should be 'Disabled' but is '%s'" %
            v['UpdateStreamState'])

    # Make sure we can't start a new request.
    start_position = _get_repl_current_position()
    replica_conn = self._get_replica_stream_conn()
    try:
        for event in replica_conn.stream_update('test_keyspace', '0',
                                                topodata_pb2.REPLICA,
                                                position=start_position):
            # unittest.TestCase has no assertFail(); fail() is the method
            # that reports an unconditional failure.
            self.fail('got event: %s' % str(event))
        self.fail('stream_update terminated with no exception')
    except dbexceptions.DatabaseError as e:
        self.assertIn('operation not allowed in state NOT_SERVING', str(e))

    # Go back to replica.
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'replica')

    # Check UpdateStreamState is enabled.
    v = utils.get_vars(replica_tablet.port)
    if v['UpdateStreamState'] != 'Enabled':
        self.fail(
            "Update stream service should be 'Enabled' but is '%s'" %
            v['UpdateStreamState'])
class TestUpdateStream(unittest.TestCase):
    """Update stream tests run against the master and replica tablets."""

    # Four insert statements for vt_insert_test ('test 0' .. 'test 3').
    _populate_vt_insert_test = [
        "insert into vt_insert_test (msg) values ('test %s')" % x
        for x in range(4)]

    def _populate_vt_a(self, count):
        """Return insert statements for vt_a with ids 1..count inclusive."""
        return ['insert into vt_a (eid, id) values (%d, %d)' % (x, x)
                for x in range(1, count + 1)]

    def _populate_vt_b(self, count):
        """Return insert statements for vt_b with ids 0..count-1."""
        return [
            "insert into vt_b (eid, name, foo) values (%d, 'name %s', 'foo %s')"
            % (x, x, x)
            for x in range(count)]

    def _get_master_stream_conn(self):
        """Open an update stream connection to the master tablet."""
        protocol, endpoint = master_tablet.update_stream_python_endpoint()
        return update_stream.connect(protocol, endpoint, 30)

    def _get_replica_stream_conn(self):
        """Open an update stream connection to the replica tablet."""
        protocol, endpoint = replica_tablet.update_stream_python_endpoint()
        return update_stream.connect(protocol, endpoint, 30)

    def _test_service_disabled(self):
        """Check the update stream refuses requests while the tablet is spare.

        Any exception raised while opening the stream must mention that the
        service is not enabled, and the exported UpdateStreamState must be
        'Disabled'.
        """
        start_position = _get_repl_current_position()
        logging.debug('_test_service_disabled starting @ %s', start_position)
        self._exec_vt_txn(self._populate_vt_insert_test)
        self._exec_vt_txn(['delete from vt_insert_test'])
        utils.run_vtctl(
            ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
        utils.wait_for_tablet_type(
            replica_tablet.tablet_alias,
            tablet.Tablet.tablet_type_value['SPARE'])
        logging.debug('dialing replica update stream service')
        replica_conn = self._get_replica_stream_conn()
        try:
            # Only the first event matters: pulling it is what triggers the
            # error when the service is disabled.
            for _ in replica_conn.stream_update(start_position):
                break
        except Exception as e:
            logging.debug(str(e))
            self.assertIn('update stream service is not enabled', str(e))

        v = utils.get_vars(replica_tablet.port)
        if v['UpdateStreamState'] != 'Disabled':
            self.fail(
                "Update stream service should be 'Disabled' but is '%s'" %
                v['UpdateStreamState'])
def run_test_restart_during_action():
    """Restart vttablet while actions are queued, then check exported vars."""
    # Bring up a master mysql and vttablet.
    utils.run_vtctl('CreateKeyspace -force test_keyspace')
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    utils.run_vtctl('RebuildShardGraph test_keyspace/0')
    utils.validate_topology()
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.start_vttablet()
    alias = tablet_62344.tablet_alias
    utils.run_vtctl('Ping ' + alias)

    # Queue a long-running action.
    utils.run_vtctl('-no-wait Sleep %s 15s' % alias, stdout=devnull)
    # The ping blocks until the sleep finishes unless we have a schedule race.
    action_path, _ = utils.run_vtctl('-no-wait Ping ' + alias,
                                     trap_output=True)

    # Kill the agent (leaving vtaction running), then restart it.
    tablet_62344.kill_vttablet()
    tablet_62344.start_vttablet()

    # We expect this action with a short wait time to fail. This isn't the
    # best and has some potential for flakiness.
    utils.run_fail(
        utils.vtroot
        + '/bin/vtctl -logfile=/dev/null -log.level=WARNING -wait-time 2s WaitForAction '
        + action_path)

    # Wait until the background sleep action is done, otherwise there will be
    # a leftover vtaction whose result may overwrite running actions.
    # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
    # the zombie Sleep clobbers the Clone command in the following tests
    utils.run(
        utils.vtroot
        + '/bin/vtctl -logfile=/dev/null -log.level=WARNING -wait-time 20s WaitForAction '
        + action_path)

    # Extra small test: we ran for a while, so the time spent in each state
    # should be accounted for.  Query engine states first.
    v = utils.get_vars(tablet_62344.port)
    min_duration = 10e9
    open_duration = v['Voltron']['States']['DurationOPEN']
    if open_duration < min_duration:
        raise utils.TestError('not enough time in Open state', open_duration)

    # Then the Zookeeper connections.
    for cell in ('test_nj', 'global'):
        current = v['ZkMetaConn'][cell]['Current']
        if current != 'Connected':
            raise utils.TestError('invalid zk %s state: ' % cell, current)
    connected_duration = v['ZkMetaConn']['test_nj']['DurationConnected']
    if connected_duration < min_duration:
        raise utils.TestError('not enough time in Connected state',
                              connected_duration)

    tablet_62344.kill_vttablet()
def chain():
    """Run the requested service chain on a fetched document and render it.

    The chain identifier and the document URL come from the "id" and "data"
    request variables.  The document text is passed to the chain as a text
    payload and the outcome is rendered with the chain.html template.
    """
    chain_identifier, url = get_vars(request, ["id", "data"])
    info('chain=%s' % chain_identifier)
    selected_chain = LAPPS_SERVICE_CHAINS.get_chain(chain_identifier)
    info('source-url=%s' % url)

    # Download the document and wrap it in the text-media envelope.
    response = requests.get(url)
    envelope = {
        "discriminator": "http://vocab.lappsgrid.org/ns/media/text",
        "payload": response.text,
    }
    result = selected_chain.run(envelope)
    info("discriminator=%s" % result.get('discriminator'))

    return render_template(
        "chain.html",
        chain=selected_chain,
        fname=url,
        result=result,
        builder=HtmlBuilder())
def _build_network(self):
    """Assemble the TensorFlow graph and store its ops on ``self``.

    Creates the input placeholders, the 'main' and 'target' actor-critic
    networks, the Q and policy losses, one Adam training op per loss, and
    the two ops that update the target variables from the main ones
    (polyak-averaged update and plain copy).
    """
    obs_dim = self.obs_space.shape[0]
    act_dim = self.act_space.shape[0]
    (self.state_ph,
     self.act_ph,
     self.next_state_ph,
     self.rew_ph,
     self.done_ph) = placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs.
    with tf.variable_scope('main'):
        self.pi, self.q, q_pi = ddpg_mlp_actor_critic(
            self.state_ph,
            self.act_ph,
            action_space=self.act_space,
            hidden_sizes=self.hidden_sizes)

    # Target networks, fed with the next state.  Only q_pi_targ is needed
    # to compute the bellman backup.
    with tf.variable_scope('target'):
        _, _, q_pi_targ = ddpg_mlp_actor_critic(
            self.next_state_ph,
            self.act_ph,
            action_space=self.act_space,
            hidden_sizes=self.hidden_sizes)

    var_counts = tuple(
        [count_vars(scope) for scope in ('main/pi', 'main/q', 'main')])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman target backup; gradients do not flow through it.
    not_done = 1 - self.done_ph
    backup = tf.stop_gradient(
        self.rew_ph + self.gamma * not_done * q_pi_targ)

    # Objectives.
    td_error = self.q - backup
    self.q_loss = tf.reduce_mean(td_error**2)
    self.pi_loss = -tf.reduce_mean(q_pi)

    # Optimizers: each one only touches its own sub-network's variables.
    q_optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
    self.train_q_opt = q_optimizer.minimize(
        self.q_loss, var_list=get_vars('main/q'))
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
    self.train_pi_opt = pi_optimizer.minimize(
        self.pi_loss, var_list=get_vars('main/pi'))

    # Polyak averaging for target variables.
    polyak_assigns = []
    for v_target, v_main in zip(get_vars('target'), get_vars('main')):
        blended = self.polyak * v_target + (1 - self.polyak) * v_main
        polyak_assigns.append(tf.assign(v_target, blended))
    self.target_update = tf.group(polyak_assigns)

    # Init: copy the main variables into the target ones.
    copy_assigns = []
    for v_target, v_main in zip(get_vars('target'), get_vars('main')):
        copy_assigns.append(tf.assign(v_target, v_main))
    self.target_init = tf.group(copy_assigns)
def wait_for_vttablet_state(self, expected, timeout=60.0, port=None):
    """Poll /debug/vars until the tablet reports the expected state.

    Args:
      expected: the TabletStateName value to wait for.
      timeout: remaining time budget, handed to utils.wait_step on every
        unsuccessful poll.
      port: port to query; defaults to self.port when falsy.
    """
    while True:
        v = utils.get_vars(port or self.port)
        # Identity comparison: get_vars signals "no answer" with None.
        if v is None:
            logging.debug(
                " vttablet %s not answering at /debug/vars, waiting...",
                self.tablet_alias)
        elif 'Voltron' not in v:
            logging.debug(
                " vttablet %s not exporting Voltron, waiting...",
                self.tablet_alias)
        else:
            s = v["TabletStateName"]
            if s == expected:
                break
            logging.debug(
                " vttablet %s in state %s != %s",
                self.tablet_alias, s, expected)
        timeout = utils.wait_step(
            'waiting for state %s' % expected, timeout, sleep_time=0.1)
def test_tablet_restart(self):
    """Switch the tablet to replica, insert rows and read invalidation stats."""
    utils.debug("===========test_tablet_restart=========")
    utils.run_vtctl('ChangeSlaveType test_nj-0000062345 replica')
    time.sleep(5)
    perform_insert(100)
    time.sleep(5)

    # Checkpoint reported by the cache invalidation processor after the
    # inserts.
    debug_vars = json.load(
        urllib2.urlopen("http://%s/debug/vars" % replica_host))
    invalidatorStats = framework.MultiDict(debug_vars)[
        'CacheInvalidationProcessor']
    checkpoint1 = invalidatorStats['Checkpoint']
    utils.debug("invalidatorStats %s checkpoint1 %s" %
                (invalidatorStats, checkpoint1))

    cache_counters = framework.MultiDict(
        utils.get_vars(replica_tablet.port))['CacheCounters']
    utils.debug("cache counters %s" % cache_counters)
    # The counter is treated as zero when it is not exported.
    # NOTE(review): the value is not used in the visible part of this test.
    try:
        purge_cache_counter = cache_counters['PurgeCache']
    except KeyError:
        purge_cache_counter = 0
def wait_for_binlog_player_count(self, expected, timeout=30.0):
    """Poll /debug/vars until BinlogPlayerMapSize equals ``expected``.

    Args:
      expected: the number of binlog players to wait for.
      timeout: remaining time budget, handed to utils.wait_step on every
        unsuccessful poll.
    """
    while True:
        v = utils.get_vars(self.port)
        # Identity comparison: get_vars signals "no answer" with None.
        if v is None:
            logging.debug(
                " vttablet not answering at /debug/vars, waiting...")
        elif 'BinlogPlayerMapSize' not in v:
            logging.debug(
                " vttablet not exporting BinlogPlayerMapSize, waiting...")
        else:
            s = v['BinlogPlayerMapSize']
            if s == expected:
                break
            logging.debug(
                " vttablet's binlog player map has count %u != %u",
                s, expected)
        timeout = utils.wait_step(
            'waiting for binlog player count %d' % expected,
            timeout, sleep_time=0.5)
    logging.debug("tablet %s binlog player has %d players",
                  self.tablet_alias, expected)
def _check_stream_health_equals_binlog_player_vars(self, tablet_obj):
    """Compare the streamed health stats with the exported binlog vars."""
    blp_stats = utils.get_vars(tablet_obj.port)

    # Enforce health check because it's not running by default as
    # tablets are not started with it.
    alias = tablet_obj.tablet_alias
    utils.run_vtctl(['RunHealthCheck', alias, 'replica'])
    stream_health = utils.run_vtctl_json(
        ['VtTabletStreamHealth', '-count', '1', alias])
    logging.debug('Got health: %s', str(stream_health))

    self.assertNotIn('serving', stream_health)
    self.assertIn('realtime_stats', stream_health)
    realtime_stats = stream_health['realtime_stats']
    self.assertNotIn('health_error', realtime_stats)
    # count is > 0 and therefore not omitted by the Go JSON marshaller.
    self.assertIn('binlog_players_count', realtime_stats)

    self.assertEqual(
        blp_stats['BinlogPlayerMapSize'],
        realtime_stats['binlog_players_count'])
    # The filtered replication lag defaults to 0 when it is absent.
    streamed_lag = realtime_stats.get(
        'seconds_behind_master_filtered_replication', 0)
    self.assertEqual(
        blp_stats['BinlogPlayerSecondsBehindMaster'], streamed_lag)
def wait_for_binlog_server_state(self, expected, timeout=30.0):
    """Poll /debug/vars until UpdateStreamState equals ``expected``.

    Args:
      expected: the UpdateStreamState value to wait for.
      timeout: remaining time budget, handed to utils.wait_step on every
        unsuccessful poll.
    """
    while True:
        v = utils.get_vars(self.port)
        # Identity comparison: get_vars signals "no answer" with None.
        if v is None:
            logging.debug(
                " vttablet not answering at /debug/vars, waiting...")
        elif 'UpdateStreamState' not in v:
            # Name the variable that is actually checked.
            logging.debug(
                " vttablet not exporting UpdateStreamState, waiting...")
        else:
            s = v['UpdateStreamState']
            if s == expected:
                break
            logging.debug(
                " vttablet's binlog server in state %s != %s", s, expected)
        timeout = utils.wait_step(
            'waiting for binlog server state %s' % expected,
            timeout, sleep_time=0.5)
    logging.debug("tablet %s binlog service is in state %s",
                  self.tablet_alias, expected)
def wait_for_vttablet_state(self, expected, timeout=5.0, port=None):
    """Poll /debug/vars until the Voltron state equals ``expected``.

    Args:
      expected: the value of Voltron/States/Current to wait for.
      timeout: time budget; 0.1 is subtracted after every unsuccessful poll.
      port: port to query; defaults to self.port when falsy.

    Raises:
      utils.TestError: if the budget runs out before the state is reached.
    """
    while True:
        v = utils.get_vars(port or self.port)
        # Identity comparison: get_vars signals "no answer" with None.
        if v is None:
            logging.debug(
                " vttablet %s not answering at /debug/vars, waiting...",
                self.tablet_alias)
        elif 'Voltron' not in v:
            logging.debug(
                " vttablet %s not exporting Voltron, waiting...",
                self.tablet_alias)
        else:
            s = v['Voltron']['States']['Current']
            if s == expected:
                break
            logging.debug(
                " vttablet %s in state %s != %s",
                self.tablet_alias, s, expected)

        logging.debug("sleeping a bit while we wait")
        time.sleep(0.1)
        timeout -= 0.1
        if timeout <= 0:
            raise utils.TestError(
                "timeout waiting for state %s" % expected)
def _test_service_disabled(self):
    """Check the update stream refuses requests while the tablet is spare.

    A DatabaseError raised while opening the stream must mention that the
    service is not enabled, and the exported UpdateStreamState must be
    'Disabled'.
    """
    start_position = _get_repl_current_position()
    logging.debug('_test_service_disabled starting @ %s', start_position)

    # Generate some transactions before the service is switched off.
    self._exec_vt_txn(self._populate_vt_insert_test)
    self._exec_vt_txn(['delete from vt_insert_test'])

    change_type_cmd = ['ChangeSlaveType', replica_tablet.tablet_alias,
                       'spare']
    utils.run_vtctl(change_type_cmd)
    utils.wait_for_tablet_type(replica_tablet.tablet_alias, 'spare')

    logging.debug('dialing replica update stream service')
    replica_conn = self._get_replica_stream_conn()
    try:
        # Pulling the first event is enough to trigger the error.
        for _ in replica_conn.stream_update(start_position):
            break
    except dbexceptions.DatabaseError as e:
        self.assertIn('update stream service is not enabled', str(e))
    replica_conn.close()

    exported = utils.get_vars(replica_tablet.port)
    if exported['UpdateStreamState'] != 'Disabled':
        self.fail(
            "Update stream service should be 'Disabled' but is '%s'"
            % exported['UpdateStreamState'])
def run_test_zkocc_qps():
    """Load zkocc with a QPS client for ten seconds and check its stats.

    The zkocc process is stopped even when one of the checks fails, so a
    failing run does not leave it behind.

    Raises:
      utils.TestError: if the connection state, the time spent connected,
        the RPC or cache read rates, the rollup counters or the error
        counter are not as expected.
    """
    _populate_zk()

    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start()
    try:
        qpser = utils.run_bg(
            utils.vtroot +
            '/bin/zkclient2 -server localhost:%u -mode qps /zk/test_nj/zkocc1/data1 /zk/test_nj/zkocc1/data2'
            % utils.zkocc_port_base)
        time.sleep(10)
        utils.kill_sub_process(qpser)

        # get the zkocc vars, make sure we have what we need
        v = utils.get_vars(utils.zkocc_port_base)
        cell_state = v['ZkReader']['test_nj']['State']
        if cell_state['Current'] != 'Connected':
            raise utils.TestError('invalid zk test_nj state: ',
                                  cell_state['Current'])
        if cell_state['DurationConnected'] < 9e9:
            raise utils.TestError('not enough time in Connected state',
                                  cell_state['DurationConnected'])

        # some checks on performance / stats
        # a typical workstation will do 15k QPS, check we have more than 3k
        # (the counters below cover the ten second run, hence 30000).
        rpcCalls = v['ZkReader']['RpcCalls']
        if rpcCalls < 30000:
            raise utils.TestError(
                'QPS is too low: %u < 3000' % (rpcCalls // 10))
        cacheReads = v['ZkReader']['test_nj']['CacheReads']
        if cacheReads < 30000:
            raise utils.TestError(
                'Cache QPS is too low: %u < 3000' % (cacheReads // 10))
        totalCacheReads = v['ZkReader']['total']['CacheReads']
        if cacheReads != totalCacheReads:
            raise utils.TestError(
                'Rollup stats are wrong: %u != %u'
                % (cacheReads, totalCacheReads))
        if v['ZkReader']['UnknownCellErrors'] != 0:
            raise utils.TestError('unexpected UnknownCellErrors',
                                  v['ZkReader']['UnknownCellErrors'])
    finally:
        utils.zkocc_kill(zkocc_14850)
def _wait_for_binlog_player_count(self, tablet, expected, timeout=5.0):
    """Poll the tablet's /debug/vars until BinlogPlayerMapSize is ``expected``.

    Args:
      tablet: the tablet to query (its port and tablet_alias are used).
      expected: the number of binlog players to wait for.
      timeout: time budget; 0.1 is subtracted after every unsuccessful poll
        and the test fails once it is used up.
    """
    while True:
        v = utils.get_vars(tablet.port)
        # Identity comparison: get_vars signals "no answer" with None.
        if v is None:
            logging.debug(
                " vttablet not answering at /debug/vars, waiting...")
        elif 'BinlogPlayerMapSize' not in v:
            logging.debug(
                " vttablet not exporting BinlogPlayerMapSize, waiting...")
        else:
            s = v['BinlogPlayerMapSize']
            if s == expected:
                break
            logging.debug(
                " vttablet's binlog player map has count %u != %u",
                s, expected)

        logging.debug("sleeping a bit while we wait")
        time.sleep(0.1)
        timeout -= 0.1
        if timeout <= 0:
            self.fail(
                "timeout waiting for binlog player count %d" % expected)
    logging.debug("tablet %s binlog player has %d players",
                  tablet.tablet_alias, expected)
def _check_stats(self):
    """Assert the vtgate debug variables hold the values this test expects."""
    v = utils.get_vars(self.vtgate_port)
    master_key = 'test_nj.source_keyspace.0.master'

    replica_calls = v['VttabletCall']['Histograms'][
        'Execute.source_keyspace.0.replica']['Count']
    self.assertEqual(
        replica_calls, 2,
        "unexpected value for VttabletCall(Execute.source_keyspace.0.replica) inside %s" % str(v))

    master_api_calls = v['VtgateApi']['Histograms'][
        'ExecuteKeyRanges.destination_keyspace.master']['Count']
    self.assertEqual(
        master_api_calls, 6,
        "unexpected value for VtgateApi(ExecuteKeyRanges.destination_keyspace.master) inside %s" % str(v))

    error_count = len(v['VtgateApiErrorCounts'])
    self.assertEqual(
        error_count, 0,
        "unexpected errors for VtgateApiErrorCounts inside %s" % str(v))

    # One endpoint, none of them degraded, for the source master.
    self.assertEqual(
        v['EndpointCount'][master_key], 1,
        "unexpected EndpointCount inside %s" % str(v))
    self.assertEqual(
        v['DegradedEndpointCount'][master_key], 0,
        "unexpected DegradedEndpointCount inside %s" % str(v))
def _wait_for_binlog_server_state(self, tablet, expected, timeout=5.0):
    """Poll the tablet's /debug/vars until BinlogServerState is ``expected``.

    Args:
      tablet: the tablet to query (its port and tablet_alias are used).
      expected: the value of BinlogServerState/Current to wait for.
      timeout: time budget; 0.1 is subtracted after every unsuccessful poll
        and the test fails once it is used up.
    """
    while True:
        v = utils.get_vars(tablet.port)
        # Identity comparison: get_vars signals "no answer" with None.
        if v is None:
            logging.debug(
                " vttablet not answering at /debug/vars, waiting...")
        elif 'BinlogServerState' not in v:
            logging.debug(
                " vttablet not exporting BinlogServerState, waiting...")
        else:
            s = v['BinlogServerState']['Current']
            if s == expected:
                break
            logging.debug(
                " vttablet's binlog server in state %s != %s", s, expected)

        logging.debug("sleeping a bit while we wait")
        time.sleep(0.1)
        timeout -= 0.1
        if timeout <= 0:
            self.fail(
                "timeout waiting for binlog server state %s" % expected)
    logging.debug("tablet %s binlog service is in state %s",
                  tablet.tablet_alias, expected)
def _check_binlog_player_vars(self, tablet, seconds_behind_master_max=0):
    """Check the binlog player variables exported by the tablet.

    Args:
      tablet: the tablet whose /debug/vars are inspected.
      seconds_behind_master_max: if non-zero, both the global and the
        per-player ('0') lag must be strictly below this value.
    """
    v = utils.get_vars(tablet.port)
    # assertIn reports the missing key and the container on failure.
    self.assertIn('BinlogPlayerMapSize', v)
    self.assertIn('BinlogPlayerSecondsBehindMaster', v)
    self.assertIn('BinlogPlayerSecondsBehindMasterMap', v)
    self.assertIn('BinlogPlayerSourceShardNameMap', v)
    self.assertIn('0', v['BinlogPlayerSourceShardNameMap'])
    self.assertEqual(v['BinlogPlayerSourceShardNameMap']['0'],
                     'test_keyspace/80-')
    self.assertIn('BinlogPlayerSourceTabletAliasMap', v)
    self.assertIn('0', v['BinlogPlayerSourceTabletAliasMap'])
    if seconds_behind_master_max != 0:
        self.assertTrue(
            v['BinlogPlayerSecondsBehindMaster'] < seconds_behind_master_max,
            'BinlogPlayerSecondsBehindMaster is too high: %u > %u' % (
                v['BinlogPlayerSecondsBehindMaster'],
                seconds_behind_master_max))
        self.assertTrue(
            v['BinlogPlayerSecondsBehindMasterMap']['0'] <
            seconds_behind_master_max,
            'BinlogPlayerSecondsBehindMasterMap is too high: %u > %u' % (
                v['BinlogPlayerSecondsBehindMasterMap']['0'],
                seconds_behind_master_max))
def test_tablet_restart(self):
    """Switch the tablet to replica, insert rows and read invalidation stats."""
    utils.debug("===========test_tablet_restart=========")
    utils.run_vtctl('ChangeSlaveType test_nj-0000062345 replica')
    time.sleep(5)
    perform_insert(100)
    time.sleep(5)

    # Checkpoint reported by the cache invalidation processor after the
    # inserts.
    debug_vars = json.load(
        urllib2.urlopen("http://%s/debug/vars" % replica_host))
    invalidatorStats = framework.MultiDict(debug_vars)[
        'CacheInvalidationProcessor']
    checkpoint1 = invalidatorStats['Checkpoint']
    utils.debug("invalidatorStats %s checkpoint1 %s" %
                (invalidatorStats, checkpoint1))

    cache_counters = framework.MultiDict(
        utils.get_vars(replica_tablet.port))['CacheCounters']
    utils.debug("cache counters %s" % cache_counters)
    # The counter is treated as zero when it is not exported.
    # NOTE(review): the value is not used in the visible part of this test.
    try:
        purge_cache_counter = cache_counters['PurgeCache']
    except KeyError:
        purge_cache_counter = 0
def verify_reconciliation_counters(self, worker_port, online_or_offline,
                                   table, inserts, updates, deletes):
    """Checks that the reconciliation Counters have the expected values.

    For each of the inserts / updates / deletes counters of the given
    phase, an expected value of 0 means the table must be absent from the
    counter map; any other value must match the map entry exactly.
    """
    worker_vars = utils.get_vars(worker_port)
    prefix = 'Worker' + online_or_offline
    expectations = (
        ('Inserts', inserts),
        ('Updates', updates),
        ('Deletes', deletes),
    )
    for operation, expected in expectations:
        counters = worker_vars[prefix + operation + 'Counters']
        if expected == 0:
            self.assertNotIn(table, counters)
        else:
            self.assertEqual(counters[table], expected)
class TestUpdateStream(unittest.TestCase):
    """Update stream tests run against the master and replica hosts."""

    def _get_master_stream_conn(self):
        """Create an update stream connection object for the master host."""
        #return update_stream_service.UpdateStreamConnection(master_host, 30, user="******", password="******")
        return update_stream_service.UpdateStreamConnection(master_host, 30)

    def _get_replica_stream_conn(self):
        """Create an update stream connection object for the replica host."""
        #return update_stream_service.UpdateStreamConnection(replica_host, 30, user="******", password="******")
        return update_stream_service.UpdateStreamConnection(replica_host, 30)

    def _test_service_disabled(self):
        """Check the update stream refuses requests while the tablet is spare.

        Any exception raised by stream_start must carry exactly the
        "not enabled yet" message, and the exported UpdateStreamState must
        read 'Disabled'.
        """
        start_position = _get_repl_current_position()
        logging.debug("_test_service_disabled starting @ %s", start_position)
        self._exec_vt_txn(master_host, _populate_vt_insert_test)
        self._exec_vt_txn(master_host, ['delete from vt_insert_test'])
        utils.run_vtctl(
            ['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])
        # time.sleep(20)
        replica_conn = self._get_replica_stream_conn()
        logging.debug("dialing replica update stream service")
        replica_conn.dial()
        try:
            # The returned values are not inspected; the unpacking is kept
            # so an unexpected return shape is still reported as a failure.
            _binlog_pos, _data, _err = replica_conn.stream_start(
                start_position)
        except Exception as e:
            logging.debug(str(e))
            if str(e) == "Update stream service is not enabled yet":
                logging.debug("Test Service Disabled: Pass")
            else:
                self.fail(
                    "Test Service Disabled: Fail - did not throw the correct exception"
                )

        v = utils.get_vars(replica_tablet.port)
        if v['UpdateStreamState']['Current'] != 'Disabled':
            self.fail(
                "Update stream service should be 'Disabled' but is '%s'" %
                v['UpdateStreamState']['Current'])
def wait_for_binlog_server_state(self, expected, timeout=30.0):
    """Poll /debug/vars until UpdateStreamState equals ``expected``.

    Args:
      expected: the UpdateStreamState value to wait for.
      timeout: time budget; 0.5 is subtracted after every unsuccessful poll.

    Raises:
      utils.TestError: if the budget runs out before the state is reached.
    """
    while True:
        v = utils.get_vars(self.port)
        # Identity comparison: get_vars signals "no answer" with None.
        if v is None:
            logging.debug(
                " vttablet not answering at /debug/vars, waiting...")
        elif 'UpdateStreamState' not in v:
            # Name the variable that is actually checked.
            logging.debug(
                " vttablet not exporting UpdateStreamState, waiting...")
        else:
            s = v['UpdateStreamState']
            if s == expected:
                break
            logging.debug(
                " vttablet's binlog server in state %s != %s", s, expected)

        logging.debug("sleeping a bit while we wait")
        time.sleep(0.5)
        timeout -= 0.5
        if timeout <= 0:
            raise utils.TestError(
                "timeout waiting for binlog server state %s" % expected)
    logging.debug("tablet %s binlog service is in state %s",
                  self.tablet_alias, expected)
def check_binlog_player_vars(self, tablet_obj, source_shards,
                             seconds_behind_master_max=0):
    """Checks the binlog player variables are correctly exported.

    Args:
      tablet_obj: the tablet to check.
      source_shards: the shards to check we are replicating from.
      seconds_behind_master_max: if non-zero, the lag should be smaller than
        this value.
    """
    v = utils.get_vars(tablet_obj.port)
    player_count = len(source_shards)
    self.assertIn('BinlogPlayerMapSize', v)
    # assertEqual is the supported spelling; assertEquals is a deprecated
    # alias.
    self.assertEqual(v['BinlogPlayerMapSize'], player_count)
    self.assertIn('BinlogPlayerSecondsBehindMaster', v)
    self.assertIn('BinlogPlayerSecondsBehindMasterMap', v)
    self.assertIn('BinlogPlayerSourceShardNameMap', v)
    # The exported shard names must match source_shards, in any order.
    shards = v['BinlogPlayerSourceShardNameMap'].values()
    self.assertEqual(sorted(shards), sorted(source_shards))
    self.assertIn('BinlogPlayerSourceTabletAliasMap', v)
    # Players are keyed by their index rendered as a decimal string.
    for i in range(player_count):
        self.assertIn('%d' % i, v['BinlogPlayerSourceTabletAliasMap'])
    if seconds_behind_master_max != 0:
        self.assertTrue(
            v['BinlogPlayerSecondsBehindMaster'] <
            seconds_behind_master_max,
            'BinlogPlayerSecondsBehindMaster is too high: %d > %d' %
            (v['BinlogPlayerSecondsBehindMaster'],
             seconds_behind_master_max))
        for i in range(player_count):
            self.assertTrue(
                v['BinlogPlayerSecondsBehindMasterMap']['%d' % i] <
                seconds_behind_master_max,
                'BinlogPlayerSecondsBehindMasterMap is too high: %d > %d' %
                (v['BinlogPlayerSecondsBehindMasterMap']['%d' % i],
                 seconds_behind_master_max))
def check_binlog_player_vars(self, tablet_obj, source_shards,
                             seconds_behind_master_max=0):
    """Checks the binlog player variables are correctly exported.

    Args:
      tablet_obj: the tablet to check.
      source_shards: the shards to check we are replicating from.
      seconds_behind_master_max: if non-zero, the lag should be smaller than
        this value.
    """
    v = utils.get_vars(tablet_obj.port)
    self.assertIn('VReplicationStreamCount', v)
    # assertEqual is the supported spelling; assertEquals is a deprecated
    # alias.
    self.assertEqual(v['VReplicationStreamCount'], len(source_shards))
    self.assertIn('VReplicationSecondsBehindMasterMax', v)
    self.assertIn('VReplicationSecondsBehindMaster', v)
    self.assertIn('VReplicationSource', v)
    # The exported sources must match source_shards, in any order.
    shards = v['VReplicationSource'].values()
    self.assertEqual(sorted(shards), sorted(source_shards))
    self.assertIn('VReplicationSourceTablet', v)
    # Every stream with a source must also export its source tablet.
    for uid in v['VReplicationSource']:
        self.assertIn(uid, v['VReplicationSourceTablet'])
    if seconds_behind_master_max != 0:
        self.assertTrue(
            v['VReplicationSecondsBehindMasterMax'] <
            seconds_behind_master_max,
            'VReplicationSecondsBehindMasterMax is too high: %d > %d' %
            (v['VReplicationSecondsBehindMasterMax'],
             seconds_behind_master_max))
        for uid in v['VReplicationSource']:
            self.assertTrue(
                v['VReplicationSecondsBehindMaster'][uid] <
                seconds_behind_master_max,
                'VReplicationSecondsBehindMaster is too high: %d > %d' %
                (v['VReplicationSecondsBehindMaster'][uid],
                 seconds_behind_master_max))
else: optimizer = tf.train.AdamOptimizer( learning_rate=args.learning_rate_state_value, epsilon=args.adam_eps) if args.grad_clip > 0: valGradients, valVaribales = zip( *optimizer.compute_gradients(stateValueLoss)) valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clip) svfOptimizationStep = optimizer.apply_gradients( zip(valGradients, valVaribales)) else: svfOptimizationStep = optimizer.minimize(stateValueLoss) #other ops policyParams = utils.get_vars(policyParamsScope) getPolicyParams = utils.flat_concat(policyParams) setPolicyParams = utils.assign_params_from_flat(policyParamsFlatten, policyParams) d, HxOp = utils.hesian_vector_product(KLcontraint, policyParams) surrogateFlatLoss = utils.flat_grad(Lloss, policyParams) if args.damping_coef > 0: HxOp += args.damping_coef * d #tf session initialization init = tf.initialize_local_variables() init2 = tf.initialize_all_variables() sess.run([init, init2])
def _check_binlog_server_vars(self, tablet_obj):
    """Check the tablet exports the update stream key range counters."""
    exported = utils.get_vars(tablet_obj.port)
    required_keys = (
        'UpdateStreamKeyRangeStatements',
        'UpdateStreamKeyRangeTransactions',
    )
    for key in required_keys:
        self.assertIn(key, exported)
def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry
       writes due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
    if mysql_down:
        logging.debug('Shutting down mysqld on destination masters.')
        utils.wait_procs([
            shard_0_master.shutdown_mysql(),
            shard_1_master.shutdown_mysql()
        ])
    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'], auto_log=True)

    # --max_tps is only specified to enable the throttler and ensure that the
    # code is executed. But the intent here is not to throttle the test, hence
    # the rate limit is set very high.
    # --chunk_count is 2 because rows are currently ordered by primary key such
    # that all rows of the first shard come first and then the second shard.
    # TODO(mberlin): Remove --offline=false once vtworker ensures that the
    #                destination shards are not behind the master's replication
    #                position.
    args = [
        'SplitClone',
        '--offline=false',
        '--destination_writer_count', '1',
        '--min_healthy_rdonly_tablets', '1',
        '--max_tps', '9999'
    ]
    if not mysql_down:
        # Make the clone as slow as necessary such that there is enough time to
        # run PlannedReparent in the meantime.
        # TODO(mberlin): Once insert_values is fixed to uniformly distribute
        #                the rows across shards when sorted by primary key,
        #                remove --chunk_count 2, --min_rows_per_chunk 1 and set
        #                --source_reader_count back to 1.
        args.extend([
            '--source_reader_count', '2',
            '--chunk_count', '2',
            '--min_rows_per_chunk', '1',
            '--write_query_max_rows', '1'
        ])
    args.append('test_keyspace/0')
    workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port)

    if mysql_down:
        # If MySQL is down, we wait until vtworker retried at least once to make
        # sure it reached the point where a write failed due to MySQL being
        # down. There should be two retries at least, one for each destination
        # shard.
        # The default of 0 keeps the comparison well-defined while the
        # variable is not exported yet (None >= 2 is a TypeError on Python 3).
        utils.poll_for_vars(
            'vtworker', worker_port,
            'WorkerRetryCount >= 2',
            condition_fn=lambda v: v.get('WorkerRetryCount', 0) >= 2)
        logging.debug(
            'Worker has retried at least twice, starting reparent now')

        # vtworker is blocked at this point. This is a good time to test that
        # its throttler server is reacting to RPCs.
        self.check_throttler_service(
            'localhost:%d' % worker_rpc_port,
            ['test_keyspace/-80', 'test_keyspace/80-'], 9999)

        # Bring back masters. Since we test with semi-sync now, we need at
        # least one replica for the new master. This test is already quite
        # expensive, so we bring back the old master as a replica rather than
        # having a third replica up the whole time.
        logging.debug('Restarting mysqld on destination masters')
        utils.wait_procs(
            [shard_0_master.start_mysql(), shard_1_master.start_mysql()])
    else:
        # NOTE: There is a race condition around this:
        #   It's possible that the SplitClone vtworker command finishes before
        #   the PlannedReparentShard vtctl command, which we start below,
        #   succeeds. Then the test would fail because vtworker did not have to
        #   retry.
        #
        # To workaround this, the test takes a parameter to increase the number
        # of rows that the worker has to copy (with the idea being to slow the
        # worker down).
        # You should choose a value for num_insert_rows, such that this test
        # passes for your environment (trial-and-error...)
        # Make sure that vtworker got past the point where it picked a master
        # for each destination shard ("finding targets" state).
        utils.poll_for_vars(
            'vtworker', worker_port,
            'WorkerState == cloning the data (online)',
            condition_fn=lambda v: (
                v.get('WorkerState') == 'cloning the data (online)'))
        logging.debug('Worker is in copy state, starting reparent now')

    # Reparent away from the old masters. Both branches above end with the
    # same two reparents, so they are issued once here, in the same order.
    for shard, new_master in (('test_keyspace/-80', shard_0_replica),
                              ('test_keyspace/80-', shard_1_replica)):
        utils.run_vtctl(
            ['PlannedReparentShard', shard, new_master.tablet_alias],
            auto_log=True)

    utils.wait_procs([workerclient_proc])

    # Verify that we were forced to re-resolve and retry.
    worker_vars = utils.get_vars(worker_port)
    self.assertGreater(
        worker_vars['WorkerRetryCount'], 1,
        "expected vtworker to retry each of the two reparented"
        " destination masters at least once, but it didn't")
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")
    utils.kill_sub_process(worker_proc, soft=True)

    # Wait for the destination RDONLYs to catch up or the following offline
    # clone will try to insert rows which already exist.
    # TODO(mberlin): Remove this once SplitClone supports it natively.
    utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1)
    utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1)

    # Run final offline clone to enable filtered replication.
    _, _ = utils.run_vtworker([
        '-cell', 'test_nj',
        'SplitClone',
        '--online=false',
        '--min_healthy_rdonly_tablets', '1',
        'test_keyspace/0'
    ], auto_log=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets)
    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
def test_event_token(self):
    """Checks the background binlog monitor thread works.

    First polls the replica tablet's exported variables until
    EventTokenPosition matches the current replication position and checks
    that EventTokenTimestamp is within the last 120 seconds. Then runs the
    same query directly against the tablet and through vtgate with the
    include_event_token and compare_event_token execute options, separately
    and combined, and checks the 'extras' returned with each result.
    """
    timeout = 10
    while True:
        replica_position = _get_repl_current_position()
        value = None
        v = utils.get_vars(replica_tablet.port)
        if 'EventTokenPosition' in v:
            value = v['EventTokenPosition']
        if value == replica_position:
            logging.debug('got expected EventTokenPosition vars: %s', value)
            ts = v['EventTokenTimestamp']
            # int() instead of the Python 2-only long(); same value on both.
            now = int(time.time())
            self.assertGreaterEqual(
                ts, now - 120,
                'EventTokenTimestamp is too old: %d < %d' % (ts, now - 120))
            self.assertLessEqual(
                ts, now,
                'EventTokenTimestamp is too recent: %d > %d' % (ts, now))
            break
        timeout = utils.wait_step(
            'EventTokenPosition must be up to date but got %s (expected %s)' %
            (value, replica_position), timeout)

    query = 'select * from vt_insert_test'

    def via_tablet(execute_options):
        # Runs the query directly on the replica tablet.
        return replica_tablet.execute(query, execute_options=execute_options)

    def via_vtgate(execute_options):
        # Runs the same query through vtgate, targeting the replica type.
        return utils.vtgate.execute(query, tablet_type='replica',
                                    execute_options=execute_options)

    # Every scenario is checked on the tablet first, then through vtgate.
    executors = (via_tablet, via_vtgate)

    def assert_event_token(qr):
        # The returned event token must carry the replica's position.
        self.assertIn('event_token', qr['extras'])
        self.assertEqual(qr['extras']['event_token']['position'],
                         replica_position)

    def assert_fresher(qr):
        # The result must be flagged as fresher than the token we sent.
        self.assertIn('extras', qr)
        self.assertIn('fresher', qr['extras'])
        self.assertTrue(qr['extras']['fresher'])

    # With vttablet up to date, test a query returns the EventToken.
    for execute in executors:
        qr = execute('include_event_token:true ')
        logging.debug('Got result: %s', qr)
        self.assertIn('extras', qr)
        assert_event_token(qr)

    # Make sure the compare_event_token flag works, by sending a very
    # old timestamp, or a timestamp in the future.
    for execute in executors:
        qr = execute('compare_event_token: <timestamp:123 > ')
        assert_fresher(qr)

        future_timestamp = int(time.time()) + 100
        qr = execute(
            'compare_event_token: <timestamp:%d > ' % future_timestamp)
        self.assertIsNone(qr['extras'])

    # Make sure the compare_event_token flag works, by sending a very
    # old timestamp, or a timestamp in the future, when combined with
    # include_event_token flag.
    for execute in executors:
        qr = execute('include_event_token:true '
                     'compare_event_token: <timestamp:123 > ')
        assert_fresher(qr)
        assert_event_token(qr)

        future_timestamp = int(time.time()) + 100
        qr = execute('include_event_token:true '
                     'compare_event_token: <timestamp:%d > ' %
                     future_timestamp)
        self.assertNotIn('fresher', qr['extras'])
        assert_event_token(qr)
def get_client_count(tablet):
    """Returns the 'sharded_message.ClientCount' value exported by a tablet.

    Args:
      tablet: the tablet whose exported variables are fetched (through its
        port).

    Returns:
      The 'sharded_message.ClientCount' entry of the exported 'Messages'
      variable, or 0 when that entry is absent.
    """
    messages = utils.get_vars(tablet.port)['Messages']
    return messages.get('sharded_message.ClientCount', 0)
def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry
       writes due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
    if mysql_down:
        logging.debug('Shutting down mysqld on destination masters.')
        utils.wait_procs(
            [shard_0_master.shutdown_mysql(),
             shard_1_master.shutdown_mysql()])
    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'], auto_log=True)

    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--source_reader_count', '1',
         '--destination_pack_count', '1',
         '--destination_writer_count', '1',
         'test_keyspace/0'],
        worker_rpc_port)

    if mysql_down:
        # If MySQL is down, we wait until resolving at least twice (to verify
        # that we do reresolve and retry due to MySQL being down).
        # The default of 0 keeps the comparison well-defined while the
        # variable is not exported yet (None >= 2 is a TypeError on Python 3).
        worker_vars = utils.poll_for_vars(
            'vtworker', worker_port,
            'WorkerDestinationActualResolves >= 2',
            condition_fn=lambda v: (
                v.get('WorkerDestinationActualResolves', 0) >= 2))
        self.assertNotEqual(
            worker_vars['WorkerRetryCount'], {},
            "expected vtworker to retry, but it didn't")
        logging.debug(
            'Worker has resolved at least twice, starting reparent now')

        # Bring back masters. Since we test with semi-sync now, we need at
        # least one replica for the new master. This test is already quite
        # expensive, so we bring back the old master as a replica rather than
        # having a third replica up the whole time.
        logging.debug('Restarting mysqld on destination masters')
        utils.wait_procs(
            [shard_0_master.start_mysql(), shard_1_master.start_mysql()])
    else:
        # NOTE: There is a race condition around this:
        #   It's possible that the SplitClone vtworker command finishes before
        #   the PlannedReparentShard vtctl command, which we start below,
        #   succeeds. Then the test would fail because vtworker did not have to
        #   resolve the master tablet again (due to the missing reparent).
        #
        # To workaround this, the test takes a parameter to increase the number
        # of rows that the worker has to copy (with the idea being to slow the
        # worker down).
        # You should choose a value for num_insert_rows, such that this test
        # passes for your environment (trial-and-error...)
        utils.poll_for_vars(
            'vtworker', worker_port,
            'WorkerDestinationActualResolves >= 1',
            condition_fn=lambda v: (
                v.get('WorkerDestinationActualResolves', 0) >= 1))
        logging.debug(
            'Worker has resolved at least once, starting reparent now')

    # Reparent away from the old masters. Both branches above end with the
    # same two reparents, so they are issued once here, in the same order.
    for shard, new_master in (('test_keyspace/-80', shard_0_replica),
                              ('test_keyspace/80-', shard_1_replica)):
        utils.run_vtctl(
            ['PlannedReparentShard', shard, new_master.tablet_alias],
            auto_log=True)

    utils.wait_procs([workerclient_proc])

    # Verify that we were forced to reresolve and retry.
    worker_vars = utils.get_vars(worker_port)
    self.assertGreater(worker_vars['WorkerDestinationActualResolves'], 1)
    self.assertGreater(worker_vars['WorkerDestinationAttemptedResolves'], 1)
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")
    utils.kill_sub_process(worker_proc, soft=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets)
    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
def _check_binlog_server_vars(self, tablet, timeout=5.0):
    """Asserts that the update stream key range variables are exported.

    Args:
      tablet: the tablet whose exported variables are fetched (through its
        port) and checked.
      timeout: not used by this check; kept in the signature so callers that
        pass it keep working.
    """
    v = utils.get_vars(tablet.port)
    # assertIn reports the missing key and the container on failure, unlike
    # assertTrue(key in v) which only reports "False is not true".
    self.assertIn("UpdateStreamKeyRangeStatements", v)
    self.assertIn("UpdateStreamKeyRangeTransactions", v)