Example #1
  def test_restart_during_action(self):
    # Start up a master mysql and vttablet
    utils.run_vtctl('CreateKeyspace test_keyspace')

    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    utils.run_vtctl('RebuildShardGraph test_keyspace/0')
    utils.validate_topology()
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.start_vttablet()

    utils.run_vtctl('Ping ' + tablet_62344.tablet_alias)

    # schedule long action
    utils.run_vtctl('-no-wait Sleep %s 15s' % tablet_62344.tablet_alias, stdout=utils.devnull)
    # ping blocks until the sleep finishes unless we have a schedule race
    action_path, _ = utils.run_vtctl('-no-wait Ping ' + tablet_62344.tablet_alias, trap_output=True)

    # kill agent leaving vtaction running
    tablet_62344.kill_vttablet()

    # restart agent
    tablet_62344.start_vttablet()

    # we expect this action with a short wait time to fail. this isn't the
    # best check and has some potential for flakiness.
    utils.run_fail(utils.vtroot+'/bin/vtctl -log_dir '+utils.tmp_root+' --alsologtostderr -wait-time 2s WaitForAction ' + action_path)

    # wait until the background sleep action is done, otherwise there will be
    # a leftover vtaction whose result may overwrite running actions
    # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
    # the zombie Sleep clobbers the Clone command in the following tests
    utils.run_vtctl('-wait-time 20s WaitForAction ' + action_path,
                    auto_log=True)

    # extra small test: we ran for a while, get the states we were in,
    # make sure they're accounted for properly
    # first the query engine States
    v = utils.get_vars(tablet_62344.port)
    logging.debug("vars: %s" % str(v))
    if v['Voltron']['States']['DurationSERVING'] < 10e9:
      raise utils.TestError('not enough time in SERVING state', v['Voltron']['States']['DurationSERVING'])
    # then the Zookeeper connections
    if v['ZkMetaConn']['test_nj']['Current'] != 'Connected':
      raise utils.TestError('invalid zk test_nj state: ', v['ZkMetaConn']['test_nj']['Current'])
    if v['ZkMetaConn']['global']['Current'] != 'Connected':
      raise utils.TestError('invalid zk global state: ', v['ZkMetaConn']['global']['Current'])
    if v['ZkMetaConn']['test_nj']['DurationConnected'] < 10e9:
      raise utils.TestError('not enough time in Connected state', v['ZkMetaConn']['test_nj']['DurationConnected'])
    if v['TabletType'] != 'master':
      raise utils.TestError('TabletType not exported correctly')

    tablet_62344.kill_vttablet()
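
A note on the harness: this example leans on utils.run_vtctl and utils.run_fail
to drive commands and to assert that a command fails. A minimal sketch of what
such helpers are assumed to do follows; the names, signatures, and the direct
vtctl invocation are illustrative assumptions, not the real test library.

import shlex
import subprocess

def run_vtctl(args, trap_output=False, **_ignored):
    # Run vtctl and fail loudly on a non-zero exit (hypothetical stand-in
    # for utils.run_vtctl; extra harness options like stdout/auto_log are
    # ignored in this sketch).
    cmd = ['vtctl'] + shlex.split(args)
    result = subprocess.run(cmd, capture_output=trap_output, text=True)
    if result.returncode != 0:
        raise RuntimeError('vtctl failed: %r' % cmd)
    return result.stdout, result.stderr

def run_fail(cmd):
    # Run a command that is expected to fail; raise if it succeeds
    # (hypothetical stand-in for utils.run_fail).
    result = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
    if result.returncode == 0:
        raise RuntimeError('expected failure but command succeeded: %s' % cmd)
    return result.stdout, result.stderr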
Example #3
def run_test_restart_during_action():
    # Start up a master mysql and vttablet
    utils.run_vtctl("CreateKeyspace -force test_keyspace")

    tablet_62344.init_tablet("master", "test_keyspace", "0")
    utils.run_vtctl("RebuildShardGraph test_keyspace/0")
    utils.validate_topology()
    tablet_62344.create_db("vt_test_keyspace")
    tablet_62344.start_vttablet()

    utils.run_vtctl("Ping " + tablet_62344.tablet_alias)

    # schedule long action
    utils.run_vtctl("-no-wait Sleep %s 15s" % tablet_62344.tablet_alias, stdout=devnull)
    # ping blocks until the sleep finishes unless we have a schedule race
    action_path, _ = utils.run_vtctl("-no-wait Ping " + tablet_62344.tablet_alias, trap_output=True)

    # kill agent leaving vtaction running
    tablet_62344.kill_vttablet()

    # restart agent
    tablet_62344.start_vttablet()

    # we expect this action with a short wait time to fail. this isn't the
    # best check and has some potential for flakiness.
    utils.run_fail(utils.vtroot + "/bin/vtctl --alsologtostderr -wait-time 2s WaitForAction " + action_path)

    # wait until the background sleep action is done, otherwise there will be
    # a leftover vtaction whose result may overwrite running actions
    # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
    # the zombie Sleep clobbers the Clone command in the following tests
    utils.run(utils.vtroot + "/bin/vtctl --alsologtostderr -wait-time 20s WaitForAction " + action_path)

    # extra small test: we ran for a while, get the states we were in,
    # make sure they're accounted for properly
    # first the query engine States
    v = utils.get_vars(tablet_62344.port)
    utils.debug("vars: %s" % str(v))
    if v["Voltron"]["States"]["DurationOPEN"] < 10e9:
        raise utils.TestError("not enough time in Open state", v["Voltron"]["States"]["DurationOPEN"])
    # then the Zookeeper connections
    if v["ZkMetaConn"]["test_nj"]["Current"] != "Connected":
        raise utils.TestError("invalid zk test_nj state: ", v["ZkMetaConn"]["test_nj"]["Current"])
    if v["ZkMetaConn"]["global"]["Current"] != "Connected":
        raise utils.TestError("invalid zk global state: ", v["ZkMetaConn"]["global"]["Current"])
    if v["ZkMetaConn"]["test_nj"]["DurationConnected"] < 10e9:
        raise utils.TestError("not enough time in Connected state", v["ZkMetaConn"]["test_nj"]["DurationConnected"])
    if v["tablet-type"] != "master":
        raise utils.TestError("tablet-type not exported correctly")

    tablet_62344.kill_vttablet()
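
The state checks at the end of this example read the tablet's exported
variables through utils.get_vars; the 10e9 thresholds are nanoseconds, i.e.
ten seconds of accumulated time in a state. A plausible sketch of the helper,
assuming vttablet serves its variables as JSON at /debug/vars on its HTTP
port (the URL layout here is an assumption for illustration):

import json
import urllib.request

def get_vars(port):
    # Fetch and decode the JSON variable map exported by the tablet
    # (hypothetical stand-in for utils.get_vars).
    url = 'http://localhost:%d/debug/vars' % port
    with urllib.request.urlopen(url, timeout=10) as f:
        return json.loads(f.read().decode('utf-8'))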
Example #4
def run_test_rebuild():
    utils.run_vtctl("CreateKeyspace -force test_keyspace")
    tablet_62344.init_tablet("master", "test_keyspace", "0")
    tablet_62044.init_tablet("replica", "test_keyspace", "0")
    tablet_31981.init_tablet("experimental", "test_keyspace", "0")  # in ny by default

    utils.run_vtctl("RebuildKeyspaceGraph -cells=test_nj test_keyspace", auto_log=True)
    utils.run_fail(utils.vtroot + "/bin/zk cat /zk/test_ny/vt/ns/test_keyspace/0/master")

    utils.run_vtctl("RebuildKeyspaceGraph -cells=test_ny test_keyspace", auto_log=True)

    real_master = utils.zk_cat("/zk/test_nj/vt/ns/test_keyspace/0/master")
    master_alias = utils.zk_cat("/zk/test_ny/vt/ns/test_keyspace/0/master")
    if real_master != master_alias:
        raise utils.TestError("master serving graph in all cells failed:\n%s!=\n%s" % (real_master, master_alias))
Example #5
def run_test_rebuild():
  utils.run_vtctl('CreateKeyspace -force test_keyspace')
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  tablet_62044.init_tablet('replica', 'test_keyspace', '0')
  tablet_31981.init_tablet('experimental', 'test_keyspace', '0') # in ny by default

  utils.run_vtctl('RebuildKeyspaceGraph -cells=test_nj test_keyspace',
                  auto_log=True)
  utils.run_fail(utils.vtroot+'/bin/zk cat /zk/test_ny/vt/ns/test_keyspace/0/master')

  utils.run_vtctl('RebuildKeyspaceGraph -cells=test_ny test_keyspace',
                  auto_log=True)

  real_master = utils.zk_cat('/zk/test_nj/vt/ns/test_keyspace/0/master')
  master_alias = utils.zk_cat('/zk/test_ny/vt/ns/test_keyspace/0/master')
  if real_master != master_alias:
    raise utils.TestError('master serving graph in all cells failed:\n%s!=\n%s'
                          % (real_master, master_alias))
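
Both versions of run_test_rebuild verify the per-cell serving graphs by
reading ZooKeeper nodes through utils.zk_cat. A hedged sketch of such a
helper, assuming it shells out to the same zk command-line client that the
run_fail check above invokes (the real utils.zk_cat may talk to ZooKeeper
directly):

import subprocess

def zk_cat(zk_path):
    # Return the contents of a ZooKeeper node via the zk CLI (assumed helper).
    return subprocess.check_output(['zk', 'cat', zk_path], text=True).strip()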
Example #6
def run_test_reparent_down_master():
  utils.zk_wipe()

  utils.run_vtctl('CreateKeyspace -force test_keyspace')

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True)
  tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True)
  tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True)

  # Recompute the shard layout node - until you do that, it might not be valid.
  utils.run_vtctl('RebuildShardGraph test_keyspace/0')
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are identical.
  utils.run_vtctl('ReparentShard -force test_keyspace/0 ' + tablet_62344.tablet_alias, auto_log=True)
  utils.validate_topology()

  # Make the master agent and database unavailable.
  tablet_62344.kill_vttablet()
  tablet_62344.shutdown_mysql().wait()

  expected_addr = utils.hostname + ':' + str(tablet_62344.port)
  _check_db_addr('test_keyspace.0.master:_vtocc', expected_addr)

  # Perform a reparent operation - the Validate part will try to ping
  # the master and fail somewhat quickly
  stdout, stderr = utils.run_fail(utils.vtroot+'/bin/vtctl -logfile=/dev/null -log.level=INFO -wait-time 5s ReparentShard test_keyspace/0 ' + tablet_62044.tablet_alias)
  utils.debug("Failed ReparentShard output:\n" + stderr)
  if 'ValidateShard verification failed: timed out during validate' not in stderr:
    raise utils.TestError("didn't find the right error strings in failed ReparentShard: " + stderr)

  # Should timeout and fail
  stdout, stderr = utils.run_fail(utils.vtroot+'/bin/vtctl -logfile=/dev/null -log.level=INFO -wait-time 5s ScrapTablet ' + tablet_62344.tablet_alias)
  utils.debug("Failed ScrapTablet output:\n" + stderr)
  if 'deadline exceeded' not in stderr:
    raise utils.TestError("didn't find the right error strings in failed ScrapTablet: " + stderr)

  # Should interrupt and fail
  sp = utils.run_bg(utils.vtroot+'/bin/vtctl -log.level=INFO -wait-time 10s ScrapTablet ' + tablet_62344.tablet_alias, stdout=PIPE, stderr=PIPE)
  # Need time for the process to start before killing it.
  time.sleep(0.1)
  os.kill(sp.pid, signal.SIGINT)
  stdout, stderr = sp.communicate()

  utils.debug("Failed ScrapTablet output:\n" + stderr)
  if 'interrupted' not in stderr:
    raise utils.TestError("didn't find the right error strings in failed ScrapTablet: " + stderr)

  # Force the scrap action in zk even though tablet is not accessible.
  tablet_62344.scrap(force=True)

  utils.run_fail(utils.vtroot+'/bin/vtctl -logfile=/dev/null -log.level=WARNING ChangeSlaveType -force %s idle' %
                 tablet_62344.tablet_alias)

  # Remove pending locks (make this the force option to ReparentShard?)
  utils.run_vtctl('PurgeActions /zk/global/vt/keyspaces/test_keyspace/shards/0/action')

  # Re-run the reparent operation; this should now proceed unimpeded.
  utils.run_vtctl('-wait-time 1m ReparentShard test_keyspace/0 ' + tablet_62044.tablet_alias, auto_log=True)

  utils.validate_topology()
  expected_addr = utils.hostname + ':' + str(tablet_62044.port)
  _check_db_addr('test_keyspace.0.master:_vtocc', expected_addr)

  utils.run_vtctl('ChangeSlaveType -force %s idle' % tablet_62344.tablet_alias)

  idle_tablets, _ = utils.run_vtctl('ListAllTablets test_nj', trap_output=True)
  if '0000062344 <null> <null> idle' not in idle_tablets:
    raise utils.TestError('idle tablet not found', idle_tablets)

  tablet_62044.kill_vttablet()
  tablet_41983.kill_vttablet()
  tablet_31981.kill_vttablet()

  # so the other tests don't have any surprises
  tablet_62344.start_mysql().wait()
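
The "should interrupt and fail" step above is a reusable pattern: start a
long-running child process in the background, give it a moment to come up,
then deliver SIGINT and collect its output. A self-contained sketch of that
pattern, with sleep standing in for the long vtctl call:

import os
import signal
import subprocess
import time

# Start a long-running child in the background (sleep stands in for vtctl).
proc = subprocess.Popen(['sleep', '10'],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
time.sleep(0.1)                    # give the process time to start
os.kill(proc.pid, signal.SIGINT)   # interrupt it, as the test does
stdout, stderr = proc.communicate()
# A process killed by SIGINT reports a non-zero (negative) return code.
assert proc.returncode != 0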
Example #7
    def test_resharding(self):
        utils.run_vtctl('CreateKeyspace test_keyspace')

        shard_0_master.init_tablet('master', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_1_master.init_tablet('master', 'test_keyspace', '80-')
        shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')

        utils.run_vtctl(
            'RebuildShardGraph /zk/global/vt/keyspaces/test_keyspace/shards/*',
            auto_log=True)

        utils.run_vtctl(
            'RebuildKeyspaceGraph /zk/global/vt/keyspaces/test_keyspace',
            auto_log=True)

        # create databases so vttablet can start behaving normally
        shard_0_master.create_db('vt_test_keyspace')
        shard_0_replica.create_db('vt_test_keyspace')
        shard_1_master.create_db('vt_test_keyspace')
        shard_1_slave1.create_db('vt_test_keyspace')
        shard_1_slave2.create_db('vt_test_keyspace')

        # start the tablets
        shard_0_master.start_vttablet()
        shard_0_replica.start_vttablet()
        shard_1_master.start_vttablet()
        shard_1_slave1.start_vttablet()
        shard_1_slave2.start_vttablet(wait_for_state='NOT_SERVING')  # spare

        # reparent to make the tablets work
        utils.run_vtctl('ReparentShard -force test_keyspace/-80 ' +
                        shard_0_master.tablet_alias,
                        auto_log=True)
        utils.run_vtctl('ReparentShard -force test_keyspace/80- ' +
                        shard_1_master.tablet_alias,
                        auto_log=True)

        # create the tables
        self._create_schema()
        self._insert_startup_values()

        # create the split shards
        shard_2_master.init_tablet('master', 'test_keyspace', '80-C0')
        shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0')
        shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0')
        shard_3_master.init_tablet('master', 'test_keyspace', 'C0-')
        shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-')

        # start vttablet on the split shards (no db created,
        # so they're all not serving)
        shard_2_master.start_vttablet(wait_for_state='CONNECTING')
        shard_2_replica1.start_vttablet(wait_for_state='NOT_SERVING')
        shard_2_replica2.start_vttablet(wait_for_state='NOT_SERVING')
        shard_3_master.start_vttablet(wait_for_state='CONNECTING')
        shard_3_replica.start_vttablet(wait_for_state='NOT_SERVING')

        utils.run_vtctl('ReparentShard -force test_keyspace/80-C0 ' +
                        shard_2_master.tablet_alias,
                        auto_log=True)
        utils.run_vtctl('ReparentShard -force test_keyspace/C0- ' +
                        shard_3_master.tablet_alias,
                        auto_log=True)

        utils.run_vtctl(
            'RebuildShardGraph /zk/global/vt/keyspaces/test_keyspace/shards/*',
            auto_log=True)

        utils.run_vtctl('RebuildKeyspaceGraph -use-served-types test_keyspace',
                        auto_log=True)
        self._check_srv_keyspace(
            'test_nj', 'test_keyspace',
            'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' +
            'Partitions(replica): -80 80-\n' + 'TabletTypes: master,replica')

        # take the snapshot for the split
        utils.run_vtctl('MultiSnapshot --spec=80-C0- %s keyspace_id' %
                        (shard_1_slave1.tablet_alias),
                        auto_log=True)

        # wait for tablet's binlog server service to be enabled after snapshot,
        # and check all the others while we're at it
        self._wait_for_binlog_server_state(shard_1_master, "Disabled")
        self._wait_for_binlog_server_state(shard_1_slave1, "Enabled")

        # perform the restore.
        utils.run_vtctl([
            'ShardMultiRestore', '-strategy=populateBlpCheckpoint',
            'test_keyspace/80-C0', shard_1_slave1.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'ShardMultiRestore', '-strategy=populateBlpCheckpoint',
            'test_keyspace/C0-', shard_1_slave1.tablet_alias
        ],
                        auto_log=True)

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl('ValidateSchemaKeyspace test_keyspace', auto_log=True)

        # check the binlog players are running
        self._wait_for_binlog_player_count(shard_2_master, 1)
        self._wait_for_binlog_player_count(shard_3_master, 1)

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug("Inserting lots of data on source shard")
        self._insert_lots(1000)
        logging.debug("Checking 80 percent of data is sent quickly")
        self._check_lots_timeout(1000, 80, 5)
        logging.debug("Checking all data goes through eventually")
        self._check_lots_timeout(1000, 100, 20)
        logging.debug("Checking no data was sent the wrong way")
        self._check_lots_not_present(1000)

        # start a thread to insert data into shard_1 in the background
        # with current time, and monitor the delay
        insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                       0x9000000000000000)
        insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                       0xD000000000000000)
        monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
        monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

        # tests a failover switching serving to a different replica
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('OPEN')
        shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

        # test data goes through again
        logging.debug("Inserting lots of data on source shard")
        self._insert_lots(1000, base=1000)
        logging.debug("Checking 80 percent of data was sent quickly")
        self._check_lots_timeout(1000, 80, 5, base=1000)

        # now serve rdonly from the split shards
        utils.run_fail(
            utils.vtroot +
            '/bin/vtctl MigrateServedTypes test_keyspace/80- master')
        utils.run_vtctl('MigrateServedTypes test_keyspace/80- rdonly',
                        auto_log=True)
        self._check_srv_keyspace(
            'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' +
            'Partitions(rdonly): -80 80-C0 C0-\n' +
            'Partitions(replica): -80 80-\n' + 'TabletTypes: master,replica')

        # then serve replica from the split shards
        utils.run_vtctl('MigrateServedTypes test_keyspace/80- replica',
                        auto_log=True)
        self._check_srv_keyspace(
            'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' +
            'Partitions(rdonly): -80 80-C0 C0-\n' +
            'Partitions(replica): -80 80-C0 C0-\n' +
            'TabletTypes: master,replica')

        # move replica back and forth
        utils.run_vtctl(
            'MigrateServedTypes -reverse test_keyspace/80- replica',
            auto_log=True)
        self._check_srv_keyspace(
            'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' +
            'Partitions(rdonly): -80 80-C0 C0-\n' +
            'Partitions(replica): -80 80-\n' + 'TabletTypes: master,replica')
        utils.run_vtctl('MigrateServedTypes test_keyspace/80- replica',
                        auto_log=True)
        self._check_srv_keyspace(
            'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' +
            'Partitions(rdonly): -80 80-C0 C0-\n' +
            'Partitions(replica): -80 80-C0 C0-\n' +
            'TabletTypes: master,replica')

        # reparent shard_2 to shard_2_replica1, then insert more data and
        # see it flow through still
        utils.run_vtctl('ReparentShard test_keyspace/80-C0 %s' %
                        shard_2_replica1.tablet_alias)
        logging.debug(
            "Inserting lots of data on source shard after reparenting")
        self._insert_lots(3000, base=2000)
        logging.debug("Checking 80 percent of data was sent fairly quickly")
        self._check_lots_timeout(3000, 80, 10, base=2000)

        # going to migrate the master now, check the delays
        monitor_thread_1.done = True
        monitor_thread_2.done = True
        insert_thread_1.done = True
        insert_thread_2.done = True
        logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                      monitor_thread_1.object_name, monitor_thread_1.max_lag,
                      monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
        logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                      monitor_thread_2.object_name, monitor_thread_2.max_lag,
                      monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

        # then serve master from the split shards
        utils.run_vtctl('MigrateServedTypes test_keyspace/80- master',
                        auto_log=True)
        self._check_srv_keyspace(
            'test_nj', 'test_keyspace', 'Partitions(master): -80 80-C0 C0-\n' +
            'Partitions(rdonly): -80 80-C0 C0-\n' +
            'Partitions(replica): -80 80-C0 C0-\n' +
            'TabletTypes: master,replica')

        # check the binlog players are gone now
        self._wait_for_binlog_player_count(shard_2_master, 0)
        self._wait_for_binlog_player_count(shard_3_master, 0)

        # kill everything
        for t in [
                shard_0_master, shard_0_replica, shard_1_master,
                shard_1_slave1, shard_1_slave2, shard_2_master,
                shard_2_replica1, shard_2_replica2, shard_3_master,
                shard_3_replica
        ]:
            t.kill_vttablet()
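
The filtered-replication checks in this example (_check_lots_timeout and
friends) poll until a given fraction of the inserted rows has shown up on the
destination shards, or a deadline passes. A generic sketch of that
poll-with-deadline pattern; check_fn and the thresholds are placeholders,
not the real test code:

import time

def wait_for_percent(check_fn, want_percent, timeout_s, interval_s=1.0):
    # Poll check_fn() until it reports at least want_percent, or time out.
    deadline = time.time() + timeout_s
    while True:
        percent = check_fn()
        if percent >= want_percent:
            return percent
        if time.time() >= deadline:
            raise AssertionError('only %s%% of the data arrived within %ss'
                                 % (percent, timeout_s))
        time.sleep(interval_s)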
Example #8
  def test_resharding(self):
    utils.run_vtctl('CreateKeyspace test_keyspace')

    shard_0_master.init_tablet( 'master',  'test_keyspace', '-80')
    shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
    shard_1_master.init_tablet( 'master',  'test_keyspace', '80-')
    shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
    shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')

    utils.run_vtctl('RebuildShardGraph /zk/global/vt/keyspaces/test_keyspace/shards/*', auto_log=True)

    utils.run_vtctl('RebuildKeyspaceGraph /zk/global/vt/keyspaces/test_keyspace', auto_log=True)

    # create databases so vttablet can start behaving normally
    shard_0_master.create_db('vt_test_keyspace')
    shard_0_replica.create_db('vt_test_keyspace')
    shard_1_master.create_db('vt_test_keyspace')
    shard_1_slave1.create_db('vt_test_keyspace')
    shard_1_slave2.create_db('vt_test_keyspace')

    # start the tablets
    shard_0_master.start_vttablet()
    shard_0_replica.start_vttablet()
    shard_1_master.start_vttablet()
    shard_1_slave1.start_vttablet()
    shard_1_slave2.start_vttablet(wait_for_state='NOT_SERVING') # spare

    # reparent to make the tablets work
    utils.run_vtctl('ReparentShard -force test_keyspace/-80 ' + shard_0_master.tablet_alias, auto_log=True)
    utils.run_vtctl('ReparentShard -force test_keyspace/80- ' + shard_1_master.tablet_alias, auto_log=True)

    # create the tables
    self._create_schema()
    self._insert_startup_values()

    # create the split shards
    shard_2_master.init_tablet( 'master',  'test_keyspace', '80-C0')
    shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0')
    shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0')
    shard_3_master.init_tablet( 'master',  'test_keyspace', 'C0-')
    shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-')

    # start vttablet on the split shards (no db created,
    # so they're all not serving)
    shard_2_master.start_vttablet(wait_for_state='CONNECTING')
    shard_2_replica1.start_vttablet(wait_for_state='NOT_SERVING')
    shard_2_replica2.start_vttablet(wait_for_state='NOT_SERVING')
    shard_3_master.start_vttablet(wait_for_state='CONNECTING')
    shard_3_replica.start_vttablet(wait_for_state='NOT_SERVING')

    utils.run_vtctl('ReparentShard -force test_keyspace/80-C0 ' + shard_2_master.tablet_alias, auto_log=True)
    utils.run_vtctl('ReparentShard -force test_keyspace/C0- ' + shard_3_master.tablet_alias, auto_log=True)

    utils.run_vtctl('RebuildShardGraph /zk/global/vt/keyspaces/test_keyspace/shards/*', auto_log=True)

    utils.run_vtctl('RebuildKeyspaceGraph -use-served-types test_keyspace', auto_log=True)
    self._check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n' +
                             'Partitions(rdonly): -80 80-\n' +
                             'Partitions(replica): -80 80-\n' +
                             'TabletTypes: master,replica')

    # take the snapshot for the split
    utils.run_vtctl('MultiSnapshot --spec=80-C0- %s keyspace_id' % (shard_1_slave1.tablet_alias), auto_log=True)

    # wait for tablet's binlog server service to be enabled after snapshot,
    # and check all the others while we're at it
    self._wait_for_binlog_server_state(shard_1_master, "Disabled")
    self._wait_for_binlog_server_state(shard_1_slave1, "Enabled")

    # perform the restore.
    utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/80-C0', shard_1_slave1.tablet_alias], auto_log=True)
    utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/C0-', shard_1_slave1.tablet_alias], auto_log=True)

    # check the startup values are in the right place
    self._check_startup_values()

    # check the schema too
    utils.run_vtctl('ValidateSchemaKeyspace test_keyspace', auto_log=True)

    # check the binlog players are running
    self._wait_for_binlog_player_count(shard_2_master, 1)
    self._wait_for_binlog_player_count(shard_3_master, 1)

    # testing filtered replication: insert a bunch of data on shard 1,
    # check we get most of it after a few seconds, wait for binlog server
    # timeout, check we get all of it.
    logging.debug("Inserting lots of data on source shard")
    self._insert_lots(1000)
    logging.debug("Checking 80 percent of data is sent quickly")
    self._check_lots_timeout(1000, 80, 5)
    logging.debug("Checking all data goes through eventually")
    self._check_lots_timeout(1000, 100, 20)
    logging.debug("Checking no data was sent the wrong way")
    self._check_lots_not_present(1000)

    # start a thread to insert data into shard_1 in the background
    # with current time, and monitor the delay
    insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000)
    insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000)
    monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
    monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

    # tests a failover switching serving to a different replica
    utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
    utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
    shard_1_slave2.wait_for_vttablet_state('OPEN')
    shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

    # test data goes through again
    logging.debug("Inserting lots of data on source shard")
    self._insert_lots(1000, base=1000)
    logging.debug("Checking 80 percent of data was sent quickly")
    self._check_lots_timeout(1000, 80, 5, base=1000)

    # now serve rdonly from the split shards
    utils.run_fail(utils.vtroot+'/bin/vtctl MigrateServedTypes test_keyspace/80- master')
    utils.run_vtctl('MigrateServedTypes test_keyspace/80- rdonly', auto_log=True)
    self._check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n' +
                             'Partitions(rdonly): -80 80-C0 C0-\n' +
                             'Partitions(replica): -80 80-\n' +
                             'TabletTypes: master,replica')

    # then serve replica from the split shards
    utils.run_vtctl('MigrateServedTypes test_keyspace/80- replica', auto_log=True)
    self._check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n' +
                             'Partitions(rdonly): -80 80-C0 C0-\n' +
                             'Partitions(replica): -80 80-C0 C0-\n' +
                             'TabletTypes: master,replica')

    # move replica back and forth
    utils.run_vtctl('MigrateServedTypes -reverse test_keyspace/80- replica', auto_log=True)
    self._check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n' +
                             'Partitions(rdonly): -80 80-C0 C0-\n' +
                             'Partitions(replica): -80 80-\n' +
                             'TabletTypes: master,replica')
    utils.run_vtctl('MigrateServedTypes test_keyspace/80- replica', auto_log=True)
    self._check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n' +
                             'Partitions(rdonly): -80 80-C0 C0-\n' +
                             'Partitions(replica): -80 80-C0 C0-\n' +
                             'TabletTypes: master,replica')

    # reparent shard_2 to shard_2_replica1, then insert more data and
    # see it flow through still
    utils.run_vtctl('ReparentShard test_keyspace/80-C0 %s' % shard_2_replica1.tablet_alias)
    logging.debug("Inserting lots of data on source shard after reparenting")
    self._insert_lots(3000, base=2000)
    logging.debug("Checking 80 percent of data was sent fairly quickly")
    self._check_lots_timeout(3000, 80, 10, base=2000)

    # going to migrate the master now, check the delays
    monitor_thread_1.done = True
    monitor_thread_2.done = True
    insert_thread_1.done = True
    insert_thread_2.done = True
    logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                  monitor_thread_1.object_name,
                  monitor_thread_1.max_lag,
                  monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
    logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                  monitor_thread_2.object_name,
                  monitor_thread_2.max_lag,
                  monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

    # then serve master from the split shards
    utils.run_vtctl('MigrateServedTypes test_keyspace/80- master', auto_log=True)
    self._check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-C0 C0-\n' +
                             'Partitions(rdonly): -80 80-C0 C0-\n' +
                             'Partitions(replica): -80 80-C0 C0-\n' +
                             'TabletTypes: master,replica')

    # check the binlog players are gone now
    self._wait_for_binlog_player_count(shard_2_master, 0)
    self._wait_for_binlog_player_count(shard_3_master, 0)

    # kill everything
    for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_slave1,
              shard_1_slave2, shard_2_master, shard_2_replica1,
              shard_2_replica2, shard_3_master, shard_3_replica]:
      t.kill_vttablet()
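
Both resharding examples gate their progress on _wait_for_binlog_player_count.
A hedged sketch of how such a wait could be implemented by polling the master
tablet's exported variables; the BinlogPlayerMapSize variable name and the
/debug/vars endpoint are assumptions for illustration, not the harness's
actual implementation:

import json
import time
import urllib.request

def wait_for_binlog_player_count(port, expected, timeout_s=30):
    # Poll the tablet's exported vars until the binlog player count matches
    # (hypothetical helper; the variable name is an assumption).
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        url = 'http://localhost:%d/debug/vars' % port
        with urllib.request.urlopen(url, timeout=5) as f:
            v = json.loads(f.read().decode('utf-8'))
        if v.get('BinlogPlayerMapSize', 0) == expected:
            return
        time.sleep(1)
    raise AssertionError('binlog player count never reached %d' % expected)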