Exemplo n.º 1
0
    def test_webinterface(self):
        worker_base_url = 'http://localhost:%d' % int(self.worker_port)
        # Wait for /status to become available.
        timeout = 10
        while True:
            done = False
            try:
                urllib2.urlopen(worker_base_url + '/status').read()
                done = True
            except urllib2.URLError:
                pass
            if done:
                break
            timeout = utils.wait_step(
                'worker /status webpage must be available', timeout)

        # Run the command twice to make sure it's idempotent.
        for _ in range(2):
            # Run Ping command.
            try:
                urllib2.urlopen(worker_base_url + '/Debugging/Ping',
                                data=urllib.urlencode({'message':
                                                       'pong'})).read()
                raise Exception(
                    'Should have thrown an HTTPError for the redirect.')
            except urllib2.HTTPError as e:
                self.assertEqual(e.code, 307)
            # Wait for the Ping command to finish.
            utils.poll_for_vars(
                'vtworker',
                self.worker_port,
                'WorkerState == done',
                condition_fn=lambda v: v.get('WorkerState') == 'done')
            # Verify that the command logged something and its available at /status.
            status = urllib2.urlopen(worker_base_url + '/status').read()
            self.assertIn("Ping command was called with message: 'pong'",
                          status,
                          'Command did not log output to /status: %s' % status)

            # Reset the job.
            urllib2.urlopen(worker_base_url + '/reset').read()
            status_after_reset = urllib2.urlopen(worker_base_url +
                                                 '/status').read()
            self.assertIn(
                'This worker is idle.', status_after_reset,
                '/status does not indicate that the reset was successful')
Exemplo n.º 2
0
  def test_webinterface(self):
    worker_base_url = 'http://localhost:%d' % int(self.worker_port)
    # Wait for /status to become available.
    timeout = 10
    while True:
      done = False
      try:
        urllib2.urlopen(worker_base_url + '/status').read()
        done = True
      except urllib2.URLError:
        pass
      if done:
        break
      timeout = utils.wait_step(
          'worker /status webpage must be available', timeout)

    # Run the command twice to make sure it's idempotent.
    for _ in range(2):
      # Run Ping command.
      try:
        urllib2.urlopen(
            worker_base_url + '/Debugging/Ping',
            data=urllib.urlencode({'message': 'pong'})).read()
        raise Exception('Should have thrown an HTTPError for the redirect.')
      except urllib2.HTTPError as e:
        self.assertEqual(e.code, 307)
      # Wait for the Ping command to finish.
      utils.poll_for_vars(
          'vtworker', self.worker_port,
          'WorkerState == done',
          condition_fn=lambda v: v.get('WorkerState') == 'done')
      # Verify that the command logged something and it's available at /status.
      status = urllib2.urlopen(worker_base_url + '/status').read()
      self.assertIn(
          "Ping command was called with message: 'pong'", status,
          'Command did not log output to /status: %s' % status)

      # Reset the job.
      urllib2.urlopen(worker_base_url + '/reset').read()
      status_after_reset = urllib2.urlopen(worker_base_url + '/status').read()
      self.assertIn(
          'This worker is idle.', status_after_reset,
          '/status does not indicate that the reset was successful')
Exemplo n.º 3
0
    def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
        """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
        if mysql_down:
            logging.debug('Shutting down mysqld on destination masters.')
            utils.wait_procs([
                shard_0_master.shutdown_mysql(),
                shard_1_master.shutdown_mysql()
            ])

        worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj'], auto_log=True)

        # --max_tps is only specified to enable the throttler and ensure that the
        # code is executed. But the intent here is not to throttle the test, hence
        # the rate limit is set very high.
        # --chunk_count is 2 because rows are currently ordered by primary key such
        # that all rows of the first shard come first and then the second shard.
        # TODO(mberlin): Remove --offline=false once vtworker ensures that the
        #                destination shards are not behind the master's replication
        #                position.
        args = [
            'SplitClone', '--offline=false', '--destination_writer_count', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999'
        ]
        if not mysql_down:
            # Make the clone as slow as necessary such that there is enough time to
            # run PlannedReparent in the meantime.
            # TOOD(mberlin): Once insert_values is fixed to uniformly distribute the
            #                rows across shards when sorted by primary key, remove
            #                --chunk_count 2, --min_rows_per_chunk 1 and set
            #                --source_reader_count back to 1.
            args.extend([
                '--source_reader_count', '2', '--chunk_count', '2',
                '--min_rows_per_chunk', '1', '--write_query_max_rows', '1'
            ])
        args.append('test_keyspace/0')
        workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port)

        if mysql_down:
            # If MySQL is down, we wait until vtworker retried at least once to make
            # sure it reached the point where a write failed due to MySQL being down.
            # There should be two retries at least, one for each destination shard.
            utils.poll_for_vars(
                'vtworker',
                worker_port,
                'WorkerRetryCount >= 2',
                condition_fn=lambda v: v.get('WorkerRetryCount') >= 2)
            logging.debug(
                'Worker has retried at least twice, starting reparent now')

            # vtworker is blocked at this point. This is a good time to test that its
            # throttler server is reacting to RPCs.
            self.check_throttler_service(
                'localhost:%d' % worker_rpc_port,
                ['test_keyspace/-80', 'test_keyspace/80-'], 9999)

            # Bring back masters. Since we test with semi-sync now, we need at least
            # one replica for the new master. This test is already quite expensive,
            # so we bring back the old master as a replica rather than having a third
            # replica up the whole time.
            logging.debug('Restarting mysqld on destination masters')
            utils.wait_procs(
                [shard_0_master.start_mysql(),
                 shard_1_master.start_mysql()])

            # Reparent away from the old masters.
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        else:
            # NOTE: There is a race condition around this:
            #   It's possible that the SplitClone vtworker command finishes before the
            #   PlannedReparentShard vtctl command, which we start below, succeeds.
            #   Then the test would fail because vtworker did not have to retry.
            #
            # To workaround this, the test takes a parameter to increase the number of
            # rows that the worker has to copy (with the idea being to slow the worker
            # down).
            # You should choose a value for num_insert_rows, such that this test
            # passes for your environment (trial-and-error...)
            # Make sure that vtworker got past the point where it picked a master
            # for each destination shard ("finding targets" state).
            utils.poll_for_vars(
                'vtworker',
                worker_port,
                'WorkerState == cloning the data (online)',
                condition_fn=lambda v: v.get('WorkerState') == 'cloning the'
                ' data (online)')
            logging.debug('Worker is in copy state, starting reparent now')

            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        utils.wait_procs([workerclient_proc])

        # Verify that we were forced to re-resolve and retry.
        worker_vars = utils.get_vars(worker_port)
        self.assertGreater(
            worker_vars['WorkerRetryCount'], 1,
            "expected vtworker to retry each of the two reparented"
            " destination masters at least once, but it didn't")
        self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                            "expected vtworker to retry, but it didn't")
        utils.kill_sub_process(worker_proc, soft=True)

        # Wait for the destination RDONLYs to catch up or the following offline
        # clone will try to insert rows which already exist.
        # TODO(mberlin): Remove this once SplitClone supports it natively.
        utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1)
        utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1)
        # Run final offline clone to enable filtered replication.
        _, _ = utils.run_vtworker([
            '-cell', 'test_nj', 'SplitClone', '--online=false',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'
        ],
                                  auto_log=True)

        # Make sure that everything is caught up to the same replication point
        self.run_split_diff('test_keyspace/-80', all_shard_tablets,
                            shard_0_tablets)
        self.run_split_diff('test_keyspace/80-', all_shard_tablets,
                            shard_1_tablets)

        self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
        self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Exemplo n.º 4
0
  def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
    if mysql_down:
      logging.debug('Shutting down mysqld on destination masters.')
      utils.wait_procs(
          [shard_0_master.shutdown_mysql(),
           shard_1_master.shutdown_mysql()])

    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'],
        auto_log=True)

    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--source_reader_count', '1',
         '--destination_pack_count', '1',
         '--destination_writer_count', '1',
         'test_keyspace/0'],
        worker_rpc_port)

    if mysql_down:
      # If MySQL is down, we wait until resolving at least twice (to verify that
      # we do reresolve and retry due to MySQL being down).
      worker_vars = utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerDestinationActualResolves >= 2',
          condition_fn=lambda v: v.get('WorkerDestinationActualResolves') >= 2)
      self.assertNotEqual(
          worker_vars['WorkerRetryCount'], {},
          "expected vtworker to retry, but it didn't")
      logging.debug('Worker has resolved at least twice, starting reparent now')

      # Bring back masters. Since we test with semi-sync now, we need at least
      # one replica for the new master. This test is already quite expensive,
      # so we bring back the old master as a replica rather than having a third
      # replica up the whole time.
      logging.debug('Restarting mysqld on destination masters')
      utils.wait_procs(
          [shard_0_master.start_mysql(),
           shard_1_master.start_mysql()])

      # Reparent away from the old masters.
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/-80',
           shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/80-',
           shard_1_replica.tablet_alias], auto_log=True)

    else:
      # NOTE: There is a race condition around this:
      #   It's possible that the SplitClone vtworker command finishes before the
      #   PlannedReparentShard vtctl command, which we start below, succeeds.
      #   Then the test would fail because vtworker did not have to resolve the
      #   master tablet again (due to the missing reparent).
      #
      # To workaround this, the test takes a parameter to increase the number of
      # rows that the worker has to copy (with the idea being to slow the worker
      # down).
      # You should choose a value for num_insert_rows, such that this test
      # passes for your environment (trial-and-error...)
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerDestinationActualResolves >= 1',
          condition_fn=lambda v: v.get('WorkerDestinationActualResolves') >= 1)
      logging.debug('Worker has resolved at least once, starting reparent now')

      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/-80',
           shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/80-',
           shard_1_replica.tablet_alias], auto_log=True)

    utils.wait_procs([workerclient_proc])

    # Verify that we were forced to reresolve and retry.
    worker_vars = utils.get_vars(worker_port)
    self.assertGreater(worker_vars['WorkerDestinationActualResolves'], 1)
    self.assertGreater(worker_vars['WorkerDestinationAttemptedResolves'], 1)
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")
    utils.kill_sub_process(worker_proc, soft=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets)

    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Exemplo n.º 5
0
  def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down - boolean, True iff we expect the MySQL instances on the
        destination masters to be down.

    Raises:
      AssertionError if things didn't go as expected.
    """
    worker_proc, worker_port, _ = utils.run_vtworker_bg(['--cell', 'test_nj',
                        'SplitClone',
                        '--source_reader_count', '1',
                        '--destination_pack_count', '1',
                        '--destination_writer_count', '1',
                        '--strategy=-populate_blp_checkpoint',
                        'test_keyspace/0'],
                       auto_log=True)

    if mysql_down:
      # If MySQL is down, we wait until resolving at least twice (to verify that
      # we do reresolve and retry due to MySQL being down).
      worker_vars = utils.poll_for_vars('vtworker', worker_port,
        'WorkerDestinationActualResolves >= 2',
        condition_fn=lambda v: v.get('WorkerDestinationActualResolves') >= 2)
      self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
        "expected vtworker to retry, but it didn't")
      logging.debug("Worker has resolved at least twice, starting reparent now")

      # Original masters have no running MySQL, so need to force the reparent
      utils.run_vtctl(['EmergencyReparentShard', 'test_keyspace/-80',
        shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(['EmergencyReparentShard', 'test_keyspace/80-',
        shard_1_replica.tablet_alias], auto_log=True)

    else:
      utils.poll_for_vars('vtworker', worker_port,
        'WorkerDestinationActualResolves >= 1',
        condition_fn=lambda v: v.get('WorkerDestinationActualResolves') >= 1)
      logging.debug("Worker has resolved at least once, starting reparent now")

      utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/-80',
        shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-',
        shard_1_replica.tablet_alias], auto_log=True)

    logging.debug("Polling for worker state")
    # There are a couple of race conditions around this, that we need to be careful of:
    # 1. It's possible for the reparent step to take so long that the worker will
    #   actually finish before we get to the polling step. To workaround this,
    #   the test takes a parameter to increase the number of rows that the worker
    #   has to copy (with the idea being to slow the worker down).
    # 2. If the worker has a huge number of rows to copy, it's possible for the
    #   polling to timeout before the worker has finished copying the data.
    #
    # You should choose a value for num_insert_rows, such that this test passes
    # for your environment (trial-and-error...)
    worker_vars = utils.poll_for_vars('vtworker', worker_port,
      'WorkerState == cleaning up',
      condition_fn=lambda v: v.get('WorkerState') == 'cleaning up',
      # We know that vars should already be ready, since we read them earlier
      require_vars=True,
      # We're willing to let the test run for longer to make it less flaky.
      # This should still fail fast if something goes wrong with vtworker,
      # because of the require_vars flag above.
      timeout=5*60)

    # Verify that we were forced to reresolve and retry.
    self.assertGreater(worker_vars['WorkerDestinationActualResolves'], 1)
    self.assertGreater(worker_vars['WorkerDestinationAttemptedResolves'], 1)
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
      "expected vtworker to retry, but it didn't")

    utils.wait_procs([worker_proc])

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', shard_tablets, shard_1_tablets)

    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Exemplo n.º 6
0
  def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
    if mysql_down:
      logging.debug('Shutting down mysqld on destination masters.')
      utils.wait_procs(
          [shard_0_master.shutdown_mysql(),
           shard_1_master.shutdown_mysql()])

    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj', '--use_v3_resharding_mode=false'],
        auto_log=True)

    # --max_tps is only specified to enable the throttler and ensure that the
    # code is executed. But the intent here is not to throttle the test, hence
    # the rate limit is set very high.
    # --chunk_count is 2 because rows are currently ordered by primary key such
    # that all rows of the first shard come first and then the second shard.
    # TODO(mberlin): Remove --offline=false once vtworker ensures that the
    #                destination shards are not behind the master's replication
    #                position.
    args = ['SplitClone',
            '--offline=false',
            '--destination_writer_count', '1',
            '--min_healthy_rdonly_tablets', '1',
            '--max_tps', '9999']
    if not mysql_down:
      # Make the clone as slow as necessary such that there is enough time to
      # run PlannedReparent in the meantime.
      # TODO(mberlin): Once insert_values is fixed to uniformly distribute the
      #                rows across shards when sorted by primary key, remove
      #                --chunk_count 2, --min_rows_per_chunk 1 and set
      #                --source_reader_count back to 1.
      args.extend(['--source_reader_count', '2',
                   '--chunk_count', '2',
                   '--min_rows_per_chunk', '1',
                   '--write_query_max_rows', '1'])
    args.append('test_keyspace/0')
    workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port)

    if mysql_down:
      # If MySQL is down, we wait until vtworker retried at least once to make
      # sure it reached the point where a write failed due to MySQL being down.
      # There should be two retries at least, one for each destination shard.
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerRetryCount >= 2',
          condition_fn=lambda v: v.get('WorkerRetryCount') >= 2)
      logging.debug('Worker has retried at least twice, starting reparent now')

      # vtworker is blocked at this point. This is a good time to test that its
      # throttler server is reacting to RPCs.
      self.check_throttler_service('localhost:%d' % worker_rpc_port,
                                   ['test_keyspace/-80', 'test_keyspace/80-'],
                                   9999)

      # Bring back masters. Since we test with semi-sync now, we need at least
      # one replica for the new master. This test is already quite expensive,
      # so we bring back the old master as a replica rather than having a third
      # replica up the whole time.
      logging.debug('Restarting mysqld on destination masters')
      utils.wait_procs(
          [shard_0_master.start_mysql(),
           shard_1_master.start_mysql()])

      # Reparent away from the old masters.
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80',
           '-new_master', shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-',
           '-new_master', shard_1_replica.tablet_alias], auto_log=True)

    else:
      # NOTE: There is a race condition around this:
      #   It's possible that the SplitClone vtworker command finishes before the
      #   PlannedReparentShard vtctl command, which we start below, succeeds.
      #   Then the test would fail because vtworker did not have to retry.
      #
      # To workaround this, the test takes a parameter to increase the number of
      # rows that the worker has to copy (with the idea being to slow the worker
      # down).
      # You should choose a value for num_insert_rows, such that this test
      # passes for your environment (trial-and-error...)
      # Make sure that vtworker got past the point where it picked a master
      # for each destination shard ("finding targets" state).
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerState == cloning the data (online)',
          condition_fn=lambda v: v.get('WorkerState') == 'cloning the'
          ' data (online)')
      logging.debug('Worker is in copy state, starting reparent now')

      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80',
           '-new_master', shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-',
           '-new_master', shard_1_replica.tablet_alias], auto_log=True)

    utils.wait_procs([workerclient_proc])

    # Verify that we were forced to re-resolve and retry.
    worker_vars = utils.get_vars(worker_port)
    self.assertGreater(worker_vars['WorkerRetryCount'], 1,
                       "expected vtworker to retry each of the two reparented"
                       " destination masters at least once, but it didn't")
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")
    utils.kill_sub_process(worker_proc, soft=True)

    # Wait for the destination RDONLYs to catch up or the following offline
    # clone will try to insert rows which already exist.
    # TODO(mberlin): Remove this once SplitClone supports it natively.
    utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1)
    utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1)
    # Run final offline clone to enable filtered replication.
    _, _ = utils.run_vtworker(['-cell', 'test_nj',
                               '--use_v3_resharding_mode=false',
                               'SplitClone',
                               '--online=false',
                               '--min_healthy_rdonly_tablets', '1',
                               'test_keyspace/0'], auto_log=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets)

    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Exemplo n.º 7
0
    def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
        """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean, True iff we expect the MySQL instances on the
        destination masters to be down.

    Raises:
      AssertionError if things didn't go as expected.
    """
        worker_proc, worker_port, _ = utils.run_vtworker_bg([
            '--cell', 'test_nj', 'SplitClone', '--source_reader_count', '1',
            '--destination_pack_count', '1', '--destination_writer_count', '1',
            '--strategy=-populate_blp_checkpoint', 'test_keyspace/0'
        ],
                                                            auto_log=True)

        if mysql_down:
            # If MySQL is down, we wait until resolving at least twice (to verify that
            # we do reresolve and retry due to MySQL being down).
            worker_vars = utils.poll_for_vars(
                'vtworker',
                worker_port,
                'WorkerDestinationActualResolves >= 2',
                condition_fn=lambda v: v.get('WorkerDestinationActualResolves'
                                             ) >= 2)
            self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                                "expected vtworker to retry, but it didn't")
            logging.debug(
                'Worker has resolved at least twice, starting reparent now')

            # Original masters have no running MySQL, so need to force the reparent
            utils.run_vtctl([
                'EmergencyReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'EmergencyReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        else:
            utils.poll_for_vars('vtworker',
                                worker_port,
                                'WorkerDestinationActualResolves >= 1',
                                condition_fn=lambda v: v.get(
                                    'WorkerDestinationActualResolves') >= 1)
            logging.debug(
                'Worker has resolved at least once, starting reparent now')

            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        logging.debug('Polling for worker state')
        # There are a couple of race conditions around this, that we need
        # to be careful of:
        #
        # 1. It's possible for the reparent step to take so long that the
        #   worker will actually finish before we get to the polling
        #   step. To workaround this, the test takes a parameter to
        #   increase the number of rows that the worker has to copy (with
        #   the idea being to slow the worker down).
        #
        # 2. If the worker has a huge number of rows to copy, it's
        #   possible for the polling to timeout before the worker has
        #   finished copying the data.
        #
        # You should choose a value for num_insert_rows, such that this test passes
        # for your environment (trial-and-error...)
        worker_vars = utils.poll_for_vars(
            'vtworker',
            worker_port,
            'WorkerState == cleaning up',
            condition_fn=lambda v: v.get('WorkerState') == 'cleaning up',
            # We know that vars should already be ready, since we read them earlier
            require_vars=True,
            # We're willing to let the test run for longer to make it less flaky.
            # This should still fail fast if something goes wrong with vtworker,
            # because of the require_vars flag above.
            timeout=5 * 60)

        # Verify that we were forced to reresolve and retry.
        self.assertGreater(worker_vars['WorkerDestinationActualResolves'], 1)
        self.assertGreater(worker_vars['WorkerDestinationAttemptedResolves'],
                           1)
        self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                            "expected vtworker to retry, but it didn't")

        utils.wait_procs([worker_proc])

        # Make sure that everything is caught up to the same replication point
        self.run_split_diff('test_keyspace/-80', all_shard_tablets,
                            shard_0_tablets)
        self.run_split_diff('test_keyspace/80-', all_shard_tablets,
                            shard_1_tablets)

        self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
        self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Exemplo n.º 8
0
    def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
        """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
        if mysql_down:
            logging.debug('Shutting down mysqld on destination masters.')
            utils.wait_procs([
                shard_0_master.shutdown_mysql(),
                shard_1_master.shutdown_mysql()
            ])

        worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj'], auto_log=True)

        # --max_tps is only specified to enable the throttler and ensure that the
        # code is executed. But the intent here is not to throttle the test, hence
        # the rate limit is set very high.
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--source_reader_count', '1',
            '--destination_pack_count', '1', '--destination_writer_count', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999',
            'test_keyspace/0'
        ], worker_rpc_port)

        if mysql_down:
            # If MySQL is down, we wait until vtworker retried at least once to make
            # sure it reached the point where a write failed due to MySQL being down.
            # There should be two retries at least, one for each destination shard.
            utils.poll_for_vars(
                'vtworker',
                worker_port,
                'WorkerRetryCount >= 2',
                condition_fn=lambda v: v.get('WorkerRetryCount') >= 2)
            logging.debug(
                'Worker has retried at least twice, starting reparent now')

            # vtworker is blocked at this point. This is a good time to test that its
            # throttler server is reacting to RPCs.
            self.check_binlog_throttler(
                'localhost:%d' % worker_rpc_port,
                ['test_keyspace/-80', 'test_keyspace/80-'], 9999)

            # Bring back masters. Since we test with semi-sync now, we need at least
            # one replica for the new master. This test is already quite expensive,
            # so we bring back the old master as a replica rather than having a third
            # replica up the whole time.
            logging.debug('Restarting mysqld on destination masters')
            utils.wait_procs(
                [shard_0_master.start_mysql(),
                 shard_1_master.start_mysql()])

            # Reparent away from the old masters.
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        else:
            # NOTE: There is a race condition around this:
            #   It's possible that the SplitClone vtworker command finishes before the
            #   PlannedReparentShard vtctl command, which we start below, succeeds.
            #   Then the test would fail because vtworker did not have to retry.
            #
            # To workaround this, the test takes a parameter to increase the number of
            # rows that the worker has to copy (with the idea being to slow the worker
            # down).
            # You should choose a value for num_insert_rows, such that this test
            # passes for your environment (trial-and-error...)
            # Make sure that vtworker got past the point where it picked a master
            # for each destination shard ("finding targets" state).
            utils.poll_for_vars('vtworker',
                                worker_port,
                                'WorkerState == copying the data',
                                condition_fn=lambda v: v.get('WorkerState') ==
                                'copying the data')
            logging.debug('Worker is in copy state, starting reparent now')

            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        utils.wait_procs([workerclient_proc])

        # Verify that we were forced to re-resolve and retry.
        worker_vars = utils.get_vars(worker_port)
        # There should be two retries at least, one for each destination shard.
        self.assertGreater(worker_vars['WorkerRetryCount'], 1)
        self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                            "expected vtworker to retry, but it didn't")
        utils.kill_sub_process(worker_proc, soft=True)

        # Make sure that everything is caught up to the same replication point
        self.run_split_diff('test_keyspace/-80', all_shard_tablets,
                            shard_0_tablets)
        self.run_split_diff('test_keyspace/80-', all_shard_tablets,
                            shard_1_tablets)

        self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
        self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Exemplo n.º 9
0
  def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
    if mysql_down:
      logging.debug('Shutting down mysqld on destination masters.')
      utils.wait_procs(
          [shard_0_master.shutdown_mysql(),
           shard_1_master.shutdown_mysql()])

    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'],
        auto_log=True)

    # --max_tps is only specified to enable the throttler and ensure that the
    # code is executed. But the intent here is not to throttle the test, hence
    # the rate limit is set very high.
    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--source_reader_count', '1',
         '--destination_pack_count', '1',
         '--destination_writer_count', '1',
         '--min_healthy_rdonly_tablets', '1',
         '--max_tps', '9999',
         'test_keyspace/0'],
        worker_rpc_port)

    if mysql_down:
      # If MySQL is down, we wait until vtworker retried at least once to make
      # sure it reached the point where a write failed due to MySQL being down.
      # There should be two retries at least, one for each destination shard.
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerRetryCount >= 2',
          condition_fn=lambda v: v.get('WorkerRetryCount') >= 2)
      logging.debug('Worker has retried at least twice, starting reparent now')

      # vtworker is blocked at this point. This is a good time to test that its
      # throttler server is reacting to RPCs.
      self.check_binlog_throttler('localhost:%d' % worker_rpc_port,
                                  ['test_keyspace/-80', 'test_keyspace/80-'],
                                  9999)

      # Bring back masters. Since we test with semi-sync now, we need at least
      # one replica for the new master. This test is already quite expensive,
      # so we bring back the old master as a replica rather than having a third
      # replica up the whole time.
      logging.debug('Restarting mysqld on destination masters')
      utils.wait_procs(
          [shard_0_master.start_mysql(),
           shard_1_master.start_mysql()])

      # Reparent away from the old masters.
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/-80',
           shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/80-',
           shard_1_replica.tablet_alias], auto_log=True)

    else:
      # NOTE: There is a race condition around this:
      #   It's possible that the SplitClone vtworker command finishes before the
      #   PlannedReparentShard vtctl command, which we start below, succeeds.
      #   Then the test would fail because vtworker did not have to retry.
      #
      # To workaround this, the test takes a parameter to increase the number of
      # rows that the worker has to copy (with the idea being to slow the worker
      # down).
      # You should choose a value for num_insert_rows, such that this test
      # passes for your environment (trial-and-error...)
      # Make sure that vtworker got past the point where it picked a master
      # for each destination shard ("finding targets" state).
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerState == copying the data',
          condition_fn=lambda v: v.get('WorkerState') == 'copying the data')
      logging.debug('Worker is in copy state, starting reparent now')

      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/-80',
           shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/80-',
           shard_1_replica.tablet_alias], auto_log=True)

    utils.wait_procs([workerclient_proc])

    # Verify that we were forced to re-resolve and retry.
    worker_vars = utils.get_vars(worker_port)
    # There should be two retries at least, one for each destination shard.
    self.assertGreater(worker_vars['WorkerRetryCount'], 1)
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")
    utils.kill_sub_process(worker_proc, soft=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets)

    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)