def internal_reparent(self, keyspace, new_cell, shard, num_shards,
                      new_task_num, emergency=False):
  """Reparents a shard to a specific tablet using vtctl.

  Builds the target tablet alias from the cell's position in self.cells,
  the shard number, and the task number, then issues the appropriate
  reparent command followed by a keyspace graph rebuild.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    new_cell: Cell that should host the new master (string)
    shard: Numeric ID of the shard to reparent (zero based int)
    num_shards: Total number of shards (int)
    new_task_num: Task number of the tablet to promote (int)
    emergency: Use EmergencyReparentShard instead of Planned (bool)

  Returns:
    A (return_code, output) pair; always (0, 'No output') on success.
  """
  shard_name = utils.get_shard_name(shard, num_shards)
  # Tablet aliases encode the cell ordinal (1-based), shard and task number.
  cell_ordinal = self.cells.index(new_cell) + 1
  target_alias = '%s-%02d00000%d%02d' % (
      new_cell, cell_ordinal, shard + 1, new_task_num)
  if emergency:
    command = 'EmergencyReparentShard'
  else:
    command = 'PlannedReparentShard'
  self.vtctl_helper.execute_vtctl_command(
      [command, '%s/%s' % (keyspace, shard_name), target_alias])
  # Refresh the serving graph so clients see the new master.
  self.vtctl_helper.execute_vtctl_command(['RebuildKeyspaceGraph', keyspace])
  return 0, 'No output'
def implicit_reparent(self, keyspace, shard, num_shards):
  """Performs an implicit reparent.

  This function will call borg restart on the current master task and
  verify that decider selected a new task to be the master.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    shard: Numeric ID of the shard to reparent (zero based int)
    num_shards: Total number of shards (int)
  """
  shard_name = utils.get_shard_name(shard, num_shards)

  original_master_name = (self.env.get_current_master_name(
      keyspace, shard_name))
  original_master_cell = self.env.get_tablet_cell(original_master_name)
  master_task_num = self.env.get_tablet_task_number(original_master_name)

  logging.info('Restarting %s/%s, current master: %s, task: %d',
               keyspace, shard_name, original_master_name, master_task_num)
  ret_val = self.env.restart_mysql_task(
      original_master_cell, keyspace, shard, master_task_num, 'replica',
      'mysql-alloc', True)
  # assertEqual: the assertEquals alias is deprecated in unittest.
  self.assertEqual(ret_val, 0,
                   msg='restartalloc failed (returned %d)' % ret_val)

  # Seed these before polling so the assertion below cannot hit a
  # NameError when reparent_timeout_threshold is zero and the loop
  # body never runs.
  new_master_name = original_master_name
  new_master_task_num = master_task_num
  start_time = time.time()
  while time.time() - start_time < self.reparent_timeout_threshold:
    new_master_name = self.env.get_current_master_name(keyspace, shard_name)
    new_master_task_num = self.env.get_tablet_task_number(new_master_name)
    if new_master_name != original_master_name:
      break
    time.sleep(1)
  self.assertNotEqual(
      new_master_name, original_master_name,
      msg='Expected master tablet to change, but it remained as %s' % (
          new_master_name))
  logging.info('restartalloc on %s/%s resulted in new master: %s, task: %d',
               keyspace, shard_name, new_master_name, new_master_task_num)
def internal_reparent(self, keyspace, new_cell, shard, num_shards,
                      new_task_num, emergency=False):
  """Reparents a shard through vtctl and rebuilds the keyspace graph.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    new_cell: Cell that should host the new master (string)
    shard: Numeric ID of the shard to reparent (zero based int)
    num_shards: Total number of shards (int)
    new_task_num: Task number of the tablet to promote (int)
    emergency: If True, perform an emergency reparent (bool)

  Returns:
    A (return_code, output) tuple; always (0, 'No output').
  """
  shard_name = utils.get_shard_name(shard, num_shards)
  cell_index = self.cells.index(new_cell) + 1
  # Alias format: <cell>-<cell#>00000<shard#><task#>, zero-padded.
  new_master = '%s-%02d00000%d%02d' % (new_cell, cell_index, shard + 1,
                                       new_task_num)
  command = 'EmergencyReparentShard' if emergency else 'PlannedReparentShard'
  reparent_args = [command, '%s/%s' % (keyspace, shard_name), new_master]
  self.vtctl_helper.execute_vtctl_command(reparent_args)
  self.vtctl_helper.execute_vtctl_command(['RebuildKeyspaceGraph', keyspace])
  return 0, 'No output'
def implicit_reparent(self, keyspace, shard, num_shards):
  """Performs an implicit reparent.

  This function will call borg restart on the current master task and
  verify that decider selected a new task to be the master.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    shard: Numeric ID of the shard to reparent (zero based int)
    num_shards: Total number of shards (int)
  """
  shard_name = utils.get_shard_name(shard, num_shards)

  original_master_name = (
      self.env.get_current_master_name(keyspace, shard_name))
  original_master_cell = self.env.get_tablet_cell(original_master_name)
  master_task_num = self.env.get_tablet_task_number(original_master_name)

  logging.info('Restarting %s/%s, current master: %s, task: %d',
               keyspace, shard_name, original_master_name, master_task_num)
  ret_val = self.env.restart_mysql_task(
      original_master_cell, keyspace, shard, master_task_num, 'replica',
      'mysql-alloc', True)
  # assertEqual: the assertEquals alias is deprecated in unittest.
  self.assertEqual(ret_val, 0,
                   msg='restartalloc failed (returned %d)' % ret_val)

  # Seed these before polling so the assertion below cannot raise a
  # NameError when reparent_timeout_threshold is zero and the polling
  # loop never executes.
  new_master_name = original_master_name
  new_master_task_num = master_task_num
  start_time = time.time()
  while time.time() - start_time < self.reparent_timeout_threshold:
    new_master_name = self.env.get_current_master_name(keyspace, shard_name)
    new_master_task_num = self.env.get_tablet_task_number(new_master_name)
    if new_master_name != original_master_name:
      break
    time.sleep(1)
  self.assertNotEqual(
      new_master_name, original_master_name,
      msg='Expected master tablet to change, but it remained as %s' % (
          new_master_name))
  logging.info('restartalloc on %s/%s resulted in new master: %s, task: %d',
               keyspace, shard_name, new_master_name, new_master_task_num)
def explicit_reparent(self, keyspace, num_shards, external=False,
                      cross_cell=False):
  """Performs an explicit reparent.

  This function will call decider to explicity select a new master and verify
  that the topology is updated.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    num_shards: Total number of shards (int)
    external: Whether the reparent should be external or through vtctl (bool)
    cross_cell: Whether to reparent to a different cell (bool)

  Returns:
    How long we waited for the serving graph to be updated.
      The time begins just before calling Decider.
      This is a list of floats, one for each shard.
      For cross-cell reparents, it returns [].
  """
  original_masters = []
  next_masters = []
  shard_names = []
  durations = []
  for shard in xrange(num_shards):
    shard_name = utils.get_shard_name(shard, num_shards)
    shard_names.append(shard_name)
    # Record the current master and the candidate the environment picks.
    original_master_name = self.env.get_current_master_name(
        keyspace, shard_name)
    original_master = {
        'cell': self.env.get_tablet_cell(original_master_name),
        'task': self.env.get_tablet_task_number(original_master_name),
    }
    original_masters.append(original_master)
    next_master_cell, next_master_task = self.env.get_next_master(
        keyspace, shard_name, cross_cell)
    next_master = {
        'cell': next_master_cell,
        'task': next_master_task,
    }
    next_masters.append(next_master)
    self.env.wait_for_good_failover_status(keyspace, shard_name)
    # Call Reparent in a separate thread.
    # All per-shard values are passed via args (not closed over) so the
    # worker uses this iteration's values; keyspace/num_shards/external
    # are safely closed over since they don't change per iteration.
    def reparent_shard(shard, shard_name, original_master, next_master):
      logging.info('Reparenting %s/%s from %s to %s', keyspace, shard_name,
                   original_master, next_master)
      reparent_fn = (
          self.env.external_reparent if external else
          self.env.internal_reparent)
      return_code, return_output = reparent_fn(
          keyspace, next_master['cell'], shard, num_shards,
          new_task_num=next_master['task'])
      logging.info('Reparent returned %d for %s/%s: %s', return_code,
                   keyspace, shard_name, return_output)
    thread = threading.Thread(target=reparent_shard,
                              args=[shard, shard_name, original_master,
                                    next_master])
    start_time = time.time()
    thread.start()
    if not cross_cell:
      # Wait for the shard to be updated.
      # This doesn't work for cross-cell, because mapping a task
      # number to a tablet UID is more trouble than it's worth.
      # NOTE(review): this assumes UIDs within a shard differ only by
      # task number — confirm against the environment's alias scheme.
      uid = (self.env.get_tablet_uid(original_master_name)
             - original_master['task'] + next_master['task'])
      while True:
        if time.time() - start_time > self.reparent_timeout_threshold:
          self.fail('Timed out waiting for serving graph update on %s/%s' % (
              keyspace, shard_name))
        try:
          shard_info = json.loads(self.env.vtctl_helper.execute_vtctl_command(
              ['GetShard', '%s/%s' % (keyspace, shard_name)]))
          if int(shard_info['master_alias']['uid']) == uid:
            duration = time.time() - start_time
            durations.append(duration)
            logging.info('Shard record updated for %s/%s after %f seconds',
                         keyspace, shard_name, duration)
            break
        # Transient topology states / vtctl errors: keep polling until
        # the timeout above fires.
        except (IndexError, KeyError, vtctl_helper.VtctlClientError):
          pass
    # Joining inside the loop serializes reparents: one shard at a time.
    thread.join()
  # Second pass: independently verify each shard's master converged.
  for shard_name, next_master in zip(shard_names, next_masters):
    start_time = time.time()
    while True:
      if time.time() - start_time > self.reparent_timeout_threshold:
        self.fail('%s/%s master was not updated to %s within %d seconds' % (
            keyspace, shard_name, next_master,
            self.reparent_timeout_threshold))
      if self.verify_new_master(
          keyspace, shard_name, next_master['cell'], next_master['task']):
        logging.info('%s/%s\'s new master is %s', keyspace, shard_name,
                     next_master)
        break
      time.sleep(1)
  return durations
def use_named(self, instance_name):
  """Connects this environment to an existing named k8s Vitess cluster.

  Discovers vtctld and per-cell vtgate addresses via kubectl, then
  populates keyspace/shard/tablet bookkeeping from vtctl commands.

  Args:
    instance_name: k8s namespace of the cluster to attach to (string).

  Raises:
    base_environment.VitessEnvironmentError: if kubectl is missing or no
      keyspaces are found.
  """
  # Check to make sure kubectl exists
  try:
    subprocess.check_output(['kubectl'])
  except OSError:
    raise base_environment.VitessEnvironmentError(
        'kubectl not found, please install by visiting kubernetes.io or '
        'running gcloud components update kubectl if using compute engine.')

  # Go template that extracts a service's load-balancer ingress IP.
  get_address_template = (
      '{{if ge (len .status.loadBalancer) 1}}'
      '{{index (index .status.loadBalancer.ingress 0) "ip"}}'
      '{{end}}')

  get_address_params = ['kubectl', 'get', '-o', 'template', '--template',
                        get_address_template, 'service', '--namespace',
                        instance_name]

  # Poll for up to 60 seconds until the vtctld service has an address.
  start_time = time.time()
  vtctld_addr = ''
  while time.time() - start_time < 60 and not vtctld_addr:
    vtctld_addr = subprocess.check_output(
        get_address_params + ['vtctld'], stderr=subprocess.STDOUT)
  self.vtctl_addr = '%s:15999' % vtctld_addr
  self.vtctl_helper = vtctl_helper.VtctlHelper('grpc', self.vtctl_addr)
  self.cluster_name = instance_name

  keyspaces = self.vtctl_helper.execute_vtctl_command(['GetKeyspaces'])
  self.mobs = filter(None, keyspaces.split('\n'))
  self.keyspaces = self.mobs

  if not self.keyspaces:
    raise base_environment.VitessEnvironmentError(
        'Invalid environment, no keyspaces found')

  self.num_shards = []
  for keyspace in self.keyspaces:
    keyspace_info = json.loads(self.vtctl_helper.execute_vtctl_command(
        ['GetKeyspace', keyspace]))
    if not keyspace_info:
      # No keyspace record: assume a single unsharded shard.
      self.num_shards.append(1)
    else:
      self.num_shards.append(keyspace_info['split_shard_count'])

  # This assumes that all keyspaces use the same set of cells
  self.cells = json.loads(self.vtctl_helper.execute_vtctl_command(
      ['GetShard', '%s/%s' % (
          self.keyspaces[0], utils.get_shard_name(0, self.num_shards[0]))]
      ))['cells']
  self.primary_cells = self.cells
  self.replica_instances = []
  self.rdonly_instances = []

  # This assumes that all cells are equivalent for k8s environments.
  all_tablets_in_a_cell = self.vtctl_helper.execute_vtctl_command(
      ['ListAllTablets', self.cells[0]])
  all_tablets_in_a_cell = [x.split(' ') for x in
                           filter(None, all_tablets_in_a_cell.split('\n'))]

  for index, keyspace in enumerate(self.keyspaces):
    keyspace_tablets_in_cell = [
        tablet for tablet in all_tablets_in_a_cell if tablet[1] == keyspace]
    replica_tablets_in_cell = [
        tablet for tablet in keyspace_tablets_in_cell
        if tablet[3] == 'master' or tablet[3] == 'replica']
    replica_instances = len(replica_tablets_in_cell) / self.num_shards[index]
    self.replica_instances.append(replica_instances)
    self.rdonly_instances.append(
        (len(keyspace_tablets_in_cell) / self.num_shards[index]) -
        replica_instances)

  # Converts keyspace name and alias to number of instances
  self.keyspace_alias_to_num_instances_dict = {}
  for index, keyspace in enumerate(self.keyspaces):
    self.keyspace_alias_to_num_instances_dict[keyspace] = {
        'replica': int(self.replica_instances[index]),
        'rdonly': int(self.rdonly_instances[index])
    }

  start_time = time.time()
  self.vtgate_addrs = {}
  self.vtgate_conns = {}
  for cell in self.cells:
    # Bug fix: the original loop tested self.vtgate_addr, which the loop
    # body never updated (it assigned the local vtgate_addr), so each
    # cell spun kubectl for the full 60s budget. Poll the local instead.
    self.vtgate_addr = ''
    vtgate_addr = ''
    while time.time() - start_time < 60 and not vtgate_addr:
      vtgate_addr = subprocess.check_output(
          get_address_params + ['vtgate-%s' % cell],
          stderr=subprocess.STDOUT)
    self.vtgate_addrs[cell] = '%s:15001' % vtgate_addr
    self.vtgate_conns[cell] = vtgate_client.connect(
        protocols_flavor.protocols_flavor().vtgate_python_protocol(),
        self.vtgate_addrs[cell], 60)
def explicit_reparent(self, keyspace, num_shards, external=False,
                      cross_cell=False):
  """Performs an explicit reparent.

  This function will call decider to explicity select a new master and verify
  that the topology is updated.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    num_shards: Total number of shards (int)
    external: Whether the reparent should be external or through vtctl (bool)
    cross_cell: Whether to reparent to a different cell (bool)

  Returns:
    How long we waited for the serving graph to be updated.
      The time begins just before calling Decider.
      This is a list of floats, one for each shard.
      For cross-cell reparents, it returns [].
  """
  original_masters = []
  next_masters = []
  shard_names = []
  durations = []
  for shard in xrange(num_shards):
    shard_name = utils.get_shard_name(shard, num_shards)
    shard_names.append(shard_name)
    # Record the current master and the candidate the environment picks.
    original_master_name = self.env.get_current_master_name(
        keyspace, shard_name)
    original_master = {
        'cell': self.env.get_tablet_cell(original_master_name),
        'task': self.env.get_tablet_task_number(original_master_name),
    }
    original_masters.append(original_master)
    next_master_cell, next_master_task = self.env.get_next_master(
        keyspace, shard_name, cross_cell)
    next_master = {
        'cell': next_master_cell,
        'task': next_master_task,
    }
    next_masters.append(next_master)
    self.env.wait_for_good_failover_status(keyspace, shard_name)
    # Call Reparent in a separate thread.
    # Per-shard values are passed through args (not closed over) so the
    # worker sees this iteration's values.
    def reparent_shard(shard, shard_name, original_master, next_master):
      logging.info('Reparenting %s/%s from %s to %s', keyspace, shard_name,
                   original_master, next_master)
      reparent_fn = (self.env.external_reparent if external else
                     self.env.internal_reparent)
      return_code, return_output = reparent_fn(
          keyspace, next_master['cell'], shard, num_shards,
          new_task_num=next_master['task'])
      logging.info('Reparent returned %d for %s/%s: %s', return_code,
                   keyspace, shard_name, return_output)
    thread = threading.Thread(
        target=reparent_shard,
        args=[shard, shard_name, original_master, next_master])
    start_time = time.time()
    thread.start()
    if not cross_cell:
      # Wait for the serving graph to be updated.
      # This doesn't work for cross-cell, because mapping a task
      # number to a tablet UID is more trouble than it's worth.
      # NOTE(review): assumes UIDs within a shard differ only by task
      # number — confirm against the environment's alias scheme.
      uid = (self.env.get_tablet_uid(original_master_name)
             - original_master['task'] + next_master['task'])
      while True:
        if time.time() - start_time > self.reparent_timeout_threshold:
          self.fail(
              'Timed out waiting for serving graph update on %s/%s'
              % (keyspace, shard_name))
        try:
          endpoints = json.loads(
              self.env.vtctl_helper.execute_vtctl_command([
                  'GetEndPoints', next_master['cell'],
                  '%s/%s' % (keyspace, shard_name), 'master'
              ]))
          if int(endpoints['entries'][0]['uid']) == uid:
            duration = time.time() - start_time
            durations.append(duration)
            logging.info(
                'Serving graph updated for %s/%s after %f seconds',
                keyspace, shard_name, duration)
            break
        # Transient topology states / vtctl errors: keep polling until
        # the timeout above fires.
        except (IndexError, KeyError, vtctl_helper.VtctlClientError):
          pass
    # Joining inside the loop serializes reparents: one shard at a time.
    thread.join()
  # Second pass: independently verify each shard's master converged.
  for shard_name, next_master in zip(shard_names, next_masters):
    start_time = time.time()
    while True:
      if time.time() - start_time > self.reparent_timeout_threshold:
        self.fail(
            '%s/%s master was not updated to %s within %d seconds'
            % (keyspace, shard_name, next_master,
               self.reparent_timeout_threshold))
      if self.verify_new_master(keyspace, shard_name, next_master['cell'],
                                next_master['task']):
        logging.info('%s/%s\'s new master is %s', keyspace, shard_name,
                     next_master)
        break
      time.sleep(1)
  return durations
def use_named(self, instance_name):
  """Attaches this environment to an existing named k8s Vitess cluster.

  Resolves vtctld and per-cell vtgate service addresses via kubectl and
  initializes keyspace/shard/tablet bookkeeping via vtctl.

  Args:
    instance_name: k8s namespace of the cluster to attach to (string).

  Raises:
    base_environment.VitessEnvironmentError: if kubectl is missing or no
      keyspaces are found.
  """
  # Check to make sure kubectl exists
  try:
    subprocess.check_output(['kubectl'])
  except OSError:
    raise base_environment.VitessEnvironmentError(
        'kubectl not found, please install by visiting kubernetes.io or '
        'running gcloud components update kubectl if using compute engine.')

  # Go template extracting a service's load-balancer ingress IP.
  get_address_template = (
      '{{if ge (len .status.loadBalancer) 1}}'
      '{{index (index .status.loadBalancer.ingress 0) "ip"}}'
      '{{end}}')

  get_address_params = ['kubectl', 'get', '-o', 'template', '--template',
                        get_address_template, 'service', '--namespace',
                        instance_name]

  # Poll up to 60 seconds for the vtctld service address.
  start_time = time.time()
  vtctld_addr = ''
  while time.time() - start_time < 60 and not vtctld_addr:
    vtctld_addr = subprocess.check_output(
        get_address_params + ['vtctld'], stderr=subprocess.STDOUT)
  self.vtctl_addr = '%s:15999' % vtctld_addr
  self.vtctl_helper = vtctl_helper.VtctlHelper('grpc', self.vtctl_addr)
  self.cluster_name = instance_name

  keyspaces = self.vtctl_helper.execute_vtctl_command(['GetKeyspaces'])
  self.mobs = filter(None, keyspaces.split('\n'))
  self.keyspaces = self.mobs

  if not self.keyspaces:
    raise base_environment.VitessEnvironmentError(
        'Invalid environment, no keyspaces found')

  self.num_shards = []
  for keyspace in self.keyspaces:
    keyspace_info = json.loads(self.vtctl_helper.execute_vtctl_command(
        ['GetKeyspace', keyspace]))
    if not keyspace_info:
      # No keyspace record: assume a single unsharded shard.
      self.num_shards.append(1)
    else:
      self.num_shards.append(keyspace_info['split_shard_count'])

  # This assumes that all keyspaces use the same set of cells
  self.cells = json.loads(self.vtctl_helper.execute_vtctl_command(
      ['GetShard', '%s/%s' % (
          self.keyspaces[0], utils.get_shard_name(0, self.num_shards[0]))]
      ))['cells']
  self.primary_cells = self.cells
  self.replica_instances = []
  self.rdonly_instances = []

  # This assumes that all cells are equivalent for k8s environments.
  all_tablets_in_a_cell = self.vtctl_helper.execute_vtctl_command(
      ['ListAllTablets', self.cells[0]])
  all_tablets_in_a_cell = [x.split(' ') for x in
                           filter(None, all_tablets_in_a_cell.split('\n'))]

  for index, keyspace in enumerate(self.keyspaces):
    keyspace_tablets_in_cell = [
        tablet for tablet in all_tablets_in_a_cell if tablet[1] == keyspace]
    replica_tablets_in_cell = [
        tablet for tablet in keyspace_tablets_in_cell
        if tablet[3] == 'master' or tablet[3] == 'replica']
    replica_instances = len(replica_tablets_in_cell) / self.num_shards[index]
    self.replica_instances.append(replica_instances)
    self.rdonly_instances.append(
        (len(keyspace_tablets_in_cell) / self.num_shards[index]) -
        replica_instances)

  # Converts keyspace name and alias to number of instances
  self.keyspace_alias_to_num_instances_dict = {}
  for index, keyspace in enumerate(self.keyspaces):
    self.keyspace_alias_to_num_instances_dict[keyspace] = {
        'replica': int(self.replica_instances[index]),
        'rdonly': int(self.rdonly_instances[index])
    }

  start_time = time.time()
  self.vtgate_addrs = {}
  self.vtgate_conns = {}
  for cell in self.cells:
    # Bug fix: the original loop tested self.vtgate_addr, which the loop
    # body never updated (it assigned the local vtgate_addr), so each
    # cell spun kubectl for the full 60s budget. Poll the local instead.
    self.vtgate_addr = ''
    vtgate_addr = ''
    while time.time() - start_time < 60 and not vtgate_addr:
      vtgate_addr = subprocess.check_output(
          get_address_params + ['vtgate-%s' % cell],
          stderr=subprocess.STDOUT)
    self.vtgate_addrs[cell] = '%s:15001' % vtgate_addr
    self.vtgate_conns[cell] = vtgate_client.connect(
        protocols_flavor.protocols_flavor().vtgate_python_protocol(),
        self.vtgate_addrs[cell], 60)