    def _post(self, component=None, pattern=None, node=None, role=None):
        # if 'pd_servers' in self._diff:
        #     reload_pd = True
        # else:
        #     reload_pd = False
        self.topology.replace(self._new_topo)

        term.info('Update configuration.')
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology._topology(
                                       self._new_topo),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer', pattern='drainer_servers')
        act.deploy_component(component='prometheus', pattern='monitoring_server')
        act.stop_component(component='prometheus', pattern='monitoring_server')
        act.start_component(component='prometheus', pattern='monitoring_server')
        term.notice('Finished scaling out.')
class OprStop(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprStop, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Stop specified node in cluster.')
        elif role:
            term.notice('Stop specified role in cluster.')
        else:
            term.notice('Stop TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        # stop services in reverse order of the service group
        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.normal('Stopping {}.'.format(component))
                self.act.stop_component(component, pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Stopping {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.stop_component(component, pattern, ','.join(_uuid))

    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished stopping.')
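
# Illustrative driver for OprStop above -- a minimal sketch, not part of the
# original module. How an operation is normally kicked off is decided by
# OperationBase; the lifecycle hooks are called directly here only to show
# the order they run in. `args` and `topo` are assumed to be a parsed
# argument object and an existing Topology instance.
#
#   op = OprStop(args=args, topology=topo)
#   op._process()   # checks ssh, then stops each service group in reverse order
#   op._post()      # prints the completion notice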
class OprScaleIn(OperationBase):
    def __init__(self, args=None, topology=None, node=None):
        if not node:
            msg = 'Node ID not specified.'
            term.error(msg)
            raise exceptions.TiOPSConfigError(msg)

        self._new_topo, self._diff = topology.remove(node)
        super(OprScaleIn, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.notice('Begin deleting node(s) from the TiDB cluster.')
        self._cluster = modules.ClusterAPI(topology=self.topology)
        self._pd_status = self._cluster.status()
        self._tikv_stores = self._cluster.tikv_stores()

    def _process(self, component=None, pattern=None, node=None, role=None):
        _unhealth_node = []
        for _pd_node in self._cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = 'Some PD nodes are unhealthy, the server may be stopped or the network unreachable, unhealthy node list: {}'.format(
                    ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='scaleIn')

        _current_pd_num = len(self._pd_status)
        # count the stores returned by PD, not the keys of the response dict
        _current_tikv_num = len(self._tikv_stores['stores'])

        # refuse to remove every node of a required component
        if 'pd_servers' in self._diff and len(
                self._diff['pd_servers']) == _current_pd_num:
            term.fatal('Cannot delete all PD nodes.')
            exit(1)

        if 'tikv_servers' in self._diff and len(
                self._diff['tikv_servers']) == _current_tikv_num:
            term.fatal('Cannot delete all TiKV nodes.')
            exit(1)

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service, self._diff)
            if not component and not pattern:
                continue
            uuid = [x['uuid'] for x in self._diff[pattern]]
            term.normal('Delete {}, node list: {}.'.format(
                component, ','.join(uuid)))
            for _uuid in uuid:
                self.__delete_component(self._diff, component, pattern, _uuid)
                # stateful components (tikv, pump, drainer) are only taken
                # offline here; they are destroyed later by check_tombstone()
                if component not in ['tikv', 'pump', 'drainer']:
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.destroy_component(component=component,
                                               pattern=pattern,
                                               node=_uuid)
                if component != 'blackbox_exporter':
                    self.topology.replace(self.topology.remove(_uuid)[0])

    def _post(self, component=None, pattern=None, node=None, role=None):
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology(),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer', pattern='drainer_servers')
        # self.deploy.deploy_component(component='prometheus', pattern='monitoring_server', ans=ans)
        # self.reload.do(component='prometheus', pattern='monitoring_server')
        term.notice('Finished scaling in.')

    def __delete_component(self, config=None, component=None, pattern=None, uuid=None):
        if component == 'pd':
            try:
                self._cluster.del_pd(uuid)
            except exceptions.TiOPSException as e:
                term.fatal('Unable to delete PD node from cluster: {}'.format(e))
                exit(1)

        if component == 'tikv':
            _tikv_info = ''
            for _tikv_node in config[pattern]:
                if _tikv_node['uuid'] != uuid:
                    continue
                if _tikv_node['offline']:
                    return
                _tikv_info = _tikv_node
            for ctikv in self._tikv_stores['stores']:
                # check if the node is in the cluster
                if '{}:{}'.format(_tikv_info['ip'],
                                  _tikv_info['port']) == ctikv['store']['address']:
                    _store_id = ctikv['store']['id']
                    # delete the store through the PD API
                    try:
                        self._cluster.del_store(_store_id)
                    except exceptions.TiOPSException as e:
                        term.fatal('Unable to delete store: {}'.format(e))
                        exit(1)

        if component == 'drainer':
            _binlog = modules.BinlogAPI(topology=self.topology)
            _binlog.delete_drainer(node_id=uuid)

        if component == 'pump':
            _binlog = modules.BinlogAPI(topology=self.topology)
            _binlog.delete_pump(node_id=uuid)
    def check_tombstone(self, topology=None, args=None):
        if not topology:
            topology = self.topology
        if not args:
            args = self._args

        _remove_uuid = []

        _cluster = ClusterAPI(topology)
        _binlog = BinlogAPI(topology)

        if _cluster.tikv_stores() and _cluster.tikv_tombstone():
            # get tombstone tikv nodes
            for _node in topology()['tikv_servers']:
                _tombstone = False
                if not _node['offline']:
                    continue
                # online tikv node list
                _online_list = [
                    x['store']['address']
                    for x in _cluster.tikv_stores()['stores']
                ]
                # tombstone status tikv list
                _tombstone_list = [
                    x['store']['address']
                    for x in _cluster.tikv_tombstone()['stores']
                ]
                _address = '{}:{}'.format(_node['ip'], _node['port'])
                # if the node is still online, skip it
                if _address in _online_list:
                    continue
                # if the node is tombstone, delete it from the topology
                elif _address in _tombstone_list:
                    _remove_uuid.append(_node['uuid'])

        if _binlog.pump_status:
            # get tombstone pump nodes
            for _node in topology()['pump_servers']:
                _tombstone = False
                if not _node['offline']:
                    continue
                _online_list = [
                    x['nodeId']
                    for x in _binlog.pump_status['status'].itervalues()
                    if x['state'] != 'offline'
                ]
                _tombstone_list = [
                    x['nodeId']
                    for x in _binlog.pump_status['status'].itervalues()
                    if x['state'] == 'offline'
                ]
                if _node['uuid'] in _online_list:
                    continue
                elif _node['uuid'] in _tombstone_list:
                    _remove_uuid.append(_node['uuid'])

            # get tombstone drainer nodes
            for _node in topology()['drainer_servers']:
                _tombstone = False
                if not _node['offline']:
                    continue
                _online_list = [
                    x['nodeId'] for x in _binlog.drainer_status
                    if x['state'] != 'offline'
                ]
                _tombstone_list = [
                    x['nodeId'] for x in _binlog.drainer_status
                    if x['state'] == 'offline'
                ]
                if _node['uuid'] in _online_list:
                    continue
                elif _node['uuid'] in _tombstone_list:
                    _remove_uuid.append(_node['uuid'])

        if not _remove_uuid:
            return

        _new_topo, _diff = topology.remove(','.join(_remove_uuid), delete=True)

        ans = ansibleapi.ANSRunner(user=topology.user,
                                   topology=_diff,
                                   tiargs=args)
        act = Action(ans=ans, topo=topology)
        for service in [{'drainer': 'drainer_servers'},
                        {'pump': 'pump_servers'},
                        {'tikv': 'tikv_servers'}]:
            component, pattern = self.check_exist(service, _diff)
            if not component and not pattern:
                continue
            act.stop_component(component=component, pattern=pattern)
            act.destroy_component(component=component, pattern=pattern)

        topology.replace(_new_topo)
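
    # Note on the scale-in flow: stateful components (tikv, pump, drainer)
    # are not stopped and destroyed in _process(). __delete_component() only
    # asks PD or the binlog API to take them offline, and the nodes stay in
    # the topology flagged as 'offline'. Once they show up in the tombstone
    # lists, check_tombstone() stops and destroys them and removes them from
    # the topology with topology.remove(..., delete=True).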
class OprReload(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprReload, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Reload specified node in cluster.')
        elif role:
            term.notice('Reload specified role in cluster.')
        else:
            term.notice('Reload TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        _cluster = modules.ClusterAPI(topology=self.topology)
        _unhealth_node = []
        for _pd_node in _cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = 'Some PD nodes are unhealthy, the server may be stopped or the network unreachable, unhealthy node list: {}'.format(
                    ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='reload')

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        # every iteration should only contain one item
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service=service,
                                                  config=_topology)
            if not component and not pattern:
                continue

            # reload pd servers, reload the leader node last
            if component == 'pd':
                _pd_list = []
                for _node in _topology[pattern]:
                    if _node['uuid'] == _cluster.pd_leader():
                        _leader = _node
                    else:
                        _pd_list.append(_node)
                _pd_list.append(_leader)

                for _node in _pd_list:
                    _uuid = _node['uuid']
                    _host = _node['ip']
                    term.normal('Reload {}, node id: {}.'.format(
                        component, _uuid))
                    if _uuid == _cluster.pd_leader():
                        _cluster.evict_pd_leader(uuid=_uuid)

                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=_uuid)
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=_uuid)
                continue

            if pattern in [
                    'monitored_servers', 'monitoring_server',
                    'grafana_server', 'alertmanager_server'
            ]:
                if not node:
                    term.normal('Reload {}.'.format(component))
                    self.act.deploy_component(component=component,
                                              pattern=pattern)
                    self.act.stop_component(component=component,
                                            pattern=pattern)
                    self.act.start_component(component=component,
                                             pattern=pattern)
                else:
                    _uuid = [x['uuid'] for x in _topology[pattern]]
                    term.normal('Reload {}, node list: {}.'.format(
                        component, ','.join(_uuid)))
                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=','.join(_uuid))
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=','.join(_uuid))
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=','.join(_uuid))
                continue

            for _node in _topology[pattern]:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Reload {}, node id: {}.'.format(component, _uuid))
                if pattern == 'tikv_servers':
                    _port = _node['port']
                    _cluster.evict_store_leaders(host=_host, port=_port)
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=_uuid)
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=_uuid)
                if pattern == 'tikv_servers':
                    _cluster.remove_evict(host=_host, port=_port)

    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished reloading config for the {} cluster.'.format(
            self.topology.version))
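
    # Rolling order used by _process() above (and mirrored by OprUpgrade
    # below): PD nodes are reloaded one at a time with the current leader
    # last, and the leader is asked to step down via evict_pd_leader()
    # before its restart; for TiKV, region leaders are evicted from the
    # store before the restart and the eviction is removed afterwards, so
    # the impact on client traffic stays small.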
class OprUpgrade(OprDeploy):
    def __init__(self, args=None, topology=None):
        super(OprUpgrade, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)
        try:
            self.arg_ver = args.tidb_version
        except AttributeError:
            raise exceptions.TiOPSConfigError(
                '--tidb-version is not set for upgrade, abort.')
        try:
            self.force = args.force
        except AttributeError:
            self.force = False

    # Check versions; this updates version-related variables in memory,
    # but does not write them to disk.
    def __check_version(self):
        new_ver = self.arg_ver.lstrip('v')
        curr_ver = self.topology.version.lstrip('v')

        _cmp = semver.compare(curr_ver, new_ver)
        if _cmp == 0:
            raise exceptions.TiOPSArgumentError(
                'Already running version {}.'.format(curr_ver))
        elif _cmp > 0:
            raise exceptions.TiOPSRuntimeError(
                'Downgrade is not supported, keep running {}.'.format(curr_ver),
                operation='upgrade')

        # update version and related variables
        self.old_ver = curr_ver
        self.new_ver = new_ver
        self.topology.version = 'v{}'.format(new_ver)
        self.topology.tiversion_dir = os.path.join(
            self.topology.tidown_dir, '{}'.format(self.topology.version))
        self.topology.resource_dir = utils.profile_path(
            'downloads', '{}/resources'.format(self.topology.version))
        self.topology.dashboard_dir = utils.profile_path(
            'downloads', '{}/dashboards'.format(self.topology.version))
        self.topology.package_dir = utils.profile_path(
            'downloads', '{}/packages'.format(self.topology.version))
        self.topology.config_dir = utils.profile_path(
            'downloads', '{}/configs'.format(self.topology.version))

    # Check if the configuration of the TiDB components is reasonable
    def _check_config(self, topology=None):
        if not topology:
            topology = self.topology()
        _servers = [
            {'pd': 'pd_servers'},
            {'tikv': 'tikv_servers'},
            {'tidb': 'tidb_servers'},
        ]
        for _service in _servers:
            _component, _pattern = self.check_exist(_service, config=topology)
            if not _component and not _pattern:
                continue
            term.info('Check {} configuration.'.format(_component))
            self.act.configCheck(component=_component,
                                 pattern=_pattern,
                                 node=topology[_pattern][0]['uuid'])

    # TODO: check and merge configs
    def __check_config(self):
        pass

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        # check versions before processing
        self.__check_version()
        term.notice('Upgrading from v{} to v{}.'.format(
            self.old_ver, self.new_ver))

        # download packages for the new version
        term.info('Downloading TiDB related binaries, this may take a few minutes.')
        try:
            _local = self._args.local_pkg
        except AttributeError:
            _local = None
        self.act.download(version=self.new_ver, local_pkg=_local)

        # check configs
        self.__check_config()

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Upgrade specified node in cluster.')
        elif role:
            term.notice('Upgrade specified role in cluster.')
        else:
            term.notice('Upgrade TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        if self._args.enable_check_config:
            self._check_config()

        # for service in ['pd', 'tikv', 'pump', 'tidb']:
        #     grp = [x for x in self.topology.service_group if service in x.keys()]

        _cluster = modules.ClusterAPI(topology=self.topology)
        _unhealth_node = []
        for _pd_node in _cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = 'Some PD nodes are unhealthy, the server may be stopped or the network unreachable, unhealthy node list: {}'.format(
                    ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='upgrade')

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        if self.force:
            for service in self.topology.service_group:
                component, pattern = self.check_exist(service=service,
                                                      config=_topology)
                if not component and not pattern:
                    continue
                if pattern in ['monitored_servers', 'monitoring_server',
                               'grafana_server', 'alertmanager_server']:
                    term.normal('Upgrade {}.'.format(component))
                    self.act.deploy_component(component=component,
                                              pattern=pattern)
                    self.act.stop_component(component=component,
                                            pattern=pattern)
                    self.act.start_component(component=component,
                                             pattern=pattern)
                    continue
                for _node in _topology[pattern]:
                    _uuid = _node['uuid']
                    term.normal('Upgrade {}, node id: {}.'.format(
                        component, _uuid))
                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=_uuid)
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=_uuid)
            return

        # every iteration should only contain one item
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service=service,
                                                  config=_topology)
            if not component and not pattern:
                continue
            # upgrade pd servers, upgrade the leader node last
            if component == 'pd':
                _pd_list = []
                for _node in _topology[pattern]:
                    if _node['uuid'] == _cluster.pd_leader():
                        _leader = _node
                    else:
                        _pd_list.append(_node)
                _pd_list.append(_leader)

                for _node in _pd_list:
                    _uuid = _node['uuid']
                    _host = _node['ip']
                    term.normal('Upgrade {}, node id: {}.'.format(
                        component, _uuid))
                    if _uuid == _cluster.pd_leader():
                        _cluster.evict_pd_leader(uuid=_uuid)
                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=_uuid)
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=_uuid)
                continue

            if pattern in ['monitored_servers', 'monitoring_server',
                           'grafana_server', 'alertmanager_server']:
                term.normal('Upgrade {}.'.format(component))
                self.act.deploy_component(component=component, pattern=pattern)
                self.act.stop_component(component=component, pattern=pattern)
                self.act.start_component(component=component, pattern=pattern)
                continue

            for _node in _topology[pattern]:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Upgrade {}, node id: {}.'.format(component, _uuid))
                if pattern == 'tikv_servers':
                    _port = _node['port']
                    _cluster.evict_store_leaders(host=_host, port=_port)
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=_uuid)
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=_uuid)
                if pattern == 'tikv_servers':
                    _cluster.remove_evict(host=_host, port=_port)

    def _post(self, component=None, pattern=None, node=None, role=None):
        self.topology.set_meta(version=self.new_ver)
        term.notice('Upgraded to {}.'.format(self.topology.version))
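
# Illustrative sketch of the inputs OprUpgrade above reads from `args` -- the
# attribute names come from the code, the value shown is only an example and
# `topo` is an assumed Topology instance, not part of the original module.
#
#   args.tidb_version         # required, e.g. 'v3.0.12'; missing -> TiOPSConfigError
#   args.force                # optional; skip the leader-aware rolling order
#   args.local_pkg            # optional; forwarded to act.download() as local_pkg
#   args.enable_check_config  # optional; run configCheck before upgrading
#
#   op = OprUpgrade(args=args, topology=topo)
#   op._prepare()   # version comparison + package download
#   op._process()   # rolling upgrade (PD leader last, TiKV leaders evicted)
#   op._post()      # persist the new version via topology.set_meta()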
class OprDestroy(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprDestroy, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.warn('The TiDB cluster {} ({}) is going to be destroyed.'.format(
            self.topology.cluster_name, self.topology.version))
        rm_prompt = 'This operation will ' + term.warn_red('remove') \
            + ' the TiDB cluster ' + term.highlight_red(self.topology.cluster_name) \
            + '. It can NOT be undone. ' + term.yes_no() + ':'
        notice = term.input(rm_prompt)
        if notice.lower() not in ['y', 'yes']:
            term.notice('Terminate the destroy operation.')
            raise exceptions.TiOPSRuntimeError('Operation cancelled by user.')

    def _process(self, component=None, pattern=None, node=None, role=None):
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        term.info('Stopping TiDB cluster.')
        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            try:
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=node)
            except exceptions.TiOPSWarning as e:
                term.debug(str(e))
                pass

        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            term.normal('{} is being destroyed.'.format(component))
            try:
                self.act.destroy_component(component=component,
                                           pattern=pattern,
                                           node=node)
            except exceptions.TiOPSWarning as e:
                term.debug(str(e))
                pass

        # remove deploy and data directories
        self.ans.run_model('shell',
                           'rm -rf {{ full_deploy_dir | cluster_dir }}',
                           become=True,
                           group='*')
        self.ans.run_model('shell',
                           'rm -rf {{ full_data_dir | cluster_dir }}',
                           become=True,
                           group='*')

    def _post(self, component=None, pattern=None, node=None, role=None):
        try:
            utils.remove_dir(utils.profile_path(self.topology.cluster_dir))
        except Exception as e:
            logging.warning(e)
        term.notice('TiDB cluster destroyed.')