def check_tombstone(self, topology=None, args=None):
    """Purge tombstone tikv/pump/drainer nodes from the topology.

    A node marked ``offline`` in the topology is only removed once the
    cluster reports it as tombstone; nodes that are still online (or not
    yet tombstone) are left untouched.  Removed nodes are stopped and
    destroyed on their hosts, then dropped from the topology file.

    :param topology: topology object; defaults to ``self.topology``
    :param args: CLI args forwarded to ansible; defaults to ``self._args``
    """
    if not topology:
        topology = self.topology
    if not args:
        args = self._args
    _remove_uuid = []
    _cluster = ClusterAPI(topology)
    _binlog = BinlogAPI(topology)

    if _cluster.tikv_stores() and _cluster.tikv_tombstone():
        # Address lists are loop-invariant: compute them once instead of
        # rebuilding both comprehensions for every tikv node.
        # online tikv node list
        _online_list = [
            x['store']['address'] for x in _cluster.tikv_stores()['stores']
        ]
        # tombstone status tikv list
        _tombstone_list = [
            x['store']['address']
            for x in _cluster.tikv_tombstone()['stores']
        ]
        # get tombstone tikv node
        for _node in topology()['tikv_servers']:
            if not _node['offline']:
                continue
            _address = '{}:{}'.format(_node['ip'], _node['port'])
            # if node is online, skip it
            if _address in _online_list:
                continue
            # if node is tombstone, will delete it from topology
            elif _address in _tombstone_list:
                _remove_uuid.append(_node['uuid'])

    if _binlog.pump_status:
        # NOTE(review): .itervalues() is Python 2 only — confirm the
        # supported interpreter; on Python 3 this would raise AttributeError.
        _pump_online = [
            x['nodeId']
            for x in _binlog.pump_status['status'].itervalues()
            if x['state'] != 'offline'
        ]
        _pump_tombstone = [
            x['nodeId']
            for x in _binlog.pump_status['status'].itervalues()
            if x['state'] == 'offline'
        ]
        # get tombstone pump node
        for _node in topology()['pump_servers']:
            if not _node['offline']:
                continue
            if _node['uuid'] in _pump_online:
                continue
            elif _node['uuid'] in _pump_tombstone:
                _remove_uuid.append(_node['uuid'])

        _drainer_online = [
            x['nodeId'] for x in _binlog.drainer_status
            if x['state'] != 'offline'
        ]
        _drainer_tombstone = [
            x['nodeId'] for x in _binlog.drainer_status
            if x['state'] == 'offline'
        ]
        # get tombstone drainer node
        for _node in topology()['drainer_servers']:
            if not _node['offline']:
                continue
            if _node['uuid'] in _drainer_online:
                continue
            elif _node['uuid'] in _drainer_tombstone:
                _remove_uuid.append(_node['uuid'])

    if not _remove_uuid:
        return

    _new_topo, _diff = topology.remove(','.join(_remove_uuid), delete=True)
    ans = ansibleapi.ANSRunner(user=topology.user, topology=_diff,
                               tiargs=args)
    act = Action(ans=ans, topo=topology)
    # stop and destroy in reverse dependency order: drainer, pump, tikv
    for service in [{'drainer': 'drainer_servers'},
                    {'pump': 'pump_servers'},
                    {'tikv': 'tikv_servers'}]:
        component, pattern = self.check_exist(service, _diff)
        if not component and not pattern:
            continue
        act.stop_component(component=component, pattern=pattern)
        act.destroy_component(component=component, pattern=pattern)
    topology.replace(_new_topo)
class OprScaleIn(OperationBase):
    """Scale in a TiDB cluster by removing the specified node(s).

    Nodes are first deleted from the running cluster through the PD /
    binlog APIs; stateless components are stopped and destroyed on their
    hosts immediately, while tikv/pump/drainer are only taken offline and
    reaped later once the cluster marks them tombstone.
    """

    def __init__(self, args=None, topology=None, node=None):
        if not node:
            msg = 'Node ID not specified.'
            term.error(msg)
            raise exceptions.TiOPSConfigError(msg)
        # _new_topo: topology after removal; _diff: the removed part
        self._new_topo, self._diff = topology.remove(node)
        super(OprScaleIn, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.notice('Begin delete node for TiDB cluster.')
        self._cluster = modules.ClusterAPI(topology=self.topology)
        # cache current cluster state for the sanity checks in _process
        self._pd_status = self._cluster.status()
        self._tikv_stores = self._cluster.tikv_stores()

    def _process(self, component=None, pattern=None, node=None, role=None):
        # Refuse to scale in while any PD node is unhealthy: raise only
        # after the scan, once the full list of bad nodes is known.
        _unhealth_node = []
        for _pd_node in self._cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
        if _unhealth_node:
            # fixed typo: "stoppd" -> "stopped"
            msg = ('Some pd node is unhealthy, maybe server stopped or '
                   'network unreachable, unhealthy node list: {}').format(
                       ','.join(_unhealth_node))
            term.fatal(msg)
            raise exceptions.TiOPSRuntimeError(msg, operation='scaleIn')

        _current_pd_num = len(self._pd_status)
        _current_tikv_num = len(self._tikv_stores)
        # never allow deleting every pd or every tikv node
        if 'pd_servers' in self._diff and len(
                self._diff['pd_servers']) == _current_pd_num:
            term.fatal('Can not delete all pd node.')
            exit(1)
        if 'tikv_servers' in self._diff and len(
                self._diff['tikv_servers']) == _current_tikv_num:
            term.fatal('Can not delete all tikv node.')
            exit(1)

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        # remove services in reverse start order
        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service, self._diff)
            if not component and not pattern:
                continue
            uuid = [x['uuid'] for x in self._diff[pattern]]
            term.normal('Delete {}, node list: {}'.format(
                component, ','.join(uuid)))
            for _uuid in uuid:
                self.__delete_component(self._diff, component, pattern,
                                        _uuid)
                # tikv/pump/drainer only go offline here; they are stopped
                # and destroyed later, once tombstone (see check_tombstone)
                if component not in ['tikv', 'pump', 'drainer']:
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.destroy_component(component=component,
                                               pattern=pattern,
                                               node=_uuid)
                    if component != 'blackbox_exporter':
                        self.topology.replace(
                            self.topology.remove(_uuid)[0])

    def _post(self, component=None, pattern=None, node=None, role=None):
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology(),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        # a pd membership change affects every component's config, so
        # redeploy (regenerate config for) all of them
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer',
                                 pattern='drainer_servers')
        term.notice('Finished scaling in.')

    def __delete_component(self, config=None, component=None, pattern=None,
                           uuid=None):
        """Remove one node from the running cluster through its API.

        :param config: topology diff containing the node being removed
        :param component: component name ('pd', 'tikv', 'pump', 'drainer')
        :param pattern: server-group key in the topology (e.g. 'tikv_servers')
        :param uuid: uuid of the node to delete
        """
        if component == 'pd':
            try:
                self._cluster.del_pd(uuid)
            except exceptions.TiOPSException as e:
                term.fatal(
                    'Unable to delete PD node from cluster: {}'.format(e))
                exit(1)
        if component == 'tikv':
            _tikv_info = ''
            for _tikv_node in config[pattern]:
                if _tikv_node['uuid'] != uuid:
                    continue
                # already marked offline: deletion is in progress, skip
                if _tikv_node['offline']:
                    return
                _tikv_info = _tikv_node
            for ctikv in self._tikv_stores['stores']:
                # check if node in cluster
                if '{}:{}'.format(
                        _tikv_info['ip'],
                        _tikv_info['port']) == ctikv['store']['address']:
                    _store_id = ctikv['store']['id']
                    # delete store through api
                    try:
                        self._cluster.del_store(_store_id)
                    except exceptions.TiOPSException as e:
                        term.fatal('Unable to delete store: {}'.format(e))
                        exit(1)
        if component == 'drainer':
            _binlog = modules.BinlogAPI(topology=self.topology)
            _binlog.delete_drainer(node_id=uuid)
        if component == 'pump':
            _binlog = modules.BinlogAPI(topology=self.topology)
            _binlog.delete_pump(node_id=uuid)
class OprDestroy(OperationBase):
    """Destroy a TiDB cluster: stop every service and wipe its files.

    Requires explicit interactive confirmation, since the operation is
    irreversible.
    """

    def __init__(self, args=None, topology=None):
        super(OprDestroy, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        # irreversible operation — make the operator confirm explicitly
        term.warn('The TiDB cluster {} ({}) is going to be destroyed.'.format(
            self.topology.cluster_name, self.topology.version))
        confirm_prompt = ''.join([
            'This operation will ',
            term.warn_red('remove'),
            ' the TiDB cluster ',
            term.highlight_red(self.topology.cluster_name),
            '. It can NOT be undone. ',
            term.yes_no(),
            ':',
        ])
        answer = term.input(confirm_prompt)
        if answer.lower() not in ['y', 'yes']:
            term.notice('Terminate the destroy operation.')
            raise exceptions.TiOPSRuntimeError('Operation cancelled by user.')

    def _process(self, component=None, pattern=None, node=None, role=None):
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        # Pass 1: stop every component, walking the service groups in
        # reverse start order; warnings are logged and ignored.
        term.info('Stopping TiDB cluster.')
        for service in reversed(self.topology.service_group):
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            try:
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=node)
            except exceptions.TiOPSWarning as e:
                term.debug(str(e))

        # Pass 2: destroy every component, same order and same
        # best-effort handling of warnings.
        for service in reversed(self.topology.service_group):
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            term.normal('{} is being destroyed.'.format(component))
            try:
                self.act.destroy_component(component=component,
                                           pattern=pattern,
                                           node=node)
            except exceptions.TiOPSWarning as e:
                term.debug(str(e))

        # remove deploy dir
        self.ans.run_model('shell',
                           'rm -rf {{ full_deploy_dir | cluster_dir }}',
                           become=True,
                           group='*')
        self.ans.run_model('shell',
                           'rm -rf {{ full_data_dir | cluster_dir }}',
                           become=True,
                           group='*')

    def _post(self, component=None, pattern=None, node=None, role=None):
        # best-effort cleanup of the local profile directory
        try:
            utils.remove_dir(utils.profile_path(self.topology.cluster_dir))
        except Exception as e:
            logging.warning(e)
        term.notice('TiDB cluster destroyed.')