def check_systemd_version(self, info=None):
    # info: systemd info callback by ansible
    _lower_version = []
    for _host, _vars in info['success'].items():
        for systemd_info in _vars['results']:
            # get systemd version
            if systemd_info['yumstate'] == 'installed':
                _version = '{}-{}'.format(systemd_info['version'],
                                          systemd_info['release'])
                # record any version lower than 219-52.el7 (note: this is a
                # plain string comparison, which only works while version and
                # release fields have the same width; see the sketch below
                # for a numeric alternative)
                if _version < '219-52.el7':
                    _lower_version.append([_host, _version])
    if _lower_version:
        term.warn(
            'Some machines\' systemd version is lower than "219-52.el7".')
        _length = max(max([len(str(x[0])) for x in _lower_version]),
                      len('IP'))
        term.normal('IP'.ljust(_length + 2) + 'Systemd_Version')
        for _node in _lower_version:
            term.normal('{}{}'.format(_node[0].ljust(_length + 2),
                                      _node[1]))
        term.warn(
            'There are known memory bugs in systemd versions lower than "219-52.el7". '
            'Refer to https://access.redhat.com/discussions/3536621.')
        if not utils.ticontinue():
            exit(1)
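
# A minimal sketch (not part of tiops) of a numeric version comparison that
# avoids the lexicographic pitfall above: as plain strings,
# '219-6.el7' < '219-52.el7' is False. The helper name `_rpm_version_tuple`
# is hypothetical.
import re

def _rpm_version_tuple(version):
    # '219-52.el7' -> (219, 52, 7); tuples compare numerically left to right
    return tuple(int(x) for x in re.findall(r'\d+', version))

# usage: _rpm_version_tuple('219-6.el7') < _rpm_version_tuple('219-52.el7') -> True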
def _process(self, component=None, pattern=None, node=None, role=None):
    if node:
        term.notice('Start specified node in cluster.')
    elif role:
        term.notice('Start specified role in cluster.')
    else:
        term.notice('Start TiDB cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)
    if not self.demo:
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service, config=_topology)
        if not component and not pattern:
            continue
        if not node:
            term.normal('Starting {}.'.format(component))
            self.act.start_component(component, pattern)
        else:
            _uuid = [x['uuid'] for x in _topology[pattern]]
            term.normal('Starting {}, node list: {}.'.format(
                component, ','.join(_uuid)))
            self.act.start_component(component, pattern, ','.join(_uuid))
def _process(self, component=None, pattern=None, node=None, role=None):
    # create directories
    term.info('Create directory in all nodes.')
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service,
                                              config=self.topology())
        if not component and not pattern:
            continue
        self.act.create_directory(component=component, pattern=pattern)
    if not self.demo:
        self.act.check_machine_config()
    # start deploying
    if self.demo:
        term.warn(
            'FirewallD is being disabled on deployment machines in quick deploy mode.'
        )
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service,
                                              config=self.topology())
        if not component and not pattern:
            continue
        term.normal('Deploy {}.'.format(component))
        self.act.deploy_component(component=component, pattern=pattern)
        self.act.deploy_firewall(component=component, pattern=pattern)
    if not self.demo:
        self.act.deploy_tool()
def check_hostname(self, facts=None):
    # facts is ansible callback
    _hostname_list = {}
    for _host, _vars in facts['success'].items():
        _hostname = _vars['ansible_facts']['ansible_hostname']
        if _hostname in _hostname_list:
            _hostname_list[_hostname].append(_host)
        else:
            _hostname_list[_hostname] = [_host]
    # check for conflicting hostnames between different hosts
    _cache_hostname_list = copy.deepcopy(_hostname_list)
    for _host_name, _ip in _hostname_list.items():
        if len(_ip) == 1:
            del _cache_hostname_list[_host_name]
    if _cache_hostname_list:
        term.warn('Some machines\' hostnames conflict.')
        _length = max(
            max([len(str(x)) for x in _cache_hostname_list.keys()]),
            len('Hostname'))
        term.normal('Hostname'.ljust(_length + 2) + 'Hosts')
        for _hostname, _hosts in _cache_hostname_list.items():
            term.normal('{}{}'.format(_hostname.ljust(_length + 2),
                                      ', '.join(_hosts)))
        if not utils.ticontinue():
            exit(1)
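
# A minimal sketch (not part of tiops) of the same conflict detection using
# collections.defaultdict, which avoids the copy.deepcopy round trip above.
# The function name `find_hostname_conflicts` is hypothetical.
from collections import defaultdict

def find_hostname_conflicts(facts):
    groups = defaultdict(list)
    for host, host_vars in facts['success'].items():
        groups[host_vars['ansible_facts']['ansible_hostname']].append(host)
    # keep only hostnames claimed by more than one host
    return {name: hosts for name, hosts in groups.items() if len(hosts) > 1}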
def _prepare(self, component=None, pattern=None, node=None, role=None):
    try:
        self.cmd = ' '.join(self._args.cmd)
    except AttributeError:
        raise exceptions.TiOPSArgumentError(
            'No command specified, do nothing.')
    term.notice('Run raw shell command on {} cluster.'.format(
        self.topology.cluster_name))
    term.normal('{}'.format(self.cmd))
def _process(self, component=None, pattern=None, node=None, role=None):
    _unhealth_node = []
    for _pd_node in self._cluster.status():
        if not _pd_node['health']:
            _unhealth_node.append(_pd_node['name'])
    # abort only when unhealthy PD nodes were actually found
    if _unhealth_node:
        msg = 'Some PD nodes are unhealthy, maybe the server is stopped or ' \
              'the network is unreachable, unhealthy node list: {}'.format(
                  ','.join(_unhealth_node))
        term.fatal(msg)
        raise exceptions.TiOPSRuntimeError(msg, operation='scaleIn')
    _current_pd_num = len(self._pd_status)
    _current_tikv_num = len(self._tikv_stores)
    if 'pd_servers' in self._diff and len(
            self._diff['pd_servers']) == _current_pd_num:
        term.fatal('Can not delete all PD nodes.')
        exit(1)
    if 'tikv_servers' in self._diff and len(
            self._diff['tikv_servers']) == _current_tikv_num:
        term.fatal('Can not delete all TiKV nodes.')
        exit(1)
    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    for service in self.topology.service_group[::-1]:
        component, pattern = self.check_exist(service, self._diff)
        if not component and not pattern:
            continue
        uuid = [x['uuid'] for x in self._diff[pattern]]
        term.normal('Delete {}, node list: {}'.format(
            component, ','.join(uuid)))
        for _uuid in uuid:
            self.__delete_component(self._diff, component, pattern, _uuid)
            if component not in ['tikv', 'pump', 'drainer']:
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.destroy_component(component=component,
                                           pattern=pattern,
                                           node=_uuid)
            if component != 'blackbox_exporter':
                self.topology.replace(self.topology.remove(_uuid)[0])
def check_os_version(self, facts=None):
    _lower_version = []
    for _host, _vars in facts['success'].items():
        # get system version
        _sysversion = str(
            _vars['ansible_facts']['ansible_distribution_version'])
        # string comparison is safe here only while major versions stay
        # single-digit (e.g. '6.5' < '7')
        if _sysversion < '7':
            _lower_version.append([_host, _sysversion])
    if _lower_version:
        term.fatal('Some machines\' OS version is not supported.')
        _length = max(max([len(str(x[0])) for x in _lower_version]),
                      len('IP'))
        term.normal('IP'.ljust(_length + 2) + 'OS_Version')
        for _node in _lower_version:
            term.normal('{}{}'.format(_node[0].ljust(_length + 2),
                                      _node[1]))
        exit(1)
def _process(self, component=None, pattern=None, node=None, role=None):
    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    term.info('Stopping TiDB cluster.')
    for service in self.topology.service_group[::-1]:
        component, pattern = self.check_exist(service,
                                              config=self.topology())
        if not component and not pattern:
            continue
        try:
            self.act.stop_component(component=component,
                                    pattern=pattern,
                                    node=node)
        except exceptions.TiOPSWarning as e:
            term.debug(str(e))
    for service in self.topology.service_group[::-1]:
        component, pattern = self.check_exist(service,
                                              config=self.topology())
        if not component and not pattern:
            continue
        term.normal('{} is being destroyed.'.format(component))
        try:
            self.act.destroy_component(component=component,
                                       pattern=pattern,
                                       node=node)
        except exceptions.TiOPSWarning as e:
            term.debug(str(e))
    # remove deploy and data directories
    self.ans.run_model('shell',
                       'rm -rf {{ full_deploy_dir | cluster_dir }}',
                       become=True,
                       group='*')
    self.ans.run_model('shell',
                       'rm -rf {{ full_data_dir | cluster_dir }}',
                       become=True,
                       group='*')
def check_os_platform(self, facts=None):
    _unsupport_os = []
    # get operating system platform
    for _host, _vars in facts['success'].items():
        _platform = _vars['ansible_facts']['ansible_os_family']
        if 'redhat' == _platform.lower():
            continue
        _unsupport_os.append([_host, _platform])
    if _unsupport_os:
        term.fatal(
            'Some machines\' OS is not supported, please use RedHat / CentOS.'
        )
        _length = max(max([len(str(x[0])) for x in _unsupport_os]),
                      len('IP'))
        term.normal('IP'.ljust(_length + 2) + 'OS_Family')
        for _node in _unsupport_os:
            term.normal('{}{}'.format(_node[0].ljust(_length + 2),
                                      _node[1]))
        exit(1)
def init(self, demo=False):
    term.notice('Start init management machine.')
    key_home = utils.profile_path('.ssh')
    if not os.path.exists(key_home):
        utils.create_dir(key_home)
        os.chmod(key_home, 0o700)
    if not os.path.isfile(os.path.join(key_home, 'id_rsa')) or \
            not os.path.isfile(os.path.join(key_home, 'id_rsa.pub')):
        term.info('There is no SSH key for {}. Start generating.'.format(
            getpass.getuser()))
        os.system(
            '/usr/bin/ssh-keygen -t rsa -N \'\' -f {}/id_rsa -q'.format(
                key_home))
    else:
        term.normal('SSH key for {} already exists, skip creating.'.format(
            getpass.getuser()))
    if demo:
        term.notice('Finished init management machine.')
    else:
        term.notice('Done!!!')
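
# A minimal sketch (not part of tiops) of the same key generation via
# subprocess.check_call, which avoids shell string interpolation and raises
# on a non-zero exit instead of failing silently like os.system. The
# function name `generate_ssh_key` is hypothetical.
import os
import subprocess

def generate_ssh_key(key_home):
    subprocess.check_call([
        '/usr/bin/ssh-keygen', '-t', 'rsa', '-N', '', '-f',
        os.path.join(key_home, 'id_rsa'), '-q'
    ])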
def _check_config(self):
    _servers = [
        {'pd': 'pd_servers'},
        {'tikv': 'tikv_servers'},
        {'tidb': 'tidb_servers'},
    ]
    for _service in _servers:
        _component, _pattern = self.check_exist(_service,
                                                config=self.topology())
        if not _component and not _pattern:
            continue
        term.normal('Check {} configuration.'.format(_component))
        self.act.configCheck(component=_component,
                             pattern=_pattern,
                             node=self.topology()[_pattern][0]['uuid'])
def _process(self, component=None, pattern=None, node=None, role=None):
    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    self.act.edit_file()
    try:
        term.info('Create directory in all added nodes.')
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, self._diff)
            if not component and not pattern:
                continue
            uuid = [x['uuid'] for x in self._diff[pattern]]
            self.act.create_directory(component=component,
                                      pattern=pattern,
                                      node=','.join(uuid))
        # check machine cpu / memory / disk
        self.act.check_machine_config(self._diff)
        # start scale-out
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, self._diff)
            if not component and not pattern:
                continue
            uuid = [x['uuid'] for x in self._diff[pattern]]
            term.normal('Add {}, node list: {}.'.format(
                component, ','.join(uuid)))
            _template_dir = self.topology.cache_template_dir
            self.act.deploy_component(component=component,
                                      pattern=pattern,
                                      node=','.join(uuid),
                                      template_dir=_template_dir)
            self.act.deploy_firewall(component=component,
                                     pattern=pattern,
                                     node=','.join(uuid))
            self.act.start_component(component=component,
                                     pattern=pattern,
                                     node=','.join(uuid))
    finally:
        # clean up the cached template directory
        shutil.rmtree(self.topology.cache_template_dir, ignore_errors=True)
def _process(self, component=None, pattern=None, node=None, role=None):
    if node:
        term.notice('Reload specified node in cluster.')
    elif role:
        term.notice('Reload specified role in cluster.')
    else:
        term.notice('Reload TiDB cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)
    _cluster = modules.ClusterAPI(topology=self.topology)
    _unhealth_node = []
    for _pd_node in _cluster.status():
        if not _pd_node['health']:
            _unhealth_node.append(_pd_node['name'])
    # abort only when unhealthy PD nodes were actually found
    if _unhealth_node:
        msg = 'Some PD nodes are unhealthy, maybe the server is stopped or ' \
              'the network is unreachable, unhealthy node list: {}'.format(
                  ','.join(_unhealth_node))
        term.fatal(msg)
        raise exceptions.TiOPSRuntimeError(msg, operation='reload')
    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    # each service group should only contain one item
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service=service,
                                              config=_topology)
        if not component and not pattern:
            continue
        # when reloading pd servers, reload the leader node last
        if component == 'pd':
            _leader = None
            _pd_list = []
            for _node in _topology[pattern]:
                if _node['uuid'] == _cluster.pd_leader():
                    _leader = _node
                else:
                    _pd_list.append(_node)
            if _leader:
                _pd_list.append(_leader)
            for _node in _pd_list:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Reload {}, node id: {}.'.format(
                    component, _uuid))
                if _uuid == _cluster.pd_leader():
                    _cluster.evict_pd_leader(uuid=_uuid)
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=_uuid)
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=_uuid)
            continue
        if pattern in [
                'monitored_servers', 'monitoring_server',
                'grafana_server', 'alertmanager_server'
        ]:
            if not node:
                term.normal('Reload {}.'.format(component))
                self.act.deploy_component(component=component,
                                          pattern=pattern)
                self.act.stop_component(component=component,
                                        pattern=pattern)
                self.act.start_component(component=component,
                                         pattern=pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Reload {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=','.join(_uuid))
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=','.join(_uuid))
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=','.join(_uuid))
            continue
        for _node in _topology[pattern]:
            _uuid = _node['uuid']
            _host = _node['ip']
            term.normal('Reload {}, node id: {}.'.format(component, _uuid))
            # for tikv, migrate region leaders away before restarting
            if pattern == 'tikv_servers':
                _port = _node['port']
                _cluster.evict_store_leaders(host=_host, port=_port)
            self.act.deploy_component(component=component,
                                      pattern=pattern,
                                      node=_uuid)
            self.act.stop_component(component=component,
                                    pattern=pattern,
                                    node=_uuid)
            self.act.start_component(component=component,
                                     pattern=pattern,
                                     node=_uuid)
            if pattern == 'tikv_servers':
                _cluster.remove_evict(host=_host, port=_port)
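
# A minimal sketch (not part of tiops) of the leader-last ordering used for
# the PD reload above: restarting followers first limits leader churn to a
# single transfer at the end. `nodes` and `leader_uuid` are hypothetical
# inputs standing in for the topology entries and _cluster.pd_leader().
def leader_last(nodes, leader_uuid):
    followers = [n for n in nodes if n['uuid'] != leader_uuid]
    leader = [n for n in nodes if n['uuid'] == leader_uuid]
    return followers + leader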