def __init__(self, ip, username, password, client):
    # type: (str, str, str, SSHClient) -> None
    """
    Initialize an IPMIController
    :param ip: IP of the host to control through IPMI
    :type ip: str
    :param username: IPMI username of the host to control through IPMI
    :type username: str
    :param password: IPMI password of the host to control through IPMI
    :type password: str
    :param client: SSHClient to perform all IPMI commands on
    :type client: SSHClient
    """
    actual_params = {'ip': ip,
                     'username': username,
                     'password': password,
                     'client': client}
    required_params = {'ip': (str, ExtensionsToolbox.regex_ip, True),
                       'username': (str, None, True),
                       'password': (str, None, True),
                       'client': (SSHClient, None, True)}
    ExtensionsToolbox.verify_required_params(actual_params=actual_params, required_params=required_params)
    self.ip = ip
    self.username = username
    # Assign the password BEFORE building the command: the original built
    # self._basic_command from self._pwd before self._pwd existed, which
    # raised AttributeError on every construction.
    self._pwd = password
    self._client = client
    self._basic_command = ['ipmi-power', '-h', self.ip, '-u', self.username, '-p', self._pwd]
def create_blktap_device(client, diskname, edge_info, logger=LOGGER):
    """
    Creates a blk tap device from a vdisk
    :param client: client to execute the tap-ctl command on
    :param diskname: name of the vdisk to expose
    :param edge_info: edge connection details (ip, port, protocol and optional username/password)
    :param logger: logger instance to use
    :return: blktap device location
    """
    edge_requirements = {'port': (int, {'min': 1, 'max': 65535}),
                         'protocol': (str, ['tcp', 'udp', 'rdma']),
                         'ip': (str, Toolbox.regex_ip),
                         'username': (str, None, False),
                         'password': (str, None, False)}
    ExtensionsToolbox.verify_required_params(edge_requirements, edge_info)
    username = edge_info.get('username')
    password = edge_info.get('password')
    # Credentials are only embedded in the connection string when both are supplied
    if username and password:
        ovs_edge_connection = "openvstorage+{0}:{1}:{2}/{3}:username={4}:password={5}".format(edge_info['protocol'], edge_info['ip'], edge_info['port'], diskname, username, password)
    else:
        ovs_edge_connection = "openvstorage+{0}:{1}:{2}/{3}".format(edge_info['protocol'], edge_info['ip'], edge_info['port'], diskname)
    cmd = ["tap-ctl", "create", "-a", ovs_edge_connection]
    logger.debug('Creating blktap device: {}'.format(' '.join(cmd)))
    return client.run(cmd)
def create_hprm_config_files(self, local_storagerouter, storagerouter, parameters):
    """
    DEPRECATED API CALL - USE /vpool/vpool_guid/create_hprm_config_files instead
    Create the required configuration files to be able to make use of HPRM (aka PRACC)
    These configuration will be zipped and made available for download
    :param local_storagerouter: StorageRouter this call is executed on
    :type local_storagerouter: ovs.dal.hybrids.storagerouter.StorageRouter
    :param storagerouter: The StorageRouter for which a HPRM manager needs to be deployed
    :type storagerouter: ovs.dal.hybrids.storagerouter.StorageRouter
    :param parameters: Additional information required for the HPRM configuration files
    :type parameters: dict
    :return: Asynchronous result of a CeleryTask
    :rtype: celery.result.AsyncResult
    """
    _ = storagerouter  # Kept for API compatibility; unused by the deprecated implementation
    ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                            required_params={'vpool_guid': (str, ExtensionsToolbox.regex_guid)})
    vpool_guid = parameters['vpool_guid']
    return VPoolController.create_hprm_config_files.delay(parameters=parameters,
                                                          vpool_guid=vpool_guid,
                                                          local_storagerouter_guid=local_storagerouter.guid)
def add_domain_to_sr(self, storagerouter_ip, name, recovery=False):
    """
    Add domains, present in the model, to a storage router.
    :param storagerouter_ip: ip of the storage router
    :type storagerouter_ip: str
    :param name: name of the domain to add to the storagerouter
    :type name: str
    :param recovery: true or false whether the domain is a recovery domain or not
    :type recovery: bool
    """
    self._valid_storagerouter(storagerouter_ip)
    ExtensionsToolbox.verify_required_params(required_params={'name': (str, None, True)},
                                             actual_params={'name': name},
                                             verify_keys=True)
    if name not in self._domains:
        raise ValueError('Invalid domain passed: {0}'.format(name))
    sr_config = self.config['setup']['storagerouters'][storagerouter_ip]
    domains = sr_config.setdefault('domains', {})
    config_key = 'domain_guids' if recovery is False else 'recovery_domain_guids'
    domains.setdefault(config_key, []).append(name)
def convert_image(client, image_location, diskname, edge_info, logger=LOGGER):
    """
    Converts an image file with qemu over edge connection
    :param client: client to execute the qemu-img command on
    :param image_location: path of the source image to convert
    :param diskname: name of the target vdisk
    :param edge_info: edge connection details (ip, port, protocol and optional username/password)
    :param logger: logger instance to use
    :return: None
    """
    edge_requirements = {'port': (int, {'min': 1, 'max': 65535}),
                         'protocol': (str, ['tcp', 'udp', 'rdma']),
                         'ip': (str, Toolbox.regex_ip),
                         'username': (str, None, False),
                         'password': (str, None, False)}
    ExtensionsToolbox.verify_required_params(edge_requirements, edge_info)
    username = edge_info.get('username')
    password = edge_info.get('password')
    # Credentials are only embedded in the connection string when both are supplied
    if username and password:
        ovs_edge_connection = "openvstorage+{0}:{1}:{2}/{3}:username={4}:password={5}".format(edge_info['protocol'], edge_info['ip'], edge_info['port'], diskname, username, password)
    else:
        ovs_edge_connection = "openvstorage+{0}:{1}:{2}/{3}".format(edge_info['protocol'], edge_info['ip'], edge_info['port'], diskname)
    cmd = ["qemu-img", "convert", image_location, ovs_edge_connection]
    logger.debug('Converting an image with qemu using: {}'.format(' '.join(cmd)))
    client.run(cmd)
def add_disk_to_sr(self, storagerouter_ip, name, roles):
    """
    Add disk with given name and roles to a storagerouter in the model.
    :param storagerouter_ip: ip of the storagerouter
    :type storagerouter_ip: str
    :param name: name of the disk
    :type name: str
    :param roles: roles to assign to the disk
    :type roles: list
    """
    self._valid_storagerouter(storagerouter_ip)
    ExtensionsToolbox.verify_required_params(required_params={'name': (str, None, True),
                                                              'roles': (list, None, True)},
                                             actual_params={'name': name,
                                                            'roles': roles},
                                             verify_keys=True)
    for role in roles:
        if role not in DiskPartition.ROLES:
            raise ValueError('Provided role {0} is not an allowed role for disk {1}.'.format(role, name))
    sr_config = self.config['setup']['storagerouters'][storagerouter_ip]
    if 'disks' not in sr_config:
        sr_config['disks'] = {}
    sr_config['disks'].update({name: {'roles': roles}})
def add_storagerouter(self, storagerouter_ip, hostname):
    """
    Add a storagerouter to the model given the provided ip and hostname.
    :param storagerouter_ip: ip address of the storage router
    :type storagerouter_ip: str
    :param hostname: hostname of the storagerouter
    :type hostname: str
    :raises ValueError: when a storagerouter with the given ip is already modelled
    """
    self._validate_ip(storagerouter_ip)
    ExtensionsToolbox.verify_required_params(required_params={'hostname': (str, None, True)},
                                             actual_params={'hostname': hostname},
                                             verify_keys=True)
    # Lazily create the nested structure before inserting the new entry
    setup = self.config.setdefault('setup', {})
    storagerouters = setup.setdefault('storagerouters', {})
    if storagerouter_ip in storagerouters:
        raise ValueError('Storagerouter with given ip {0} already defined.'.format(storagerouter_ip))
    storagerouters[storagerouter_ip] = {'hostname': hostname}
def change_config(vpool_name, vpool_details, storagerouter_ip, *args, **kwargs):
    """
    Update the volumedriver (storagedriver) configuration of a vPool on a specific storagerouter.
    :param vpool_name: name of the vPool whose storagedriver must be reconfigured
    :param vpool_details: vPool settings; only the optional 'storagedriver' sub-dict is applied
    :param storagerouter_ip: ip of the storagerouter hosting the storagedriver
    :raises RuntimeError: when no storagedriver of the vPool lives on the given storagerouter
    """
    # Settings volumedriver
    storagedriver_config = vpool_details.get('storagedriver')
    if storagedriver_config is None:
        return
    ExtensionsToolbox.verify_required_params(StoragedriverSetup.STORAGEDRIVER_PARAMS, storagedriver_config)
    StoragedriverSetup.LOGGER.info('Updating volumedriver configuration of vPool `{0}` on storagerouter `{1}`.'.format(vpool_name, storagerouter_ip))
    vpool = VPoolHelper.get_vpool_by_name(vpool_name)
    # Filter first and check for emptiness BEFORE indexing: the original applied [0]
    # immediately, so an absent storagedriver raised IndexError and the intended
    # RuntimeError below was unreachable.
    matching_storagedrivers = [sd for sd in vpool.storagedrivers if sd.storagerouter.ip == storagerouter_ip]
    if not matching_storagedrivers:
        error_msg = 'Unable to find the storagedriver of vPool {0} on storagerouter {1}'.format(vpool_name, storagerouter_ip)
        raise RuntimeError(error_msg)
    storagedriver = matching_storagedrivers[0]
    StoragedriverHelper.change_config(storagedriver, storagedriver_config)
    vpool.invalidate_dynamics('configuration')
    StoragedriverSetup.LOGGER.info('Updating volumedriver config of vPool `{0}` should have succeeded on storagerouter `{1}`'.format(vpool_name, storagerouter_ip))
def get_instance(cls, connection_info, cache_store=None, version=6):
    """
    Retrieve an OVSClient instance to the connection information passed
    :param connection_info: Connection information, includes: 'host', 'port', 'client_id', 'client_secret'
    :type connection_info: dict
    :param cache_store: Store in which to keep the generated token for the client
    :type cache_store: object
    :param version: Version for the API
    :type version: int
    :return: An instance of the OVSClient class
    :rtype: ovs_extensions.api.client.OVSClient
    """
    requirements = {'host': (str, ExtensionsToolbox.regex_ip),
                    'port': (int, {'min': 1, 'max': 65535}),
                    'client_id': (str, None),
                    'client_secret': (str, None),
                    'local': (bool, None, False)}
    ExtensionsToolbox.verify_required_params(actual_params=connection_info, required_params=requirements)
    credentials = (connection_info['client_id'], connection_info['client_secret'])
    return cls(ip=connection_info['host'],
               port=connection_info['port'],
               credentials=credentials,
               version=version,
               cache_store=cache_store)
def validate(self, storagerouter=None, storagedriver=None):
    """
    Perform some validations before creating or extending a vPool
    :param storagerouter: StorageRouter on which the vPool will be created or extended
    :type storagerouter: ovs.dal.hybrids.storagerouter.StorageRouter
    :param storagedriver: When passing a StorageDriver, perform validations when shrinking a vPool
    :type storagedriver: ovs.dal.hybrids.storagedriver.StorageDriver
    :raises ValueError: If extending a vPool which status is not RUNNING
    RuntimeError: If this vPool's configuration does not meet the requirements
                  If the vPool has already been extended on the specified StorageRouter
    :return: None
    :rtype: NoneType
    """
    # All checks only apply when a vPool already exists (extend/shrink scenario)
    if self.vpool is not None:
        if self.vpool.status != VPool.STATUSES.RUNNING:
            raise ValueError('vPool should be in {0} status'.format(VPool.STATUSES.RUNNING))
        # Validate the existing vPool configuration against the storagedriver client maps
        ExtensionsToolbox.verify_required_params(
            actual_params=self.vpool.configuration,
            required_params={'sco_size': (int, StorageDriverClient.TLOG_MULTIPLIER_MAP.keys()),
                             'dtl_mode': (str, StorageDriverClient.VPOOL_DTL_MODE_MAP.keys()),
                             'write_buffer': (float, None),
                             'dtl_transport': (str, StorageDriverClient.VPOOL_DTL_TRANSPORT_MAP.keys()),
                             'tlog_multiplier': (int, StorageDriverClient.TLOG_MULTIPLIER_MAP.values())})
        if storagerouter is not None:
            # Extending: the vPool may not already have a StorageDriver on this StorageRouter
            for vpool_storagedriver in self.vpool.storagedrivers:
                if vpool_storagedriver.storagerouter_guid == storagerouter.guid:
                    raise RuntimeError('A StorageDriver is already linked to this StorageRouter for vPool {0}'.format(self.vpool.name))
        if storagedriver is not None:
            # Shrinking: the StorageDriver to remove may not serve any vDisks anymore
            VDiskController.sync_with_reality(vpool_guid=self.vpool.guid)
            storagedriver.invalidate_dynamics('vdisks_guids')
            if len(storagedriver.vdisks_guids) > 0:
                raise RuntimeError('There are still vDisks served from the given StorageDriver')
            # Collect the MDS services hosted on the same StorageRouter as the StorageDriver
            self.mds_services = [mds_service for mds_service in self.vpool.mds_services
                                 if mds_service.service.storagerouter_guid == storagedriver.storagerouter_guid]
            for mds_service in self.mds_services:
                # Every MDS service must have a linked StorageDriver partition to be removable
                if len(mds_service.storagedriver_partitions) == 0 or mds_service.storagedriver_partitions[0].storagedriver is None:
                    raise RuntimeError('Failed to retrieve the linked StorageDriver to this MDS Service {0}'.format(mds_service.service.name))
def __getattr__(self, item):
    """
    Dynamically expose 'configure_<section>' and 'clear_<section>' helpers.
    'configure_x(**kwargs)' forwards to self._add('x', **kwargs);
    'clear_x()' forwards to self._delete('x').
    :param item: Attribute name being looked up
    :type item: str
    :return: A callable wrapping the matching _add/_delete operation
    :raises AttributeError: For any name not matching the two supported prefixes
    """
    from ovs_extensions.generic.toolbox import ExtensionsToolbox
    if item.startswith('configure_'):
        section = ExtensionsToolbox.remove_prefix(item, 'configure_')
        return lambda **kwargs: self._add(section, **kwargs)
    if item.startswith('clear_'):
        section = ExtensionsToolbox.remove_prefix(item, 'clear_')
        return lambda: self._delete(section)
    # The original fell through and implicitly returned None, which violates the
    # __getattr__ protocol and silently breaks hasattr()/getattr() for any other name.
    raise AttributeError(item)
def update_storagedriver_of_vpool(self, sr_ip, vpool_name, sr_params=None):
    """
    Update all or some data of a storagedriver, assigned to a vpool on a specific storagerouter.
    :param sr_ip: ip of the storagerouter on which the vpool is located
    :type sr_ip: str
    :param vpool_name: name of the vpool of which to update the storagedriver data
    :type vpool_name: str
    :param sr_params: parameters to update of the referenced storagedriver
    :type sr_params: dict
    :raises ValueError: when sr_params is not a dict
    :raises KeyError: when the storagerouter or vpool is not modelled
    """
    required_params = {'sco_size': (int, StorageDriverClient.TLOG_MULTIPLIER_MAP.keys()),
                       'cluster_size': (int, StorageDriverClient.CLUSTER_SIZES),
                       'volume_write_buffer': (int, {'min': 128, 'max': 10240}, False),
                       'global_read_buffer': (int, {'min': 128, 'max': 10240}, False),
                       'strategy': (str, None, False),
                       'deduplication': (str, None, False),
                       'dtl_transport': (str, StorageDriverClient.VPOOL_DTL_TRANSPORT_MAP.keys()),
                       'dtl_mode': (str, StorageDriverClient.VPOOL_DTL_MODE_MAP.keys())}
    default_params = {'sco_size': 4,
                      'cluster_size': 4,
                      'volume_write_buffer': 512,
                      'strategy': 'none',
                      'global_write_buffer': 128,
                      'global_read_buffer': 128,
                      'deduplication': 'non_dedupe',
                      'dtl_transport': 'tcp',
                      'dtl_mode': 'sync'}
    if sr_params is None:
        sr_params = {}
    # Validate the caller-supplied overrides BEFORE merging: the original checked the
    # merged default dict (always a dict), making the type check dead code and letting
    # a non-dict sr_params blow up inside update() instead.
    if not isinstance(sr_params, dict):
        raise ValueError('Parameters should be of type "dict"')
    default_params.update(sr_params)
    ExtensionsToolbox.verify_required_params(required_params, default_params)
    if sr_ip not in self.config['setup']['storagerouters'].keys():
        raise KeyError('Storagerouter with ip is not defined')
    if vpool_name not in self.config['setup']['storagerouters'][sr_ip]['vpools']:
        raise KeyError('Vpool with name {0} is not defined on storagerouter with ip {1}'.format(vpool_name, sr_ip))
    self.config['setup']['storagerouters'][sr_ip]['vpools'][vpool_name]['storagedriver'] = default_params
def _validate_ip(self, ip):
    """
    Validate that the given value is a well-formed ip address that responds to a ping.
    :param ip: ip address to validate
    :type ip: str
    :raises ValueError: when the ip is malformed or unreachable
    """
    try:
        ExtensionsToolbox.verify_required_params(required_params={'storagerouter_ip': (str, Toolbox.regex_ip, True)},
                                                 actual_params={'storagerouter_ip': ip},
                                                 verify_keys=True)
    except RuntimeError as e:
        # Re-raise as ValueError to keep a consistent exception type for callers
        raise ValueError(e)
    # A non-zero exit status means the single ping probe got no reply
    if os.system('ping -c 1 {0}'.format(ip)) != 0:
        raise ValueError('No response from ip {0}'.format(ip))
def change_cache(self, storagerouter_ip, vpool, block_cache=True, fragment_cache=True, on_read=True, on_write=True):
    """
    Change the caching parameters of a given vpool on a given storagerouter.
    By default, change parameters of both block chache and fragment cache.
    :param storagerouter_ip: search for vpool on this storagerouter
    :type storagerouter_ip: str
    :param vpool: change cache options of given vpool
    :type vpool: str
    :param block_cache: change block cache parameters, default True
    :type block_cache: bool
    :param fragment_cache: change fragment cache parameters, default True
    :type fragment_cache: bool
    :param on_read: change onread parameters, default True
    :type on_read: bool
    :param on_write: chance onwrite parameters, default True
    :type on_write: bool
    """
    self._valid_storagerouter(storagerouter_ip=storagerouter_ip)
    ExtensionsToolbox.verify_required_params(
        required_params={'vpool': (str, None, True),
                         'block_cache': (bool, None, False),
                         'fragment_cache': (bool, None, False),
                         'on_read': (bool, None, False),
                         'on_write': (bool, None, False)},
        actual_params={'vpool': vpool,
                       'block_cache': block_cache,
                       'fragment_cache': fragment_cache,
                       'on_read': on_read,
                       'on_write': on_write},
        verify_keys=True)
    try:
        vpool_config = self.config['setup']['storagerouters'][storagerouter_ip]['vpools'][vpool]
    except KeyError:
        raise ValueError('Vpool {0} not found'.format(vpool))
    # Apply the same on_read/on_write flags to every cache type that was selected
    selected_caches = []
    if block_cache is True:
        selected_caches.append('block_cache')
    if fragment_cache is True:
        selected_caches.append('fragment_cache')
    for cache_name in selected_caches:
        strategy = vpool_config[cache_name]['strategy']
        strategy['cache_on_read'] = on_read
        strategy['cache_on_write'] = on_write
def __init__(self, ip, user, password, type):
    """
    Initialize a hypervisor connection descriptor after validating its parameters.
    :param ip: ip address of the hypervisor
    :type ip: str
    :param user: username to authenticate with
    :type user: str
    :param password: password to authenticate with
    :type password: str
    :param type: hypervisor type, one of 'KVM' or 'VMWARE'
    :type type: str
    """
    ExtensionsToolbox.verify_required_params({'ip': (str, Toolbox.regex_ip),
                                              'user': (str, None),
                                              'password': (str, None),
                                              'type': (str, ['KVM', 'VMWARE'])},
                                             {'ip': ip,
                                              'user': user,
                                              'password': password,
                                              'type': type})
    self.ip = ip
    self.user = user
    self.password = password
    self.type = type
def monitor_services(self):
    # type: () -> None
    """
    Monitor the local services
    Polls systemd once per second and prints a two-section overview (running /
    non-running) of all services whose unit names match self._monitor_prefixes.
    The screen is only redrawn when the overview changed; runs until Ctrl+C.
    :return: None
    :rtype: NoneType
    """
    try:
        # One egrep filter per monitored prefix; chained with pipes in the command below
        grep = ['egrep "{0}"'.format(prefix) for prefix in self._monitor_prefixes]
        previous_output = None
        while True:
            # Gather service states
            running_services = {}
            non_running_services = {}
            longest_service_name = 0  # Used to align the state columns in the output
            for service_name in check_output('systemctl list-unit-files --full --type=service --no-legend --no-pager | {0} | tr -s " " | cut -d " " -f 1'.format(' | '.join(grep)), shell=True).splitlines():
                try:
                    service_state = check_output('systemctl is-active {0}'.format(service_name), shell=True).strip()
                except CalledProcessError as cpe:
                    # 'systemctl is-active' exits non-zero for non-active units;
                    # the state string is still available on its output
                    service_state = cpe.output.strip()
                service_name = service_name.replace('.service', '')
                if service_state == 'active':
                    service_pid = check_output('systemctl show {0} --property=MainPID'.format(service_name), shell=True).strip().split('=')[1]
                    running_services[service_name] = (service_state, service_pid)
                else:
                    non_running_services[service_name] = service_state
                if len(service_name) > longest_service_name:
                    longest_service_name = len(service_name)

            # Put service states in list
            output = ['Running processes',
                      '=================\n']
            for service_name in sorted(running_services, key=lambda service: ExtensionsToolbox.advanced_sort(service, '_')):
                output.append('{0} {1} {2} {3}'.format(service_name, ' ' * (longest_service_name - len(service_name)), running_services[service_name][0], running_services[service_name][1]))
            output.extend(['\n\nNon-running processes',
                           '=====================\n'])
            for service_name in sorted(non_running_services, key=lambda service: ExtensionsToolbox.advanced_sort(service, '_')):
                output.append('{0} {1} {2}'.format(service_name, ' ' * (longest_service_name - len(service_name)), non_running_services[service_name]))

            # Print service states (only if changes)
            if previous_output != output:
                # ANSI escape: clear screen and move cursor to the top-left corner
                print '\x1b[2J\x1b[H'
                for line in output:
                    print line
                previous_output = list(output)
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the intended way to stop monitoring; exit quietly
        pass
def regenerate_service(self, name, client, target_name):
    # type: (str, SSHClient, str) -> None
    """
    Regenerates the service files of a service.
    :param name: Template name of the service to regenerate
    :type name: str
    :param client: Client on which to regenerate the service
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param target_name: The current service name eg ovs-volumedriver_flash01.service
    :type target_name: str
    :return: None
    :rtype: NoneType
    :raises: RuntimeError if the regeneration failed
    """
    machine_id = self._system.get_my_machine_id(client)
    service_key = self.service_config_key.format(machine_id,
                                                 ExtensionsToolbox.remove_prefix(target_name, self.OVS_SERVICE_PREFIX))
    # A registered configuration entry proves the service file was made by add_service before
    if not self._configuration.exists(service_key):
        raise RuntimeError('Service {0} was not previously added and cannot be regenerated.'.format(target_name))
    # Rewrite the service file from the stored parameters
    service_params = self._configuration.get(service_key)
    dependency = service_params['STARTUP_DEPENDENCY']
    if dependency == '':
        dependency = None
    else:
        dependency = '.'.join(dependency.split('.')[:-1])  # Strip the trailing '.service' suffix
    regenerated = self.add_service(name=name,
                                   client=client,
                                   params=service_params,
                                   target_name=target_name,
                                   startup_dependency=dependency,
                                   delay_registration=True)
    if regenerated is None:
        raise RuntimeError('Regenerating files for service {0} has failed'.format(target_name))
def list(self, key, recursive=False):
    # type: (str, bool) -> Generator[str]
    """
    List all keys starting with specified key
    :param key: Key to list
    :type key: str
    :param recursive: List keys recursively
    :type recursive: bool
    :return: Generator with all keys
    :rtype: generator
    """
    key = self._clean_key(key)
    entries = []  # Remembers what was already yielded so duplicates are suppressed
    for entry in self._client.prefix(key):
        if entry.startswith('_'):
            # Keys starting with '_' are treated as internal and never listed
            continue
        if recursive is True:
            parts = entry.split('/')
            for index, part in enumerate(parts):
                if index == len(parts) - 1:  # Last part
                    yield entry  # Every entry is unique, so when having reached last part, we yield it
                else:
                    # Intermediate parts act as 'directories'; yield each one only once
                    dir_name = '{0}/'.format('/'.join(parts[:index + 1]))
                    if dir_name not in entries:
                        entries.append(dir_name)
                        yield dir_name
        else:
            # Non-recursive: only consider direct children of the requested key
            if key == '' or entry.startswith(key.rstrip('/') + '/'):
                # Strip the prefix and keep only the first remaining path segment
                cleaned = ExtensionsToolbox.remove_prefix(entry, key).strip('/').split('/')[0]
                if cleaned not in entries:
                    entries.append(cleaned)
                    yield cleaned
def check_arakoon_ports(cls, result_handler):
    """
    Verifies that the Arakoon clusters still respond to connections
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    arakoon_clusters = cls._get_arakoon_clusters(result_handler)
    result_handler.info('Starting Arakoon ports test.', add_to_result=False)
    # NOTE(review): the messages below mention 'collapsing statistics' although this
    # test gathers port connection results - presumably copied from the collapse
    # check; confirm before changing the user-facing wording
    result_handler.info('Retrieving all collapsing statistics. This might take a while', add_to_result=False)
    start = time.time()
    arakoon_stats = cls._get_port_connections(result_handler, arakoon_clusters)
    result_handler.info('Retrieving all collapsing statistics succeeded (duration: {0})'.format(time.time() - start), add_to_result=False)
    for cluster_type, clusters in arakoon_stats.iteritems():
        result_handler.info('Testing the collapse of {0} Arakoons'.format(cluster_type), add_to_result=False)
        for cluster in clusters:
            cluster_name = cluster['cluster_name']
            connection_result = cluster['connection_result']
            # Sort per-node results by ip so the report order is deterministic
            connection_result = OrderedDict(sorted(connection_result.items(),
                                                   key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
            for node, stats in connection_result.iteritems():
                identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                if len(stats['errors']) > 0:
                    # Determine where issues were found
                    for step, exception in stats['errors']:
                        if step == 'test_connection':
                            try:
                                # Raise the thrown exception
                                raise exception
                            except Exception:
                                message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                cls.logger.exception(message)
                                result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                    # Errors were reported for this node; skip the success/failure verdict
                    continue
                if stats['result'] is True:
                    result_handler.success('Connection established to {0}'.format(identifier_log),
                                           code=ErrorCodes.arakoon_connection_ok)
                else:
                    result_handler.failure('Connection could not be established to {0}'.format(identifier_log),
                                           code=ErrorCodes.arakoon_connection_failure)
def add_service(self, name, client, params=None, target_name=None, startup_dependency=None, delay_registration=False, path=None):
    # type: (str, SSHClient, dict, str, str, bool, str) -> dict
    """
    Add a service
    :param name: Template name of the service to add
    :type name: str
    :param client: Client on which to add the service
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param params: Additional information about the service
    :type params: dict or None
    :param target_name: Overrule default name of the service with this name
    :type target_name: str or None
    :param startup_dependency: Additional startup dependency
    :type startup_dependency: str or None
    :param delay_registration: Register the service parameters in the config management right away or not
    :type delay_registration: bool
    :param path: path to add service to
    :type path: str
    :return: Parameters used by the service
    :rtype: dict
    """
    if params is None:
        params = {}
    if path is None:
        path = self._config_template_dir.format('systemd')
    else:
        path = path.format('systemd')
    service_name = self._get_name(name, client, path)
    template_file = '{0}/{1}.service'.format(path, service_name)

    if not client.file_exists(template_file):
        # Given template doesn't exist so we are probably using system init scripts
        return {}

    if target_name is not None:
        service_name = target_name

    # These placeholders are substituted into the template body below
    params.update({'SERVICE_NAME': ExtensionsToolbox.remove_prefix(service_name, 'ovs-'),
                   'RUN_FILE_DIR': self._run_file_dir,
                   'STARTUP_DEPENDENCY': '' if startup_dependency is None else '{0}.service'.format(startup_dependency)})
    template_content = client.file_read(template_file)
    # Replace every '<KEY>' placeholder in the template with its parameter value
    for key, value in params.iteritems():
        template_content = template_content.replace('<{0}>'.format(key), str(value))
    service_path = self.get_service_file_path(service_name)
    client.file_write(service_path, template_content)

    try:
        # Make systemd pick up the new unit file and enable it at boot
        client.run(['systemctl', 'daemon-reload'])
        client.run(['systemctl', 'enable', '{0}.service'.format(service_name)])
    except CalledProcessError as cpe:
        self._logger.exception('Add {0}.service failed, {1}'.format(service_name, cpe.output))
        raise Exception('Add {0}.service failed, {1}'.format(service_name, cpe.output))

    if delay_registration is False:
        self.register_service(service_metadata=params, node_name=self._system.get_my_machine_id(client))
    return params
def add_backend(self, backend_name, domains=None, scaling='LOCAL'):
    """
    Add a backend with provided domains and scaling to the model.
    :param backend_name: name of the backend
    :type backend_name: str
    :param domains: domains the backend is linked to
    :type domains: {}
    :param scaling: scaling of the backend
    :type scaling: str
    :raises ValueError: when an unknown domain name is passed
    """
    if domains is None:
        domains = []
    else:
        # Every referenced domain must already be present in the model
        for domain_name in domains:
            if domain_name not in self._domains:
                raise ValueError('Invalid domain passed: {0}'.format(domain_name))
    ExtensionsToolbox.verify_required_params(
        required_params={'backend_name': (str, Toolbox.regex_backend, True),
                         'domains': (list, self._domains, True),
                         'scaling': (str, AlbaBackend.SCALINGS, True)},
        actual_params={'backend_name': backend_name,
                       'domains': domains,
                       'scaling': scaling},
        verify_keys=True)
    backend_dict = {'name': backend_name,
                    'domains': {'domain_guids': domains},
                    'scaling': scaling}
    setup = self.config.setdefault('setup', {})
    self._backends.append(backend_dict['name'])
    setup.setdefault('backends', []).append(backend_dict)
def configure_proxy(backend_name, proxy_configuration):
    """
    Apply the given configuration items to every alba proxy of every vPool backed
    by the given backend, writing the previous configs to /root/old_proxies first
    and restarting each proxy service afterwards.
    :param backend_name: name of the backend whose vPool proxies must be reconfigured
    :type backend_name: str
    :param proxy_configuration: configuration items to merge into each proxy config;
                                keys must be a subset of ProxySetup.PARAMS
    :type proxy_configuration: dict
    :raises ValueError: when unsupported configuration keys are passed
    """
    # Reject any key not known to ProxySetup before touching live configs
    faulty_keys = [key for key in proxy_configuration.keys() if key not in ProxySetup.PARAMS]
    if len(faulty_keys) > 0:
        raise ValueError('{0} are unsupported keys for proxy configuration.'.format(', '.join(faulty_keys)))
    ExtensionsToolbox.verify_required_params(ProxySetup.PARAMS, proxy_configuration)
    vpools = VPoolList.get_vpools()
    service_manager = ServiceFactory.get_manager()
    # Keep a plain-text backup of the old proxy configs for manual rollback
    with open('/root/old_proxies', 'w') as backup_file:
        for vpool in vpools:
            # Only vPools backed by the requested backend are touched
            if vpool.metadata['backend']['backend_info']['name'] != backend_name:
                continue
            for storagedriver in vpool.storagedrivers:
                for proxy in storagedriver.alba_proxies:
                    config_loc = 'ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, proxy.guid)
                    proxy_service = Service(proxy.service_guid)
                    proxy_config = Configuration.get(config_loc)
                    old_proxy_config = dict(proxy_config)
                    backup_file.write('{} -- {}\n'.format(config_loc, old_proxy_config))
                    proxy_config.update(proxy_configuration)
                    ProxySetup.LOGGER.info("Changed {0} to {1} for proxy {2}".format(old_proxy_config, proxy_config, config_loc))
                    ProxySetup.LOGGER.info("Changed items {0}".format([(key, value) for key, value in proxy_config.iteritems() if key not in old_proxy_config.keys()]))
                    Configuration.set(config_loc, json.dumps(proxy_config, indent=4), raw=True)
                    # Restart the proxy service so the new configuration takes effect
                    client = SSHClient(storagedriver.storage_ip, username='******')
                    service_manager.restart_service(proxy_service.name, client=client)
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs
    Gathers all internal Arakoon clusters (plus the config cluster 'cacc' outside
    unittests), groups the collapse work per node and runs
    'arakoon --collapse-local' on each node over SSH.
    :return: None
    """
    from ovs_extensions.generic.toolbox import ExtensionsToolbox
    GenericController._logger.info('Arakoon collapse started')
    cluster_info = []
    storagerouters = StorageRouterList.get_storagerouters()
    if os.environ.get('RUNNING_UNITTESTS') != 'True':
        # The config management cluster ('cacc') is only collapsed outside unittests
        cluster_info = [('cacc', storagerouters[0])]
    cluster_names = []
    for service in ServiceList.get_services():
        if service.is_internal is True and service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                                                 ServiceType.SERVICE_TYPES.NS_MGR,
                                                                 ServiceType.SERVICE_TYPES.ALBA_MGR):
            cluster = ExtensionsToolbox.remove_prefix(service.name, 'arakoon-')
            # Each regular cluster is scheduled once, no matter on how many nodes it runs
            if cluster in cluster_names and cluster not in [ARAKOON_NAME, ARAKOON_NAME_UNITTEST]:
                continue
            cluster_names.append(cluster)
            cluster_info.append((cluster, service.storagerouter))
    workload = {}            # node ip -> {'node_id': ..., 'clusters': [(cluster, source_ip), ...]}
    cluster_config_map = {}  # cluster name -> ArakoonClusterConfig
    for cluster, storagerouter in cluster_info:
        GenericController._logger.debug(' Collecting info for cluster {0}'.format(cluster))
        # The config cluster must be read directly from the node instead of config management
        ip = storagerouter.ip if cluster in [ARAKOON_NAME, ARAKOON_NAME_UNITTEST] else None
        try:
            config = ArakoonClusterConfig(cluster_id=cluster, source_ip=ip)
            cluster_config_map[cluster] = config
        except:
            # Best-effort: skip clusters whose configuration cannot be retrieved
            GenericController._logger.exception(' Retrieving cluster information on {0} for {1} failed'.format(storagerouter.ip, cluster))
            continue
        for node in config.nodes:
            if node.ip not in workload:
                workload[node.ip] = {'node_id': node.name,
                                     'clusters': []}
            workload[node.ip]['clusters'].append((cluster, ip))
    for storagerouter in storagerouters:
        try:
            if storagerouter.ip not in workload:
                continue
            node_workload = workload[storagerouter.ip]
            client = SSHClient(storagerouter)
            for cluster, ip in node_workload['clusters']:
                try:
                    GenericController._logger.debug(' Collapsing cluster {0} on {1}'.format(cluster, storagerouter.ip))
                    # '2' is the number of tlogs to keep after collapsing
                    client.run(['arakoon', '--collapse-local', node_workload['node_id'], '2', '-config', cluster_config_map[cluster].external_config_path])
                    GenericController._logger.debug(' Collapsing cluster {0} on {1} completed'.format(cluster, storagerouter.ip))
                except:
                    # Best-effort: a failed collapse of one cluster must not stop the rest
                    GenericController._logger.exception(' Collapsing cluster {0} on {1} failed'.format(cluster, storagerouter.ip))
        except UnableToConnectException:
            GenericController._logger.error(' Could not collapse any cluster on {0} (not reachable)'.format(storagerouter.name))
    GenericController._logger.info('Arakoon collapse finished')
def configure_mds(self, config):
    """
    Configure the global MDS settings for this vPool
    :param config: MDS configuration settings (Can contain amount of tlogs to wait for during MDS checkup, MDS safety and the maximum load for an MDS)
    :type config: dict
    :raises RuntimeError: If specified safety not between 1 and 5
                          If specified amount of tlogs is less than 1
                          If specified maximum load is less than 10%
    :return: None
    :rtype: NoneType
    """
    if self.vpool is None:
        raise RuntimeError('Cannot configure MDS settings when no vPool has been created yet')
    ExtensionsToolbox.verify_required_params(verify_keys=True,
                                             actual_params=config,
                                             required_params={'mds_tlogs': (int, {'min': 1}, False),
                                                              'mds_safety': (int, {'min': 1, 'max': 5}, False),
                                                              'mds_maxload': (int, {'min': 10}, False)})
    # Don't set a default value here, because we need to know whether these values have been specifically set or were set at None
    self.mds_tlogs = config.get('mds_tlogs')
    self.mds_safety = config.get('mds_safety')
    self.mds_maxload = config.get('mds_maxload')
    # Fall back to the defaults only in the persisted configuration
    mds_config = {'mds_tlogs': self.mds_tlogs or 100,
                  'mds_safety': self.mds_safety or 3,
                  'mds_maxload': self.mds_maxload or 75}
    Configuration.set(key='/ovs/vpools/{0}/mds_config'.format(self.vpool.guid), value=mds_config)
def get_run_file_path(self, name):
    # type: (str) -> str
    """
    Get the path to the run file for the given service
    This is tied to the template files as they specify something like `/opt/OpenvStorage/run/<SERVICE_NAME>.version`
    :param name: Name of the service
    :type name: str
    :return: Path to the file
    :rtype: str
    """
    non_ovs_name = ExtensionsToolbox.remove_prefix(name, self.OVS_SERVICE_PREFIX)
    # Join '<name>.version' as a single path component: passing '.version' as a
    # separate os.path.join argument produced '<run_dir>/<name>/.version' (an extra
    # directory level) instead of the '<run_dir>/<name>.version' layout the
    # service templates document above.
    return os.path.join(self._run_file_dir, '{0}.version'.format(non_ovs_name))
def unregister_service(self, node_name, service_name):
    # type: (str, str) -> None
    """
    Un-register the metadata of a service from the configuration management
    :param node_name: Name of the node on which to un-register the service
    :type node_name: str
    :param service_name: Name of the service to clean from the configuration management
    :type service_name: str
    :return: None
    :rtype: NoneType
    """
    short_name = ExtensionsToolbox.remove_prefix(service_name, self.OVS_SERVICE_PREFIX)
    config_key = self.service_config_key.format(node_name, short_name)
    self._configuration.delete(key=config_key)
def __init__(self, ip=None, port=None, database=None):
    # type: (str, int, str) -> None
    """
    Create client instance for graphite and validate parameters
    :param ip: IP address of the client to send graphite data towards
    :type ip: str
    :param port: port of the UDP listening socket
    :type port: int
    :param database: name of the database
    :type database: str
    :raises RuntimeError: when no ip is given and no graphite config is stored
    """
    graphite_data = {}
    # Fetch the stored config whenever the ip is missing: the original only fetched
    # it when BOTH ip and port were None, so ip=None with an explicit port crashed
    # with a KeyError on graphite_data['ip'] below. A given ip with port=None still
    # falls back to the default port without requiring a stored config.
    if ip is None:
        graphite_data = self.get_graphite_config()
        if not graphite_data:
            raise RuntimeError('No graphite data found in config path `{0}`'.format(self.CONFIG_PATH))
    ip = ip or graphite_data['ip']
    port = port or graphite_data.get('port', 2003)
    ExtensionsToolbox.verify_required_params(verify_keys=True,
                                             actual_params={'host': ip,
                                                            'port': port},
                                             required_params={'host': (str, ExtensionsToolbox.regex_ip, True),
                                                              'port': (int, {'min': 1025, 'max': 65535}, True)})
    super(GraphiteClient, self).__init__(ip=ip, port=port, database=database)
def validate_and_retrieve_config(cls):
    """
    Retrieve and validate the configuration for StatsMonkey
    :raises ValueError: When the configuration key is missing or its value is not a dict
    :return: The configuration set at /ovs/framework/monitoring/stats_monkey
    :rtype: dict
    """
    config_key = '/ovs/framework/monitoring/stats_monkey'
    config = cls._get_configuration()
    if not config.exists(config_key):
        raise ValueError('StatsMonkey requires a configuration key at {0}'.format(config_key))
    config = config.get(config_key)
    if not isinstance(config, dict):
        raise ValueError('StatsMonkey configuration must be of type dict')
    required_params = {'host': (str, ExtensionsToolbox.regex_ip),
                       'port': (int, {'min': 1025, 'max': 65535}),
                       'interval': (int, {'min': 1}, False),
                       'database': (str, None),
                       'transport': (str, ['influxdb', 'redis', 'graphite']),
                       'environment': (str, None)}
    # Username is only relevant for influxdb; password for both influxdb and redis
    if config.get('transport') == 'influxdb':
        required_params['username'] = (str, None)
    # Was misspelled as 'reddis', so a Redis transport never had its password validated
    if config.get('transport') in ['influxdb', 'redis']:
        required_params['password'] = (str, None)
    ExtensionsToolbox.verify_required_params(actual_params=config, required_params=required_params)
    # Cache the validated configuration on the class for later use
    cls._config = config
    return cls._config
def test_filter_dict_for_none(self):
    """Verify that filter_dict_for_none recursively drops all None values (and keys whose nested dicts become empty are kept only if non-empty)"""
    d = {'a': 'a',
         'b': {'b1': 'b1', 'b2': None},
         'c': None,
         'd': {'d1': {'d11': {'d111': 'd111'}}},
         'e': {'e1': None}}
    expected = {'a': 'a',
                'b': {'b1': 'b1'},
                'd': {'d1': {'d11': {'d111': 'd111'}}}}
    filtered_dict = ExtensionsToolbox.filter_dict_for_none(d)
    # assertEquals is a deprecated alias of assertEqual (removed in recent Python versions)
    self.assertEqual(filtered_dict, expected)
def register_service(self, node_name, service_metadata):
    # type: (str, dict) -> None
    """
    Register the metadata of the service to the configuration management
    :param node_name: Name of the node on which the service is running
    :type node_name: str
    :param service_metadata: Metadata of the service
    :type service_metadata: dict
    :return: None
    :rtype: NoneType
    """
    # The metadata carries its own service name; strip the OVS prefix for the config key
    stripped_name = ExtensionsToolbox.remove_prefix(service_metadata['SERVICE_NAME'], self.OVS_SERVICE_PREFIX)
    config_key = self.service_config_key.format(node_name, stripped_name)
    self._configuration.set(key=config_key, value=service_metadata)
def check_collapse(cls, result_handler, max_collapse_age=3, min_tlx_amount=10):
    """
    Verifies collapsing has occurred for all Arakoons
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :param max_collapse_age: tlx files may not be longer than x days
    :type max_collapse_age: int
    :param min_tlx_amount: Minimum amount of tlxes before making collapsing mandatory (defaults to 10)
    :type min_tlx_amount: int
    :return: None
    :rtype: NoneType
    """
    arakoon_clusters = cls._get_arakoon_clusters(result_handler)
    result_handler.info('Starting Arakoon collapse test', add_to_result=False)
    # A tlx older than this many seconds (relative to the youngest tlog) means collapsing is overdue
    max_age_seconds = timedelta(days=max_collapse_age).total_seconds()
    result_handler.info('Retrieving all collapsing statistics. This might take a while', add_to_result=False)
    start = time.time()
    arakoon_stats = cls._retrieve_stats(result_handler, arakoon_clusters)
    result_handler.info('Retrieving all collapsing statistics succeeded (duration: {0})'.format(time.time() - start), add_to_result=False)
    for cluster_type, clusters in arakoon_stats.iteritems():
        result_handler.info('Testing the collapse of {0} Arakoons'.format(cluster_type), add_to_result=False)
        for cluster in clusters:
            cluster_name = cluster['cluster_name']
            collapse_result = cluster['collapse_result']
            # Sort the per-node results by IP so the report order is deterministic
            collapse_result = OrderedDict(sorted(collapse_result.items(),
                                                 key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
            for node, stats in collapse_result.iteritems():
                identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                if len(stats['errors']) > 0:
                    # Determine where issues were found
                    for step, exception in stats['errors']:
                        if step == 'build_client':
                            try:
                                # Raise the thrown exception so we can dispatch on its type below
                                raise exception
                            except TimeOutException:
                                result_handler.warning('Connection to {0} has timed out'.format(identifier_log), code=ErrorCodes.ssh_connection_time)
                            except (socket.error, UnableToConnectException):
                                result_handler.failure('Connection to {0} could not be established'.format(identifier_log), code=ErrorCodes.ssh_connection_fail)
                            except NotAuthenticatedException:
                                result_handler.skip('Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'.format(identifier_log), code=ErrorCodes.ssh_connection_authentication)
                            except Exception:
                                message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                cls.logger.exception(message)
                                result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                        elif step == 'stat_dir':
                            try:
                                # Same dispatch trick: re-raise to classify the stored exception
                                raise exception
                            except Exception:
                                message = 'Unable to list the contents of the tlog directory ({0}) for {1}'.format(node.tlog_dir, identifier_log)
                                cls.logger.exception(message)
                                result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                    # Errors were reported for this node; skip the actual collapse checks
                    continue
                tlx_files = stats['result']['tlx']
                tlog_files = stats['result']['tlog']
                headdb_files = stats['result']['headDB']
                avail_size = stats['result']['avail_size']
                if any(item is None for item in [tlx_files, tlog_files, avail_size]):
                    # Exception occurred but no errors were logged
                    result_handler.exception('Either the tlx or tlog files or available size could be found in/of the tlog directory ({0}) for {1}'.format(node.tlog_dir, identifier_log),
                                             code=ErrorCodes.tlx_tlog_not_found)
                    continue
                if len(headdb_files) > 0:
                    # i[2] is presumably the head.db file size in bytes — it is summed and compared
                    # against the available space; TODO confirm against _retrieve_stats
                    headdb_size = sum([int(i[2]) for i in headdb_files])
                    collapse_size_msg = 'Spare space for local collapse is'
                    # Grade the free space against multiples of the head.db size
                    if avail_size >= headdb_size * 4:
                        result_handler.success('{0} sufficient (n > 4x head.db size)'.format(collapse_size_msg))
                    elif avail_size >= headdb_size * 3:
                        result_handler.warning('{0} running short (n > 3x head.db size)'.format(collapse_size_msg))
                    elif avail_size >= headdb_size * 2:
                        result_handler.failure('{0} just enough (n > 2x head.db size'.format(collapse_size_msg))
                    else:
                        result_handler.failure('{0} insufficient (n <2 x head.db size'.format(collapse_size_msg))
                if len(tlog_files) == 0:
                    # A tlog should always be present
                    result_handler.failure('{0} has no open tlog'.format(identifier_log), code=ErrorCodes.tlog_not_found)
                    continue
                if len(tlx_files) < min_tlx_amount:
                    result_handler.skip('{0} only has {1} tlx, not worth collapsing (required: {2})'.format(identifier_log, len(tlx_files), min_tlx_amount))
                    continue
                # Compare youngest tlog and oldest tlx timestamp
                seconds_difference = int(tlog_files[-1][0]) - int(tlx_files[0][0])
                if max_age_seconds > seconds_difference:
                    result_handler.success('{0} should not be collapsed. The oldest tlx is at least {1} days younger than the youngest tlog (actual age: {2})'.format(identifier_log, max_collapse_age, str(timedelta(seconds=seconds_difference))),
                                           code=ErrorCodes.collapse_ok)
                else:
                    result_handler.failure('{0} should be collapsed. The oldest tlx is currently {1} old'.format(identifier_log, str(timedelta(seconds=seconds_difference))),
                                           code=ErrorCodes.collapse_not_ok)
def check_if_proxies_work(cls, result_handler):
    """
    Checks if all Alba Proxies work on a local machine, it creates a namespace and tries to put and object
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None (returns the — empty — list of non-working presets early when no local proxies are found)
    :rtype: NoneType
    """
    # Expected shape of the 'show-namespace' output; used to validate the fetched namespace info
    namespace_params = {'bucket_count': (list, None),
                        'logical': (int, None),
                        'storage': (int, None),
                        'storage_per_osd': (list, None)}
    result_handler.info('Checking the ALBA proxies.', add_to_result=False)
    amount_of_presets_not_working = []
    # ignore possible subprocess output
    fnull = open(os.devnull, 'w')
    # try put/get/verify on all available proxies on the local node
    local_proxies = ServiceHelper.get_local_proxy_services()
    if len(local_proxies) == 0:
        result_handler.info('Found no proxies.', add_to_result=False)
        return amount_of_presets_not_working
    # Cache for Framework API preset lookups, keyed on the backend API url
    api_cache = {}
    for service in local_proxies:
        try:
            result_handler.info('Checking ALBA proxy {0}.'.format(service.name), add_to_result=False)
            ip = service.alba_proxy.storagedriver.storage_ip
            # Encapsulating try to determine test output
            try:
                # Determine what to what backend the proxy is connected
                proxy_client_cfg = AlbaCLI.run(command='proxy-client-cfg', named_params={'host': ip, 'port': service.ports[0]})
            except AlbaException:
                result_handler.failure('Fetching proxy info has failed. Please verify if {0}:{1} is the correct address for proxy {2}.'.format(ip, service.ports[0], service.name),
                                       code=ErrorCodes.alba_cmd_fail)
                continue
            # Fetch arakoon information
            abm_name = proxy_client_cfg.get('cluster_id')
            # Check if proxy config is correctly setup
            if abm_name is None:
                raise ConfigNotMatchedException('Proxy config for proxy {0} does not have the correct format on node {1} with port {2}.'.format(service.name, ip, service.ports[0]))
            abm_config = Configuration.get_configuration_path('/ovs/vpools/{0}/proxies/{1}/config/abm'
                                                              .format(service.alba_proxy.storagedriver.vpool.guid, service.alba_proxy.guid))
            # Determine presets / backend
            try:
                presets = AlbaCLI.run(command='list-presets', config=abm_config)
            except AlbaException:
                result_handler.failure('Listing the presets has failed. Please check the arakoon config path. We used {0}'.format(abm_config), code=ErrorCodes.alba_cmd_fail)
                continue
            for preset in presets:
                # If preset is not in use, test will fail so add a skip
                if preset['in_use'] is False:
                    result_handler.skip('Preset {0} is not in use and will not be checked'.format(preset['name']))
                    continue
                preset_name = preset['name']
                # Encapsulation try for cleanup
                try:
                    # Generate new namespace name using the preset
                    namespace_key_prefix = 'ovs-healthcheck-ns-{0}-{1}'.format(preset_name, AlbaHealthCheck.LOCAL_ID)
                    namespace_key = '{0}_{1}'.format(namespace_key_prefix, uuid.uuid4())
                    object_key = 'ovs-healthcheck-obj-{0}'.format(str(uuid.uuid4()))
                    # Create namespace
                    AlbaCLI.run(command='proxy-create-namespace',
                                named_params={'host': ip, 'port': service.ports[0]},
                                extra_params=[namespace_key, preset_name])
                    # Wait until fully created
                    namespace_start_time = time.time()
                    for index in xrange(2):
                        # Running twice because the first one could give a false positive as the osds will alert the nsm
                        # and the nsm would respond with got messages but these were not the ones we are after
                        AlbaCLI.run(command='deliver-messages', config=abm_config)
                        while True:
                            if time.time() - namespace_start_time > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                raise AlbaTimeOutException('Creating namespace has timed out after {0}s'.format(time.time() - namespace_start_time), 'deliver-messages')
                            list_ns_osds_output = AlbaCLI.run(command='list-ns-osds', config=abm_config, extra_params=[namespace_key])
                            # Example output: [[0, [u'Active']], [3, [u'Active']]]
                            namespace_ready = True
                            for osd_info in list_ns_osds_output:
                                if osd_info[1][0] != 'Active':
                                    # If we found an OSD not Active, check if preset is satisfiable
                                    namespace_ready = False
                                    break
                            if namespace_ready is True:
                                break
                            else:
                                result_handler.info('Not all OSDs have responded to the creation message. Fetching the safety', add_to_result=False)
                                try:
                                    # Fetch the preset information on the Framework
                                    # This add an extra delay for the messages to propagate too
                                    vpool = service.alba_proxy.storagedriver.vpool
                                    alba_backend_guid = vpool.metadata['backend']['backend_info']['alba_backend_guid']
                                    api_url = 'alba/backends/{0}'.format(alba_backend_guid)
                                    if api_url not in api_cache:
                                        connection_info = vpool.metadata['backend']['backend_info']['connection_info']
                                        api_client = OVSClient(connection_info['host'], connection_info['port'], (connection_info['client_id'], connection_info['client_secret']))
                                        start = time.time()
                                        _presets = api_client.get(api_url, params={'contents': 'presets'})['presets']
                                        api_cache[api_url] = _presets
                                        result_handler.info('Fetching the safety took {0} seconds'.format(time.time() - start))
                                    _presets = api_cache[api_url]
                                    _preset = filter(lambda p: p['name'] == preset_name, _presets)[0]
                                    if _preset['is_available'] is True:
                                        # Preset satisfiable, don't care about osds availability
                                        result_handler.info('Requested preset is available, no longer waiting on \'deliver_messages\'', add_to_result=False)
                                        break
                                    else:
                                        raise ValueError('Requested preset is marked as unavailable. Please check the disk safety'.format(time.time() - namespace_start_time))
                                except ValueError:
                                    raise
                                except Exception:
                                    msg = 'Could not query the preset data. Checking the preset might timeout'
                                    result_handler.warning(msg)
                                    cls.logger.exception(msg)
                            # Sleep for syncing purposes
                            time.sleep(1)
                    result_handler.success('Namespace successfully created on proxy {0} with preset {1}!'.format(service.name, preset_name),
                                           code=ErrorCodes.proxy_namespace_create)
                    namespace_info = AlbaCLI.run(command='show-namespace', config=abm_config, extra_params=[namespace_key])
                    ExtensionsToolbox.verify_required_params(required_params=namespace_params, actual_params=namespace_info)
                    result_handler.success('Namespace successfully fetched on proxy {0} with preset {1}!'.format(service.name, preset_name),
                                           code=ErrorCodes.proxy_namespace_fetch)
                    # Put test object to given dir
                    with open(AlbaHealthCheck.TEMP_FILE_LOC, 'wb') as output_file:
                        output_file.write(os.urandom(AlbaHealthCheck.TEMP_FILE_SIZE))
                    AlbaCLI.run(command='proxy-upload-object',
                                named_params={'host': ip, 'port': service.ports[0]},
                                extra_params=[namespace_key, AlbaHealthCheck.TEMP_FILE_LOC, object_key])
                    result_handler.success('Successfully uploaded the object to namespace {0}'.format(namespace_key),
                                           code=ErrorCodes.proxy_upload_obj)
                    # download object
                    AlbaCLI.run(command='proxy-download-object',
                                named_params={'host': ip, 'port': service.ports[0]},
                                extra_params=[namespace_key, object_key, AlbaHealthCheck.TEMP_FILE_FETCHED_LOC])
                    result_handler.success('Successfully downloaded the object to namespace {0}'.format(namespace_key),
                                           code=ErrorCodes.proxy_download_obj)
                    # check if files exists - issue #57
                    if not(os.path.isfile(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC) and os.path.isfile(AlbaHealthCheck.TEMP_FILE_LOC)):
                        # creation of object failed
                        raise ObjectNotFoundException(ValueError('Creation of object has failed'))
                    # Round-trip integrity: md5 of the uploaded file must match md5 of the downloaded one
                    hash_original = hashlib.md5(open(AlbaHealthCheck.TEMP_FILE_LOC, 'rb').read()).hexdigest()
                    hash_fetched = hashlib.md5(open(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC, 'rb').read()).hexdigest()
                    if hash_original == hash_fetched:
                        result_handler.success('Fetched object {0} from namespace {1} on proxy {2} with preset {3} matches the created object!'.format(object_key, namespace_key, service.name, preset_name),
                                               code=ErrorCodes.proxy_verify_obj)
                    else:
                        result_handler.failure('Fetched object {0} from namespace {1} on proxy {2} with preset {3} does not match the created object!'.format(object_key, namespace_key, service.name, preset_name),
                                               code=ErrorCodes.proxy_verify_obj_fail)
                except ValueError:
                    result_handler.failure('The preset is not available for use')
                except ObjectNotFoundException as ex:
                    amount_of_presets_not_working.append(preset_name)
                    result_handler.failure('Failed to put object on namespace {0} failed on proxy {1}with preset {2} With error {3}'.format(namespace_key, service.name, preset_name, ex))
                except AlbaTimeOutException as ex:
                    result_handler.failure(str(ex))
                except AlbaException as ex:
                    code = ErrorCodes.alba_cmd_fail
                    # Report which AlbaCLI sub-command failed
                    if ex.alba_command == 'proxy-create-namespace':
                        result_handler.failure('Create namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name),
                                               code=code)
                    elif ex.alba_command == 'show-namespace':
                        result_handler.failure('Show namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name),
                                               code=code)
                    elif ex.alba_command == 'proxy-upload-object':
                        result_handler.failure('Uploading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name),
                                               code=code)
                    elif ex.alba_command == 'proxy-download-object':
                        result_handler.failure('Downloading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name),
                                               code=code)
                finally:
                    # Delete the created namespace and preset
                    subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_LOC)], stdout=fnull, stderr=subprocess.STDOUT)
                    subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC)], stdout=fnull, stderr=subprocess.STDOUT)
                    try:
                        namespaces = AlbaCLI.run(command='list-namespaces', config=abm_config)
                        namespaces_to_remove = []
                        proxy_named_params = {'host': ip, 'port': service.ports[0]}
                        # Remove both the namespace of this run and leftovers of earlier runs (same prefix)
                        for namespace in namespaces:
                            if namespace['name'].startswith(namespace_key_prefix):
                                namespaces_to_remove.append(namespace['name'])
                        for namespace_name in namespaces_to_remove:
                            if namespace_name == namespace_key:
                                result_handler.info('Deleting namespace {0}.'.format(namespace_name))
                            else:
                                result_handler.warning('Deleting namespace {0} which was leftover from a previous run.'.format(namespace_name))
                            AlbaCLI.run(command='proxy-delete-namespace', named_params=proxy_named_params, extra_params=[namespace_name])
                            namespace_delete_start = time.time()
                            while True:
                                try:
                                    AlbaCLI.run(command='show-namespace', config=abm_config, extra_params=[namespace_name])  # Will fail if the namespace does not exist
                                except AlbaException:
                                    result_handler.success('Namespace {0} successfully removed.'.format(namespace_name))
                                    break
                                # NOTE(review): message uses namespace_start_time (creation start) rather
                                # than namespace_delete_start, so the reported duration looks off — confirm
                                if time.time() - namespace_delete_start > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                    raise AlbaTimeOutException('Delete namespace has timed out after {0}s'.format(time.time() - namespace_start_time), 'show-namespace')
                            # be tidy, and make the proxy forget the namespace
                            try:
                                AlbaCLI.run(command='proxy-statistics', named_params=proxy_named_params, extra_params=['--forget', namespace_name])
                            except:
                                result_handler.warning('Failed to make proxy forget namespace {0}.'.format(namespace_name))
                    except AlbaException as ex:
                        if ex.alba_command == 'list-namespaces':
                            result_handler.failure('list namespaces has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name))
                        elif ex.alba_command == 'proxy-delete-namespace':
                            result_handler.failure('Delete namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name))
        except subprocess.CalledProcessError as ex:
            # this should stay for the deletion of the remaining files
            amount_of_presets_not_working.append(service.name)
            result_handler.failure('Proxy {0} has some problems. Got {1} as error'.format(service.name, ex),
                                   code=ErrorCodes.proxy_problems)
        except ConfigNotMatchedException as ex:
            amount_of_presets_not_working.append(service.name)
            result_handler.failure('Proxy {0} has some problems. Got {1} as error'.format(service.name, ex),
                                   code=ErrorCodes.proxy_problems)
def check_arakoon_ports(cls, result_handler):
    """
    Verifies that the Arakoon clusters still respond to connections
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    arakoon_clusters = cls._get_arakoon_clusters(result_handler)
    result_handler.info('Starting Arakoon ports test.', add_to_result=False)
    # These messages previously said 'collapsing statistics' / 'the collapse of' —
    # copy-pasted from check_collapse and misleading in a port-connectivity test
    result_handler.info('Retrieving all port connection information. This might take a while', add_to_result=False)
    start = time.time()
    arakoon_stats = cls._get_port_connections(result_handler, arakoon_clusters)
    result_handler.info('Retrieving all port connection information succeeded (duration: {0})'.format(time.time() - start), add_to_result=False)
    for cluster_type, clusters in arakoon_stats.iteritems():
        result_handler.info('Testing the ports of {0} Arakoons'.format(cluster_type), add_to_result=False)
        for cluster in clusters:
            cluster_name = cluster['cluster_name']
            connection_result = cluster['connection_result']
            # Sort the per-node results by IP so the report order is deterministic
            connection_result = OrderedDict(sorted(connection_result.items(),
                                                   key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
            for node, stats in connection_result.iteritems():
                identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                if len(stats['errors']) > 0:
                    # Determine where issues were found
                    for step, exception in stats['errors']:
                        if step == 'test_connection':
                            try:
                                # Raise the thrown exception so it can be logged with its traceback
                                raise exception
                            except Exception:
                                message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                cls.logger.exception(message)
                                result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                    # Errors were reported for this node; skip the result evaluation
                    continue
                if stats['result'] is True:
                    result_handler.success('Connection established to {0}'.format(identifier_log),
                                           code=ErrorCodes.arakoon_connection_ok)
                else:
                    result_handler.failure('Connection could not be established to {0}'.format(identifier_log),
                                           code=ErrorCodes.arakoon_connection_failure)
def check_arakoon_fd(cls, result_handler, fd_limit=30, passed_connections=None):
    """
    Checks all current open tcp file descriptors for all Arakoon clusters in the OVS cluster
    Will raise warnings when these reach a certain threshold
    :param result_handler: Logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :param fd_limit: Threshold for the number of tcp connections for which to start logging warnings
    :type fd_limit: int
    :param passed_connections: checked TCP connections
    :type passed_connections: list
    :return: None
    :rtype: NoneType
    """
    if passed_connections is None:
        passed_connections = ['ESTABLISHED', 'TIME_WAIT']
    # Integer thresholds at 80% and 95% of the fd limit (Python 2 integer division)
    warning_threshold = fd_limit * 80 / 100
    error_threshold = fd_limit * 95 / 100
    result_handler.info('Starting Arakoon integrity test', add_to_result=False)
    arakoon_clusters = cls._get_arakoon_clusters(result_handler)
    start = time.time()
    arakoon_fd_results = cls._get_filedescriptors(result_handler, arakoon_clusters)
    result_handler.info('Retrieving all file descriptor information succeeded (duration: {0})'.format(time.time() - start), add_to_result=False)
    for cluster_type, clusters in arakoon_fd_results.iteritems():
        result_handler.info('Checking the file descriptors of {0} Arakoons'.format(cluster_type), add_to_result=False)
        for cluster in clusters:
            cluster_name = cluster['cluster_name']
            fd_result = cluster['fd_result']
            # Sort the per-node results by IP so the report order is deterministic
            fd_result = OrderedDict(sorted(fd_result.items(),
                                           key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
            for node, stats in fd_result.iteritems():
                identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                if len(stats['errors']) > 0:
                    # Determine where issues were found
                    for step, exception in stats['errors']:
                        if step == 'build_client':
                            try:
                                # Raise the thrown exception so we can dispatch on its type below
                                raise exception
                            except TimeOutException:
                                result_handler.warning('Connection to {0} has timed out'.format(identifier_log), code=ErrorCodes.ssh_connection_time)
                            except (socket.error, UnableToConnectException):
                                result_handler.failure('Connection to {0} could not be established'.format(identifier_log), code=ErrorCodes.ssh_connection_fail)
                            except NotAuthenticatedException:
                                result_handler.skip('Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'.format(identifier_log), code=ErrorCodes.ssh_connection_authentication)
                            except Exception:
                                message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                cls.logger.exception(message)
                                result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                        elif step == 'lsof':
                            try:
                                # Same dispatch trick: re-raise to classify the stored exception
                                raise exception
                            except Exception:
                                message = 'Unable to list the file descriptors for {0}'.format(identifier_log)
                                cls.logger.exception(message)
                                result_handler.exception(message, ErrorCodes.unhandled_exception)
                    # Errors were reported for this node; skip the fd evaluation
                    continue
                fds = stats['result']['fds']
                # Each entry appears to end in a parenthesised connection state, e.g. '(ESTABLISHED)';
                # keep only the states we were asked to check — TODO confirm against _get_filedescriptors
                filtered_fds = [i for i in fds if i.split()[-1].strip('(').strip(')') in passed_connections]
                if len(filtered_fds) >= warning_threshold:
                    if len(filtered_fds) >= error_threshold:
                        result_handler.warning('Number of TCP connections exceeded the 95% warning threshold for {0}, ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit), code=ErrorCodes.arakoon_fd_95)
                    else:
                        result_handler.warning('Number of TCP connections exceeded the 80% warning threshold for {0}, ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit), code=ErrorCodes.arakoon_fd_80)
                else:
                    result_handler.success('Number of TCP connections for {0} is healthy ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit), code=ErrorCodes.arakoon_fd_ok)