class HealthCheckShared(object):
    """
    Constants for the HealthcheckCLI
    """
    ADDON_TYPE = 'healthcheck'
    CACHE_KEY = 'ovs_healthcheck_discover_method'
    logger = Logger("healthcheck-ovs_clirunner")

    CMD_FOLDER = os.path.join(os.path.dirname(__file__), 'suites')  # Folder to query for commands

    CONTEXT_SETTINGS_KEY = '/ovs/healthcheck/default_arguments'
    _context_settings = {}  # Cache

    @staticmethod
    def get_healthcheck_results(result_handler):
        # type: (HCResults) -> dict
        """
        Output the Healthcheck results
        :param result_handler: HCResults instance
        :type result_handler: HCResults
        :return: dict with information
        :rtype: dict
        """
        recap_executer = 'Health Check'
        result = result_handler.get_results()
        result_handler.info("Recap of {0}!".format(recap_executer))
        result_handler.info("======================")
        recount = []  # Order matters
        for severity in ['SUCCESS', 'FAILED', 'SKIPPED', 'WARNING', 'EXCEPTION']:
            recount.append((severity, result_handler.counter[severity]))
        result_handler.info(' '.join('{0}={1}'.format(s, v) for s, v in recount))
        # Returns a dict with minimal and detailed information
        return {'result': result, 'recap': dict(recount)}

    @classmethod
    def get_default_arguments(cls):
        if not cls._context_settings:
            cls._context_settings = Configuration.get(cls.CONTEXT_SETTINGS_KEY, default={})
        return cls._context_settings
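
# Example (illustrative sketch, not part of the original module): consuming the
# recap dict returned by get_healthcheck_results(). The exit-code convention
# shown here is an assumption for illustration, not an OVS API.
def _example_recap_usage(result_handler):  # pragma: no cover
    import sys
    results = HealthCheckShared.get_healthcheck_results(result_handler)
    # e.g. {'SUCCESS': 12, 'FAILED': 0, 'SKIPPED': 3, 'WARNING': 1, 'EXCEPTION': 0}
    recap = results['recap']
    if recap['FAILED'] > 0 or recap['EXCEPTION'] > 0:
        sys.exit(1)
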
class VolumedriverHealthCheck(object):
    """
    A healthcheck for the volumedriver components
    """
    MODULE = 'volumedriver'
    LOCAL_ID = System.get_my_machine_id()
    LOCAL_SR = System.get_my_storagerouter()
    VDISK_CHECK_SIZE = 1024 ** 3  # 1GB in bytes
    VDISK_HALTED_STATES = DataObject.enumerator('Halted_status', ['HALTED', 'FENCED'])
    VDISK_TIMEOUT_BEFORE_DELETE = 0.5
    # Only used to check the status of a fenced volume. This should not be used to link a status to a non-halted/fenced volume
    FENCED_HALTED_STATUS_MAP = {'max_redirect': {'status': VDisk.STATUSES.NON_RUNNING,
                                                 'severity': 'failure',
                                                 'halted': ('These volumes are not running: {0}', ErrorCodes.volume_max_redirect),
                                                 'fenced': ('These volumes are fenced but not running on another node: {0}', ErrorCodes.volume_fenced_max_redirect)},
                                'halted': {'status': VDisk.STATUSES.HALTED,
                                           'severity': 'failure',
                                           'halted': ('These volumes are halted: {0}', ErrorCodes.volume_halted),
                                           'fenced': ('These volumes are fenced but halted on another node: {0}', ErrorCodes.volume_fenced_halted)},
                                'connection_fail': {'status': 'UNKNOWN',
                                                    'severity': 'failure',
                                                    'halted': ('These volumes experienced a connectivity/timeout problem: {0}', ErrorCodes.voldrv_connection_problem),
                                                    'fenced': ('These volumes are fenced but experienced a connectivity/timeout problem on another node: {0}', ErrorCodes.voldrv_connection_problem)},
                                'ok': {'status': VDisk.STATUSES.RUNNING,
                                       'severity': 'failure',
                                       'halted': ('These volumes are running: {0}', ErrorCodes.volume_ok),
                                       'fenced': ('These volumes are fenced but running on another node: {0}', ErrorCodes.volume_fenced_ok)},
                                'not_found': {'status': 'NOT_FOUND',
                                              'severity': 'warning',
                                              'halted': ('These volumes could not be queried for information: {0}', ErrorCodes.volume_not_found),
                                              'fenced': ('These volumes are fenced but could not be queried for information on another node: {0}', ErrorCodes.volume_fenced_not_found)}}

    logger = Logger('healthcheck-ovs_volumedriver')

    @staticmethod
    @expose_to_cli(MODULE, 'dtl-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the DTL of all VDisks is properly running',
                   short_help='Test if DTL is properly running')
    def check_dtl(result_handler):
        """
        Checks the DTL for all vdisks on the local node
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        # Fetch vdisks hosted on this machine
        local_sr = System.get_my_storagerouter()
        if len(local_sr.vdisks_guids) == 0:
            return result_handler.skip('No VDisks present in cluster.')
        for vdisk_guid in local_sr.vdisks_guids:
            vdisk = VDisk(vdisk_guid)
            vdisk.invalidate_dynamics(['dtl_status', 'info'])
            if vdisk.dtl_status in ('ok_standalone', 'disabled'):
                result_handler.success('VDisk {0}\'s DTL is disabled'.format(vdisk.name), code=ErrorCodes.volume_dtl_standalone)
            elif vdisk.dtl_status == 'ok_sync':
                result_handler.success('VDisk {0}\'s DTL is enabled and running.'.format(vdisk.name), code=ErrorCodes.volume_dtl_ok)
            elif vdisk.dtl_status == 'degraded':
                result_handler.warning('VDisk {0}\'s DTL is degraded.'.format(vdisk.name), code=ErrorCodes.volume_dtl_degraded)
            elif vdisk.dtl_status == 'checkup_required':
                result_handler.warning('VDisk {0}\'s DTL should be configured.'.format(vdisk.name), code=ErrorCodes.volume_dtl_checkup_required)
            elif vdisk.dtl_status == 'catch_up':
                result_handler.warning('VDisk {0}\'s DTL is enabled but still syncing.'.format(vdisk.name), code=ErrorCodes.volume_dtl_catch_up)
            else:
                result_handler.warning('VDisk {0}\'s DTL has an unknown status: {1}.'.format(vdisk.name, vdisk.dtl_status), code=ErrorCodes.volume_dtl_unknown)

    @staticmethod
    @timeout_decorator.timeout(30)
    def _check_volumedriver(vdisk_name, storagedriver_guid, logger, vdisk_size=VDISK_CHECK_SIZE):
        """
        Checks if the volumedriver can create a new vdisk
        :param vdisk_name: name of a vdisk (e.g. test.raw)
        :type vdisk_name: str
        :param storagedriver_guid: guid of a storagedriver
        :type storagedriver_guid: str
        :param logger: logger instance
        :type logger: ovs.extensions.healthcheck.result.HCResults
        :param vdisk_size: size of the volume in bytes (e.g. 10737418240 is 10GB in bytes)
        :type vdisk_size: int
        :return: True if the creation succeeded
        :rtype: bool
        """
        try:
            VDiskController.create_new(vdisk_name, vdisk_size, storagedriver_guid)
        except FileExistsException:
            # Can be ignored until fixed in framework
            # https://github.com/openvstorage/framework/issues/1247
            return True
        except Exception as ex:
            logger.failure('Creation of the vdisk failed. Got {0}'.format(str(ex)))
            return False
        return True

    @staticmethod
    @timeout_decorator.timeout(30)
    def _check_volumedriver_remove(vpool_name, vdisk_name, present=True):
        """
        Removes a vdisk from a vpool
        :param vdisk_name: name of a vdisk (e.g. test.raw)
        :type vdisk_name: str
        :param vpool_name: name of a vpool
        :type vpool_name: str
        :param present: should the disk be present?
        :type present: bool
        :return: True if the disk is no longer present
        :rtype: bool
        """
        try:
            vdisk = VDiskHelper.get_vdisk_by_name(vdisk_name=vdisk_name, vpool_name=vpool_name)
            VDiskController.delete(vdisk.guid)
            return True
        except VDiskNotFoundError:
            # Not found; if it should be present, re-raise the exception
            if present:
                raise
            else:
                return True

    @staticmethod
    # @expose_to_cli(MODULE, 'volumedrivers-test', HealthCheckCLI.ADDON_TYPE,
    #                help='Verify that the Volumedrivers are responding to events',
    #                short_help='Test if Volumedrivers are responding to events')
    def check_volumedrivers(result_handler):
        """
        Checks if the volumedrivers work on a local machine (compatible with multiple vPools)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking volumedrivers.', add_to_result=False)
        vpools = VPoolList.get_vpools()
        if len(vpools) == 0:
            result_handler.skip('No vPools found!')
            return
        for vp in vpools:
            name = 'ovs-healthcheck-test-{0}.raw'.format(VolumedriverHealthCheck.LOCAL_ID)
            if vp.guid not in VolumedriverHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
                continue
            try:
                # Resolve the local storagedriver for this vPool
                storagedriver_guid = next((storagedriver.guid for storagedriver in vp.storagedrivers
                                           if storagedriver.storagedriver_id == vp.name + VolumedriverHealthCheck.LOCAL_ID))
                # Create a new vdisk
                volume = VolumedriverHealthCheck._check_volumedriver(name, storagedriver_guid, result_handler)
                if volume is True:
                    # Delete the recently created vdisk
                    try:
                        VolumedriverHealthCheck._check_volumedriver_remove(vpool_name=vp.name, vdisk_name=name)
                    except Exception as ex:
                        raise RuntimeError('Could not delete the created volume. Got {0}'.format(str(ex)))
                    # Working at this point
                    result_handler.success('Volumedriver of vPool {0} is working fine!'.format(vp.name))
                else:
                    # Not working
                    result_handler.failure('Something went wrong during vdisk creation on vpool {0}.'.format(vp.name))
            except TimeoutError:
                # Timeout occurred, the action took too long
                result_handler.warning('Volumedriver of vPool {0} seems to time out.'.format(vp.name))
            except IOError as ex:
                # Can be an input/output error by the volumedriver
                result_handler.failure('Volumedriver of vPool {0} seems to have IO problems. Got `{1}` while executing.'.format(vp.name, ex.message))
            except RuntimeError as ex:
                result_handler.failure('Volumedriver of vPool {0} seems to have problems. Got `{1}` while executing.'.format(vp.name, ex))
            except VDiskNotFoundError:
                result_handler.warning('Volume on vPool {0} was not found, please retry.'.format(vp.name))
            except Exception as ex:
                result_handler.failure('Uncaught exception for Volumedriver of vPool {0}. Got {1} while executing.'.format(vp.name, ex))
            finally:
                # Attempt to delete the created vdisk
                try:
                    VolumedriverHealthCheck._check_volumedriver_remove(vpool_name=vp.name, vdisk_name=name, present=False)
                except:
                    pass

    @classmethod
    def _is_volumedriver_timeout(cls, exception):
        """
        Validates whether a certain exception is a timeout exception (RuntimeError, prior to NodeNotReachable in voldriver 6.17)
        :param exception: Exception object to check
        :return: True if it is a timeout, False if it is not
        :rtype: bool
        """
        return isinstance(exception, ClusterNotReachableException) or (isinstance(exception, RuntimeError) and 'failed to send XMLRPC request' in str(exception))

    @classmethod
    @expose_to_cli(MODULE, 'halted-volumes-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that there are no halted/fenced volumes within the cluster',
                   short_help='Test if there are no halted/fenced volumes')
    def check_for_halted_volumes(cls, result_handler):
        """
        Checks for halted volumes on a single or multiple vPools
        This will only check the volume states on the current node. If any other volumedriver is down, only the HA'd volumes
        will pop up, as they could appear halted here (should be verified by the volumedriver team)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        vpools = VPoolList.get_vpools()
        local_sr = System.get_my_storagerouter()
        if len(vpools) == 0:
            result_handler.skip('No vPools found!', code=ErrorCodes.vpools_none)
            return
        for vpool in vpools:
            log_start = 'Halted volumes test vPool {0}'.format(vpool.name)
            if vpool.guid not in local_sr.vpools_guids:
                result_handler.skip('{0} - Skipping vPool {1} because it is not living here.'.format(log_start, vpool.name),
                                    code=ErrorCodes.vpool_not_local, add_to_result=False)
                continue
            result_handler.info('{0} - Retrieving all information'.format(log_start), add_to_result=False)
            storagedriver = None
            for std in vpool.storagedrivers:
                if std.storagerouter_guid == local_sr.guid:
                    storagedriver = std
                    break
            if storagedriver is None:
                result_handler.failure('{0} - Could not associate a StorageDriver with this StorageRouter'.format(log_start),
                                       code=ErrorCodes.std_no_str)
                continue
            volume_fenced_states = dict((key, []) for key in cls.FENCED_HALTED_STATUS_MAP.keys())
            volume_lists = {cls.VDISK_HALTED_STATES.HALTED: [], cls.VDISK_HALTED_STATES.FENCED: []}
            volume_states = {cls.VDISK_HALTED_STATES.HALTED: {cls.VDISK_HALTED_STATES.HALTED: volume_lists[cls.VDISK_HALTED_STATES.HALTED]},
                             cls.VDISK_HALTED_STATES.FENCED: volume_fenced_states}  # Fewer loops to write for outputting
            result_handler.info('{0} - Scanning for halted volumes'.format(log_start), add_to_result=False)
            try:
                voldrv_client = vpool.storagedriver_client
                objectregistry_client = vpool.objectregistry_client
            except Exception:
                cls.logger.exception('{0} - Unable to instantiate the required clients'.format(log_start))
                result_handler.exception('{0} - Unable to load the Volumedriver clients'.format(log_start),
                                         code=ErrorCodes.voldr_unknown_problem)
                continue
            try:
                # List all halted volumes with the volumedriver client as it detects stolen volumes too (fenced instances)
                volumes = voldrv_client.list_halted_volumes(str(storagedriver.storagedriver_id))
            except Exception as ex:
                cls.logger.exception('{0} - Exception occurred when listing volumes'.format(log_start))
                if cls._is_volumedriver_timeout(ex) is False:
                    # Unhandled exception at this point
                    result_handler.exception('{0} - Unable to list the volumes due to an unidentified problem. Please check the logging'.format(log_start),
                                             code=ErrorCodes.voldr_unknown_problem)
                else:
                    result_handler.failure('{0} - Could not list the volumes due to a connection problem.'.format(log_start),
                                           code=ErrorCodes.voldrv_connection_problem)
                continue
            # Retrieve the parent of the current volume. If this id is not identical to the one we fetched for, the volume is fenced
            # The object registry goes to Arakoon
            # Capture any possible exception that could occur to provide a clearer view of what went wrong
            for volume in volumes:
                try:
                    registry_entry = objectregistry_client.find(volume)
                    if registry_entry.node_id() == storagedriver.storagedriver_id:
                        volume_lists[cls.VDISK_HALTED_STATES.HALTED].append(volume)
                    else:
                        # Fenced
                        volume_lists[cls.VDISK_HALTED_STATES.FENCED].append(volume)
                except Exception:
                    msg = '{0} - Unable to consult the object registry client for volume \'{1}\''.format(log_start, volume)
                    cls.logger.exception(msg)
                    result_handler.exception(msg, code=ErrorCodes.voldr_unknown_problem)
            # Include the fenced - OTHER state combination
            for volume in volume_lists[cls.VDISK_HALTED_STATES.FENCED]:
                try:
                    _, state = cls._get_volume_issue(voldrv_client, volume, log_start)
                    volume_fenced_states[state].append(volume)
                except Exception:
                    # Only unhandled exceptions at this point
                    result_handler.exception('{0} - Unable to retrieve the volume info for volume {1} due to an unidentified problem. Please check the logging'.format(log_start, volume),
                                             code=ErrorCodes.voldr_unknown_problem)
            for halted_state, volume_state_info in volume_states.iteritems():
                for state, volumes in volume_state_info.iteritems():
                    if len(volumes) == 0:
                        continue  # Skip OK/empty lists
                    map_value = cls.FENCED_HALTED_STATUS_MAP[state.lower()]
                    log_func = getattr(result_handler, map_value['severity'])
                    message, code = map_value[halted_state.lower()]
                    log_func('{0} - {1}'.format(log_start, message.format(', '.join(volumes))), code=code)
            # Call success in case nothing is wrong
            if all(len(l) == 0 for l in volume_lists.values()):
                result_handler.success('{0} - No volumes found in halted/fenced state'.format(log_start))

    @classmethod
    def _get_volume_issue(cls, voldrv_client, volume_id, log_start):
        """
        Maps all possible exceptions to a state. These states can be mapped to a status using the FENCED_HALTED_STATUS_MAP
        because the volumedriver does not return a state itself
        :param voldrv_client: Storagedriver client
        :param volume_id: Id of the volume
        :raises: The unhandled exception when such an exception could occur (we try to identify all problems but one could slip past us)
        :return: The volume_id and state
        :rtype: tuple(str, str)
        """
        state = 'ok'
        try:
            # Check if information can be retrieved about the volume
            vol_info = voldrv_client.info_volume(volume_id, req_timeout_secs=5)
            if vol_info.halted is True:
                state = 'halted'
        except Exception as ex:
            cls.logger.exception('{0} - Exception occurred when fetching the info for volume \'{1}\''.format(log_start, volume_id))
            if isinstance(ex, ObjectNotFoundException):
                # Ignore ovsdb invalid entries as model consistency will handle it
                state = 'not_found'
            elif isinstance(ex, MaxRedirectsExceededException):
                # This means the volume is not halted but detached or unreachable for the volumedriver
                state = 'max_redirect'
            # @todo replace RuntimeError with NodeNotReachableException
            elif any(isinstance(ex, exception) for exception in [ClusterNotReachableException, RuntimeError]):
                if cls._is_volumedriver_timeout(ex) is False:
                    # Unhandled exception at this point
                    raise
                # Timeout / connection problems
                state = 'connection_fail'
            else:
                # Something to be looked at
                raise
        return volume_id, state

    @staticmethod
    @timeout_decorator.timeout(5)
    def _check_filedriver(vp_name, test_name):
        """
        Checks if a filedriver `touch` works on a vpool
        Always check if the file exists after performing this method
        :param vp_name: name of the vpool
        :type vp_name: str
        :param test_name: name of the test file (e.g. `ovs-healthcheck-LOCAL_ID`)
        :type test_name: str
        :return: True if it succeeded, False if it failed
        :rtype: bool
        """
        return subprocess.check_output('touch /mnt/{0}/{1}.xml'.format(vp_name, test_name), stderr=subprocess.STDOUT, shell=True)

    @staticmethod
    @timeout_decorator.timeout(5)
    def _check_filedriver_remove(vp_name):
        """
        Checks if a filedriver `remove` works on a vpool
        Always check if the file exists after performing this method
        :param vp_name: name of the vpool
        :type vp_name: str
        :return: True if it succeeded, False if it failed
        :rtype: bool
        """
        subprocess.check_output('rm -f /mnt/{0}/ovs-healthcheck-test-*.xml'.format(vp_name), stderr=subprocess.STDOUT, shell=True)
        return not os.path.exists('/mnt/{0}/ovs-healthcheck-test-*.xml'.format(vp_name))

    @staticmethod
    # @expose_to_cli(MODULE, 'filedrivers-test', HealthCheckCLI.ADDON_TYPE,
    #                help='Verify that all Volumedrivers are accessible through FUSE',
    #                short_help='Test if the FUSE layer is responding')
    # @todo replace the fuse test with an edge test
    def check_filedrivers(result_handler):
        """
        Checks if the file drivers work on a local machine (compatible with multiple vPools)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        """
        result_handler.info('Checking file drivers.', add_to_result=False)
        vpools = VPoolList.get_vpools()
        # Perform tests
        if len(vpools) == 0:
            result_handler.skip('No vPools found!')
            return
        for vp in vpools:
            name = 'ovs-healthcheck-test-{0}'.format(VolumedriverHealthCheck.LOCAL_ID)
            if vp.guid not in VolumedriverHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
                continue
            try:
                VolumedriverHealthCheck._check_filedriver(vp.name, name)
                if os.path.exists('/mnt/{0}/{1}.xml'.format(vp.name, name)):
                    # Working
                    VolumedriverHealthCheck._check_filedriver_remove(vp.name)
                    result_handler.success('Filedriver for vPool {0} is working fine!'.format(vp.name))
                else:
                    # Not working
                    result_handler.failure('Filedriver for vPool {0} seems to have problems!'.format(vp.name))
            except TimeoutError:
                # Timeout occurred, the action took too long
                result_handler.warning('Filedriver of vPool {0} seems to have `timeout` problems'.format(vp.name))
            except subprocess.CalledProcessError:
                # Can be an input/output error by the filedriver
                result_handler.failure('Filedriver of vPool {0} seems to have `input/output` problems'.format(vp.name))

    @staticmethod
    @expose_to_cli(MODULE, 'volume-potential-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the Volumedrivers have enough VDisk potential left',
                   short_help='Test if the Volumedrivers can create enough VDisks')
    @expose_to_cli.option('--critical-vol-number', '-c', type=int, default=25,
                          help='Minimum number of volumes left to create')
    def check_volume_potential(result_handler, critical_vol_number=25):
        """
        Checks all local storagedrivers of a volumedriver. Results in a success if enough volumes are available,
        a warning if the number of volumes is lower than the threshold value (critical_vol_number)
        and a failure if the number of volumes is 0
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param critical_vol_number: Minimal number of volumes that can be created before throwing a warning
        :type critical_vol_number: int
        """
        result_handler.info('Checking volume potential of storagedrivers')
        if not isinstance(critical_vol_number, int) or critical_vol_number < 0:
            raise ValueError('Critical volume number should be a positive integer')
        for std in VolumedriverHealthCheck.LOCAL_SR.storagedrivers:
            try:
                std_config = StorageDriverConfiguration(std.vpool_guid, std.storagedriver_id)
                client = LocalStorageRouterClient(std_config.remote_path)
                vol_potential = client.volume_potential(str(std.storagedriver_id))
                if vol_potential >= critical_vol_number:
                    log_level = 'success'
                elif critical_vol_number > vol_potential > 0:
                    log_level = 'warning'
                else:
                    log_level = 'failure'
                getattr(result_handler, log_level)('Volume potential of local storage driver: {0}: {1} (potential at: {2})'.format(std.storagedriver_id, log_level.upper(), vol_potential))
            except RuntimeError:
                result_handler.exception('Unable to retrieve configuration for storagedriver {0}'.format(std.storagedriver_id))

    @staticmethod
    @expose_to_cli(MODULE, 'sco-cache-mountpoint-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that sco-cache mountpoints are up and running',
                   short_help='Test if sco-cache mountpoints are up and running')
    def check_sco_cache_mountpoints(result_handler):
        """
        Iterates over the StorageDrivers of the local StorageRouter and checks all their sco cache mount points.
        Will log a warning if a sco cache mount point is in offline state
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        """
        result_handler.info('Checking sco cache mount points on all local storagedrivers')
        for std in VolumedriverHealthCheck.LOCAL_SR.storagedrivers:
            try:
                std_config = StorageDriverConfiguration(std.vpool_guid, std.storagedriver_id)
                client = LocalStorageRouterClient(std_config.remote_path)
                for std_info in client.sco_cache_mount_point_info(str(std.storagedriver_id)):
                    if std_info.offlined is True:
                        result_handler.warning('Mountpoint at location {0} of storagedriver {1} is in offline state'.format(std_info.path, std.storagedriver_id))
                    else:
                        result_handler.success('Mountpoint at location {0} of storagedriver {1} is in online state'.format(std_info.path, std.storagedriver_id))
            except RuntimeError:
                result_handler.exception('Unable to check sco cache mountpoint of storagedriver {0}'.format(std.storagedriver_id))
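
# Example (illustrative sketch, not part of the original module): how a
# (state, halted/fenced) pair resolves to a log call through FENCED_HALTED_STATUS_MAP,
# mirroring the reporting loop in check_for_halted_volumes. The volume ids are hypothetical.
def _example_fenced_halted_lookup(result_handler):  # pragma: no cover
    map_value = VolumedriverHealthCheck.FENCED_HALTED_STATUS_MAP['max_redirect']
    log_func = getattr(result_handler, map_value['severity'])  # result_handler.failure
    message, code = map_value['fenced']
    # Logs: 'These volumes are fenced but not running on another node: vol_1, vol_2'
    log_func(message.format(', '.join(['vol_1', 'vol_2'])), code=code)
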
def run(command, config=None, named_params=None, extra_params=None, client=None, debug=False, to_json=True):
    """
    Executes a command on ALBA
    When --to-json is NOT passed:
    * An error occurs --> exit code != 0
    * It worked --> exit code == 0

    When --to-json is passed:
    * An error occurs during verification of the passed parameters --> exit code != 0
    * An error occurs while executing the command --> exit code == 0 (error in json output)
    * It worked --> exit code == 0
    :param command: The command to execute, eg: 'list-namespaces'
    :type command: str
    :param config: The configuration location to be used, eg: 'arakoon://config/ovs/arakoon/ovsdb/config?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini'
    :type config: str
    :param named_params: Additional parameters to be given to the command, eg: {'long-id': ','.join(asd_ids)}
    :type named_params: dict
    :param extra_params: Additional parameters to be given to the command, eg: [name]
    :type extra_params: list
    :param client: A client on which to execute the command
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param debug: Log additional output
    :type debug: bool
    :param to_json: Parse the output as json
    :type to_json: bool
    :return: The output of the command
    :rtype: dict
    """
    if named_params is None:
        named_params = {}
    if extra_params is None:
        extra_params = []
    logger = Logger('healthcheck-alba_cli')
    if os.environ.get('RUNNING_UNITTESTS') == 'True':
        # For the unittests, all commands are passed to a mocked ALBA
        from ovs.extensions.plugins.tests.alba_mockups import VirtualAlbaBackend
        named_params.update({'config': config})
        named_params.update({'extra_params': extra_params})
        return getattr(VirtualAlbaBackend, command.replace('-', '_'))(**named_params)
    debug_log = []
    try:
        if to_json is True:
            extra_options = ["--to-json"]
        else:
            extra_options = []
        cmd_list = ['/usr/bin/alba', command] + extra_options
        if config is not None:
            cmd_list.append('--config={0}'.format(config))
        for key, value in named_params.iteritems():
            cmd_list.append('--{0}={1}'.format(key, value))
        cmd_list.extend(extra_params)
        cmd_string = ' '.join(cmd_list)
        debug_log.append('Command: {0}'.format(cmd_string))
        start = time.time()
        try:
            if client is None:
                try:
                    if not hasattr(select, 'poll'):
                        import subprocess
                        subprocess._has_poll = False  # Damn 'monkey patching'
                    channel = Popen(cmd_list, stdout=PIPE, stderr=PIPE, universal_newlines=True)
                except OSError as ose:
                    raise CalledProcessError(1, cmd_string, str(ose))
                output, stderr = channel.communicate()
                output = re.sub(r'[^\x00-\x7F]+', '', output)
                stderr_debug = 'stderr: {0}'.format(stderr)
                stdout_debug = 'stdout: {0}'.format(output)
                if debug is True:
                    logger.debug(stderr_debug)
                debug_log.append(stdout_debug)
                exit_code = channel.returncode
                if exit_code != 0:  # Raise the same error as check_output
                    raise CalledProcessError(exit_code, cmd_string, output)
            else:
                if debug is True:
                    output, stderr = client.run(cmd_list, debug=True)
                    debug_log.append('stderr: {0}'.format(stderr))
                else:
                    output = client.run(cmd_list, debug=False).strip()
                debug_log.append('stdout: {0}'.format(output))
            if to_json is True:
                output = json.loads(output)
            else:
                return output
            duration = time.time() - start
            if duration > 0.5:
                logger.warning('AlbaCLI call {0} took {1}s'.format(command, round(duration, 2)))
        except CalledProcessError as cpe:
            try:
                output = json.loads(cpe.output)
            except Exception:
                raise RuntimeError('Executing command {0} failed with output {1}'.format(cmd_string, cpe.output))
        if output['success'] is True:
            return output['result']
        raise RuntimeError(output['error']['message'])
    except Exception as ex:
        logger.exception('Error: {0}'.format(ex))
        # In case there is an exception, we always log the debug trail
        for debug_line in debug_log:
            logger.debug(debug_line)
        raise AlbaException(str(ex), command)
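
# Example (illustrative sketch, not part of the original module): a typical run()
# invocation. The config URL follows the pattern from the docstring above and is
# an assumption for illustration.
def _example_alba_run():  # pragma: no cover
    config = 'arakoon://config/ovs/arakoon/ovsdb/config?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini'
    # With to_json=True (the default) run() returns the parsed 'result' value of the
    # JSON envelope, or raises RuntimeError/AlbaException on failure.
    namespaces = run('list-namespaces', config=config)
    return namespaces
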
class CLIRunner(object):
    """
    Runs a method exposed by the expose_to_cli decorator.
    Serves as a base for all extensions using expose_to_cli
    """
    logger = Logger("healthcheck-ovs_clirunner")
    START_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
    CACHE_KEY = 'ovs_discover_method'
    _WILDCARD = 'X'

    def __init__(self):
        pass

    @classmethod
    def _get_methods(cls, module_name=_WILDCARD, method_name=_WILDCARD, addon_type=None):
        """
        Gets the methods matching the specified values
        :param module_name: module to which the methods belong
        :type module_name: str
        :param method_name: name of the method
        :type method_name: str
        :param addon_type: type of the method, distinguishes different addons
        :type addon_type: str
        :return: list of all found functions
        :rtype: list[function]
        """
        result = []
        discovered_data = cls._discover_methods()
        module_names = discovered_data.keys() if module_name == cls._WILDCARD else [module_name]
        for module_name in module_names:
            if module_name not in discovered_data:
                raise ModuleNotRecognizedException()
            for function_data in discovered_data[module_name]:
                if addon_type != function_data['addon_type'] or (method_name != cls._WILDCARD and method_name != function_data['method_name']):
                    continue
                mod = imp.load_source(function_data['module_name'], function_data['location'])
                cl = getattr(mod, function_data['class'])()
                result.append(getattr(cl, function_data['function']))
                if method_name == function_data['method_name']:
                    break
        return result

    @classmethod
    def extract_arguments(cls, *args):
        """
        Extracts arguments from the CLI
        Always expects a module_name and a method_name (the wildcard is X)
        :param args: arguments passed on by bash
        :return: tuple of module_name, method_name, bool whether --help was passed and the remaining arguments
        :rtype: tuple(str, str, bool, list)
        """
        args = list(args)
        help_requested = False
        # Always expect at least X X
        if len(args) < 2:
            raise ValueError('Expecting at least {0} {0} as arguments.'.format(cls._WILDCARD))
        if '--help' in args[0:3]:
            args.remove('--help')
            help_requested = True
        return args.pop(0), args.pop(0), help_requested, args

    @classmethod
    def run_method(cls, *args):
        """
        Executes the given method
        :return: None
        :rtype: NoneType
        """
        module_name, method_name, help_requested, args = cls.extract_arguments(*args)
        try:
            found_method_pointers = cls._get_methods(module_name, method_name)
        except ModuleNotRecognizedException:
            cls.print_help(cls._get_methods(), error_help=True)
            return
        if len(found_method_pointers) == 0:  # Module found but no methods -> print help
            cls.print_help(cls._get_methods(module_name), error_help=True)
            return
        if help_requested is True:
            cls.print_help(found_method_pointers)
            return
        try:
            for found_method in found_method_pointers:
                found_method(*args)
        except KeyboardInterrupt:
            cls.logger.warning('Caught keyboard interrupt. Output may be incomplete!')

    @classmethod
    def _discover_methods(cls):
        """
        Discovers all methods with the expose_to_cli decorator
        :return: dict that contains the required info based on module_name and method_name
        :rtype: dict
        """
        time_format = "%Y-%m-%d %H:%M:%S"
        version_id = 1
        start_path = cls.START_PATH
        client = VolatileFactory.get_client()
        cache_expiry_hours = 2  # Amount of hours before the cache expires

        def build_cache():
            """
            Builds a dict listing all discovered methods with @expose_to_cli
            :return: None
            :rtype: NoneType
            """
            # Build the cache
            # Executed from lib, want to go to extensions/healthcheck
            found_items = {'expires': (datetime.now() + timedelta(hours=cache_expiry_hours)).strftime(time_format)}
            path = start_path
            for root, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    if not (filename.endswith('.py') and filename != '__init__.py'):
                        continue
                    name = filename.replace('.py', '')
                    file_path = os.path.join(root, filename)
                    # Import file
                    mod = imp.load_source(name, file_path)
                    for member in inspect.getmembers(mod):
                        if not (inspect.isclass(member[1]) and member[1].__module__ == name and 'object' in [base.__name__ for base in member[1].__bases__]):
                            continue
                        for submember in inspect.getmembers(member[1]):
                            if not hasattr(submember[1], 'expose_to_cli'):
                                continue
                            exposed_data = submember[1].expose_to_cli
                            method_module_name = exposed_data['module_name']
                            method_name = exposed_data['method_name']
                            method_addon_type = exposed_data['addon_type'] if 'addon_type' in exposed_data else None
                            if method_module_name not in found_items:
                                found_items[method_module_name] = []
                            # noinspection PyUnresolvedReferences
                            found_items[method_module_name].append({'method_name': method_name,
                                                                    'module_name': name,
                                                                    'function': submember[1].__name__,
                                                                    'class': member[1].__name__,
                                                                    'location': file_path,
                                                                    'version': version_id,
                                                                    'addon_type': method_addon_type})
            client.set(cls.CACHE_KEY, found_items)

        exposed_methods = client.get(cls.CACHE_KEY)
        # Re-use the cached discovery data while it has not expired yet
        if exposed_methods and datetime.strptime(exposed_methods['expires'], time_format) > datetime.now():
            del exposed_methods['expires']
            return exposed_methods
        build_cache()
        exposed_methods = client.get(cls.CACHE_KEY)
        del exposed_methods['expires']
        return exposed_methods

    @classmethod
    def print_help(cls, method_pointers=None, error_help=False):
        """
        Prints the possible methods that are exposed to the CLI
        :param method_pointers: list of method pointers
        :type method_pointers: list[function]
        :param error_help: print extra help in case wrong arguments were supplied
        :type error_help: bool
        :return: None
        :rtype: NoneType
        """
        if error_help is True:
            print 'Could not process your arguments.'
        if len(method_pointers) == 0:
            # Nothing found for the search terms
            print 'Found no methods matching your search terms.'
        elif len(method_pointers) == 1:
            # Found only one method -> the search term was module_name + method_name
            print method_pointers[0].__doc__
            return
        print 'Possible optional arguments are:'
        # Multiple entries found means only the module_name was supplied
        print 'ovs healthcheck {0} {0} -- will run all checks'.format(CLIRunner._WILDCARD)
        print 'ovs healthcheck MODULE {0} -- will run all checks for module'.format(CLIRunner._WILDCARD)
        # Sort based on module_name
        print_dict = {}
        for method_pointer in method_pointers:
            module_name = method_pointer.expose_to_cli['module_name']
            method_name = method_pointer.expose_to_cli['method_name']
            if module_name in print_dict:
                print_dict[module_name].append(method_name)
                continue
            print_dict[module_name] = [method_name]
        for module_name, method_names in print_dict.iteritems():
            for method_name in method_names:
                print "ovs healthcheck {0} {1}".format(module_name, method_name)
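
# Example (illustrative sketch, not part of the original module): the metadata
# contract _discover_methods() scans for. Any callable carrying an 'expose_to_cli'
# dict attribute is discovered, provided its file lives under START_PATH; the
# DummyHealthCheck class and all names below are hypothetical.
#
#     class DummyHealthCheck(object):
#         @staticmethod
#         def dummy_test(result_handler):
#             pass
#     DummyHealthCheck.dummy_test.expose_to_cli = {'module_name': 'dummy',
#                                                  'method_name': 'dummy-test',
#                                                  'addon_type': None}
#
# After the discovery cache is (re)built, 'ovs healthcheck dummy dummy-test'
# would resolve to DummyHealthCheck.dummy_test.
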
class HealthCheckCLIRunner(CLIRunner):
    """
    Healthcheck adaptation of CLIRunner
    Injects a result_handler instance with shared resources into every test to collect the results.
    """
    logger = Logger("healthcheck-healthcheck_clirunner")
    START_PATH = os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)), 'healthcheck')
    ADDON_TYPE = 'healthcheck'

    @staticmethod
    def _keep_old_argument_style(args):
        """
        Fills up the missing arguments with wildcards
        :param args: all arguments passed by bash
        :return: the completed argument list
        """
        args = list(args)
        possible_args = ['--help', '--unattended', '--to-json']
        indexes = [args.index(arg) for arg in args if arg in possible_args]
        if len(indexes) > 0:
            if indexes[0] == 0:
                args.insert(0, HealthCheckCLIRunner._WILDCARD)
                args.insert(1, HealthCheckCLIRunner._WILDCARD)
            elif indexes[0] == 1:
                args.insert(1, HealthCheckCLIRunner._WILDCARD)
        else:
            if len(args) == 0:
                args.insert(0, HealthCheckCLIRunner._WILDCARD)
                args.insert(1, HealthCheckCLIRunner._WILDCARD)
            elif len(args) == 1:
                args.insert(1, HealthCheckCLIRunner._WILDCARD)
        return args

    @staticmethod
    def run_method(*args):
        """
        Executes the given method
        :return: results & recap
        :rtype: dict
        """
        args = HealthCheckCLIRunner._keep_old_argument_style(args)
        unattended = False
        to_json = False
        if '--unattended' in args:
            args.remove('--unattended')
            unattended = True
        if '--to-json' in args:
            args.remove('--to-json')
            to_json = True
        module_name, method_name, help_requested, args = HealthCheckCLIRunner.extract_arguments(*args)
        result_handler = HCResults(unattended, to_json)
        try:
            found_method_pointers = HealthCheckCLIRunner._get_methods(module_name, method_name, HealthCheckCLIRunner.ADDON_TYPE)
        except ModuleNotRecognizedException:
            HealthCheckCLIRunner.print_help(HealthCheckCLIRunner._get_methods(addon_type=HealthCheckCLIRunner.ADDON_TYPE), error_help=True)
            return
        if len(found_method_pointers) == 0:  # Module found but no methods -> print help
            HealthCheckCLIRunner.print_help(HealthCheckCLIRunner._get_methods(module_name=module_name, addon_type=HealthCheckCLIRunner.ADDON_TYPE), error_help=True)
            return
        if help_requested is True:
            HealthCheckCLIRunner.print_help(found_method_pointers)
            return
        local_settings = Helper.get_local_settings()
        for key, value in local_settings.iteritems():
            result_handler.info('{0}: {1}'.format(key.replace('_', ' ').title(), value))
        try:
            result_handler.info('Starting OpenvStorage Healthcheck version {0}'.format(Helper.get_healthcheck_version()))
            result_handler.info("======================")
            for found_method in found_method_pointers:
                test_name = '{0}-{1}'.format(found_method.expose_to_cli['module_name'], found_method.expose_to_cli['method_name'])
                try:
                    # Wrapped in node_check for the callback
                    node_check(found_method)(result_handler.HCResultCollector(result=result_handler, test_name=test_name))
                except KeyboardInterrupt:
                    raise
                except Exception as ex:
                    result_handler.exception('Unhandled exception caught when executing {0}. Got {1}'.format(found_method.__name__, str(ex)))
                    HealthCheckCLIRunner.logger.exception('Unhandled exception caught when executing {0}'.format(found_method.__name__))
            return HealthCheckCLIRunner.get_results(result_handler, module_name, method_name)
        except KeyboardInterrupt:
            HealthCheckCLIRunner.logger.warning('Caught keyboard interrupt. Output may be incomplete!')
            return HealthCheckCLIRunner.get_results(result_handler, module_name, method_name)

    @staticmethod
    def get_results(result_handler, module_name, method_name):
        """
        Gets the result of the Open vStorage healthcheck
        :param result_handler: result parser
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param module_name: module name specified with the cli
        :type module_name: str
        :param method_name: method name specified with the cli
        :type method_name: str
        :return: results & recap
        :rtype: dict
        """
        recap_executer = 'Health Check'
        if module_name != HealthCheckCLIRunner._WILDCARD:
            recap_executer = '{0} module {1}'.format(recap_executer, module_name)
        if method_name != HealthCheckCLIRunner._WILDCARD:
            recap_executer = '{0} test {1}'.format(recap_executer, method_name)
        result = result_handler.get_results()
        result_handler.info("Recap of {0}!".format(recap_executer))
        result_handler.info("======================")
        result_handler.info("SUCCESS={0} FAILED={1} SKIPPED={2} WARNING={3} EXCEPTION={4}".format(
            result_handler.counters['SUCCESS'], result_handler.counters['FAILED'], result_handler.counters['SKIPPED'],
            result_handler.counters['WARNING'], result_handler.counters['EXCEPTION']))
        # Returns a dict with minimal and detailed information
        return {'result': result,
                'recap': {'SUCCESS': result_handler.counters['SUCCESS'],
                          'FAILED': result_handler.counters['FAILED'],
                          'SKIPPED': result_handler.counters['SKIPPED'],
                          'WARNING': result_handler.counters['WARNING'],
                          'EXCEPTION': result_handler.counters['EXCEPTION']}}
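
# Example (illustrative sketch, not part of the original module): how
# _keep_old_argument_style() pads legacy invocations with wildcards before
# extract_arguments() consumes them:
#
#     HealthCheckCLIRunner._keep_old_argument_style(['--unattended'])
#     # -> ['X', 'X', '--unattended']   (run everything, unattended)
#     HealthCheckCLIRunner._keep_old_argument_style(['arakoon'])
#     # -> ['arakoon', 'X']             (run every arakoon test)
#     HealthCheckCLIRunner._keep_old_argument_style(['arakoon', 'ports-test'])
#     # -> ['arakoon', 'ports-test']    (already fully specified)
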
class ArakoonHealthCheck(object): """ A healthcheck for the arakoon persistent store """ logger = Logger("healthcheck-healthcheck_arakoon") MODULE = 'arakoon' @classmethod def _get_arakoon_clusters(cls, result_handler): """ Retrieves all Arakoon clusters registered in this OVSCluster :param result_handler: Logging object :type result_handler: ovs.extensions.healthcheck.result.HCResults :return: Dict with the Arakoon cluster types as key and list with dicts which contain cluster names and pyrakoon clients :rtype: dict(str, list[dict]) """ result_handler.info('Fetching available arakoon clusters.', add_to_result=False) arakoon_clusters = {} for cluster_name in list( Configuration.list('/ovs/arakoon')) + ['cacc']: # Determine Arakoon type is_cacc = cluster_name == 'cacc' arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name, load_config=not is_cacc) if is_cacc is True: with open(Configuration.CACC_LOCATION) as config_file: contents = config_file.read() arakoon_config.read_config(contents=contents) try: arakoon_client = ArakoonInstaller.build_client(arakoon_config) except (ArakoonNoMaster, ArakoonNoMasterResult) as ex: result_handler.failure( 'Unable to find a master for Arakoon cluster {0}. (Message: {1})' .format(cluster_name, str(ex)), code=ErrorCodes.master_none) except Exception as ex: msg = 'Unable to connect to Arakoon cluster {0}. (Message: {1})'.format( cluster_name, str(ex)) result_handler.exception(msg, code=ErrorCodes.unhandled_exception) cls.logger.exception(msg) continue metadata = json.loads( arakoon_client.get(ArakoonInstaller.METADATA_KEY)) cluster_type = metadata['cluster_type'] if cluster_type not in arakoon_clusters: arakoon_clusters[cluster_type] = [] arakoon_clusters[cluster_type].append({ 'cluster_name': cluster_name, 'client': arakoon_client, 'config': arakoon_config }) return arakoon_clusters @classmethod @cluster_check @expose_to_cli( MODULE, 'nodes-test', HealthCheckCLI.ADDON_TYPE, help= 'Verify if nodes are missing and if nodes are catching up to the master', short_help='Test if there are nodes missing/catching up') @expose_to_cli.option( '--max-transactions-behind', '-m', type=int, default=10, help= 'The number of transactions that a slave can be behind a master before logging a failure' ) def check_node_status(cls, result_handler, max_transactions_behind=10): """ Checks the status of every node within the Arakoon cluster This check will report what nodes are currently missing and what nodes are catching up to the master :param result_handler: Logging object :type result_handler: ovs.extensions.healthcheck.result.HCResults :param max_transactions_behind: The number of transactions that a slave can be behind a master before logging a failure :type max_transactions_behind: int :return: None :rtype: NoneType """ result_handler.info('Starting Arakoon nodes test.', add_to_result=False) arakoon_clusters = cls._get_arakoon_clusters(result_handler) for cluster_type, clusters in arakoon_clusters.iteritems(): result_handler.info( 'Fetching the status of {0} Arakoons'.format(cluster_type), add_to_result=False) for cluster in clusters: arakoon_client = cluster['client'] cluster_name = cluster['cluster_name'] arakoon_config = cluster['config'] # Map the node ids to the object for easier lookups node_info = dict( (node.name, node) for node in arakoon_config.nodes) identifier = 'Arakoon cluster {0}'.format(cluster_name) try: statistics = arakoon_client._client.statistics() node_is = statistics['node_is'] # Look for any missing nodes within the cluster missing_ids = list( 
set(node_info.keys()) - set(node_is.keys())) if len(missing_ids) > 0: for missing_id in missing_ids: node_config = node_info[missing_id] result_handler.failure( '{0} is missing node: {1}'.format( identifier, '{0} ({1}:{2})'.format( node_config.name, node_config.ip, node_config.client_port)), code=ErrorCodes.node_missing) highest_id = max(node_is.iteritems(), key=operator.itemgetter(1))[0] for node_id, transactions in node_is.iteritems(): if node_id == highest_id: continue transactions_behind = node_is[highest_id] - transactions node_config = node_info[node_id] log = 'Node {0} ({1}:{2}) for {3} {{0}} ({4}/{5})'.format( node_config.name, node_config.ip, node_config.client_port, identifier, transactions_behind, max_transactions_behind) if transactions == 0: result_handler.warning( log.format('is catching up'), code=ErrorCodes.slave_catch_up) elif transactions_behind > max_transactions_behind: result_handler.failure( log.format('is behind the master'), code=ErrorCodes.master_behind) else: result_handler.success( log.format('is up to date'), code=ErrorCodes.node_up_to_date) except (ArakoonNoMaster, ArakoonNoMasterResult) as ex: result_handler.failure( '{0} cannot find a master. (Message: {1})'.format( identifier, str(ex)), code=ErrorCodes.master_none) except Exception as ex: cls.logger.exception( 'Unhandled exception during the nodes check') result_handler.exception( 'Testing {0} threw an unhandled exception. (Message: {1})' .format(identifier, str(ex)), code=ErrorCodes.unhandled_exception) @classmethod @cluster_check @expose_to_cli( MODULE, 'ports-test', HealthCheckCLI.ADDON_TYPE, help='Verifies that the Arakoon clusters still respond to connections', short_help='Test if Arakoons accepts connections') def check_arakoon_ports(cls, result_handler): """ Verifies that the Arakoon clusters still respond to connections :param result_handler: logging object :type result_handler: ovs.extensions.healthcheck.result.HCResults :return: None :rtype: NoneType """ arakoon_clusters = cls._get_arakoon_clusters(result_handler) result_handler.info('Starting Arakoon ports test.', add_to_result=False) result_handler.info( 'Retrieving all collapsing statistics. This might take a while', add_to_result=False) start = time.time() arakoon_stats = cls._get_port_connections(result_handler, arakoon_clusters) result_handler.info( 'Retrieving all collapsing statistics succeeded (duration: {0})'. 
format(time.time() - start), add_to_result=False) for cluster_type, clusters in arakoon_stats.iteritems(): result_handler.info( 'Testing the collapse of {0} Arakoons'.format(cluster_type), add_to_result=False) for cluster in clusters: cluster_name = cluster['cluster_name'] connection_result = cluster['connection_result'] connection_result = OrderedDict( sorted(connection_result.items(), key=lambda item: ExtensionsToolbox.advanced_sort( item[0].ip, separator='.'))) for node, stats in connection_result.iteritems(): identifier_log = 'Arakoon cluster {0} on node {1}'.format( cluster_name, node.ip) if len(stats['errors']) > 0: # Determine where issues were found for step, exception in stats['errors']: if step == 'test_connection': try: # Raise the thrown exception raise exception except Exception: message = 'Connection to {0} could not be established due to an unhandled exception.'.format( identifier_log) cls.logger.exception(message) result_handler.exception( message, code=ErrorCodes.unhandled_exception) continue if stats['result'] is True: result_handler.success( 'Connection established to {0}'.format( identifier_log), code=ErrorCodes.arakoon_connection_ok) else: result_handler.failure( 'Connection could not be established to {0}'. format(identifier_log), code=ErrorCodes.arakoon_connection_failure) @classmethod def _get_port_connections(cls, result_handler, arakoon_clusters, batch_size=10): """ Retrieve tlog/tlx stat information for a Arakoon cluster concurrently Note: this will mutate the given arakoon_clusters dict :param result_handler: logging object :type result_handler: ovs.extensions.healthcheck.result.HCResults :param arakoon_clusters: Information about all arakoon clusters, sorted by type and given config :type arakoon_clusters: dict :param batch_size: Amount of workers to collect the Arakoon information. Every worker will initiate a connection :return: Dict with tlog/tlx contents for every node config Example return: {CFG: {ovs.extensions.db.arakooninstaller.ArakoonClusterConfig object: {ovs_extensions.db.arakoon.arakooninstaller.ArakoonNodeConfig object: {'result': True, 'errors': []}, ovs_extensions.db.arakoon.arakooninstaller.ArakoonNodeConfig object: {'result': False, 'errors': []}}} :rtype: dict """ queue = Queue.Queue() # Prep work for cluster_type, clusters in arakoon_clusters.iteritems(): for cluster in clusters: cluster_name = cluster['cluster_name'] arakoon_config = cluster['config'] cluster['connection_result'] = {} for node_config in arakoon_config.nodes: result = {'errors': [], 'result': False} cluster['connection_result'][node_config] = result queue.put((cluster_name, node_config, result)) for _ in xrange(batch_size): thread = Thread(target=cls._connection_worker, args=(queue, result_handler)) thread.setDaemon( True ) # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly. 
thread.start() # Wait for all results queue.join() return arakoon_clusters @staticmethod def _connection_worker(queue, result_handler): """ Worker method to retrieve file descriptors :param queue: Queue to use :param result_handler: Logging object :return: None :rtype: NoneType """ while not queue.empty(): cluster_name, _node_config, _results = queue.get() errors = _results['errors'] identifier = 'Arakoon cluster {0} on node {1}'.format( cluster_name, _node_config.ip) result_handler.info( 'Testing the connection to {0}'.format(identifier), add_to_result=False) try: _results['result'] = NetworkHelper.check_port_connection( _node_config.client_port, _node_config.ip) except Exception as ex: errors.append(('test_connection', ex)) result_handler.warning( 'Could not test the connection to {0} ({1})'.format( identifier, str(ex)), add_to_result=False) finally: queue.task_done() @classmethod @cluster_check @expose_to_cli(MODULE, 'collapse-test', HealthCheckCLI.ADDON_TYPE, help='Verifies collapsing has occurred for all Arakoons', short_help='Test if Arakoon collapsing is not failing') @expose_to_cli.option('--max-collapse-age', '-a', type=int, default=3, help='Maximum age in days for TLX') @expose_to_cli.option('--min-tlx-amount', '-t', type=int, default=10, help='Minimum amount of TLX files before testing') def check_collapse(cls, result_handler, max_collapse_age=3, min_tlx_amount=10): """ Verifies collapsing has occurred for all Arakoons :param result_handler: logging object :type result_handler: ovs.extensions.healthcheck.result.HCResults :param max_collapse_age: tlx files may not be longer than x days :type max_collapse_age: int :param min_tlx_amount: Minimum amount of tlxes before making collapsing mandatory (defaults to 10) :type min_tlx_amount: int :return: None :rtype: NoneType """ arakoon_clusters = cls._get_arakoon_clusters(result_handler) result_handler.info('Starting Arakoon collapse test', add_to_result=False) max_age_seconds = timedelta(days=max_collapse_age).total_seconds() result_handler.info( 'Retrieving all collapsing statistics. This might take a while', add_to_result=False) start = time.time() arakoon_stats = cls._retrieve_stats(result_handler, arakoon_clusters) result_handler.info( 'Retrieving all collapsing statistics succeeded (duration: {0})'. format(time.time() - start), add_to_result=False) for cluster_type, clusters in arakoon_stats.iteritems(): result_handler.info( 'Testing the collapse of {0} Arakoons'.format(cluster_type), add_to_result=False) for cluster in clusters: cluster_name = cluster['cluster_name'] collapse_result = cluster['collapse_result'] collapse_result = OrderedDict( sorted(collapse_result.items(), key=lambda item: ExtensionsToolbox.advanced_sort( item[0].ip, separator='.'))) for node, stats in collapse_result.iteritems(): identifier_log = 'Arakoon cluster {0} on node {1}'.format( cluster_name, node.ip) if len(stats['errors']) > 0: # Determine where issues were found for step, exception in stats['errors']: if step == 'build_client': try: # Raise the thrown exception raise exception except TimeOutException: result_handler.warning( 'Connection to {0} has timed out'. format(identifier_log), code=ErrorCodes.ssh_connection_time) except (socket.error, UnableToConnectException): result_handler.failure( 'Connection to {0} could not be established' .format(identifier_log), code=ErrorCodes.ssh_connection_fail) except NotAuthenticatedException: result_handler.skip( 'Connection to {0} could not be authenticated. This node has no access to the Arakoon node.' 
.format(identifier_log), code=ErrorCodes. ssh_connection_authentication) except Exception: message = 'Connection to {0} could not be established due to an unhandled exception.'.format( identifier_log) cls.logger.exception(message) result_handler.exception( message, code=ErrorCodes.unhandled_exception) elif step == 'stat_dir': try: raise exception except Exception: message = 'Unable to list the contents of the tlog directory ({0}) for {1}'.format( node.tlog_dir, identifier_log) cls.logger.exception(message) result_handler.exception( message, code=ErrorCodes.unhandled_exception) continue tlx_files = stats['result']['tlx'] tlog_files = stats['result']['tlog'] headdb_files = stats['result']['headDB'] avail_size = stats['result']['avail_size'] if any(item is None for item in [tlx_files, tlog_files, avail_size]): # Exception occurred but no errors were logged result_handler.exception( 'Either the tlx or tlog files or available size could be found in/of the tlog directory ({0}) for {1}' .format(node.tlog_dir, identifier_log), code=ErrorCodes.tlx_tlog_not_found) continue if len(headdb_files) > 0: headdb_size = sum([int(i[2]) for i in headdb_files]) collapse_size_msg = 'Spare space for local collapse is' if avail_size >= headdb_size * 4: result_handler.success( '{0} sufficient (n > 4x head.db size)'.format( collapse_size_msg)) elif avail_size >= headdb_size * 3: result_handler.warning( '{0} running short (n > 3x head.db size)'. format(collapse_size_msg)) elif avail_size >= headdb_size * 2: result_handler.failure( '{0} just enough (n > 2x head.db size'.format( collapse_size_msg)) else: result_handler.failure( '{0} insufficient (n <2 x head.db size'.format( collapse_size_msg)) if len(tlog_files) == 0: # A tlog should always be present result_handler.failure( '{0} has no open tlog'.format(identifier_log), code=ErrorCodes.tlog_not_found) continue if len(tlx_files) < min_tlx_amount: result_handler.skip( '{0} only has {1} tlx, not worth collapsing (required: {2})' .format(identifier_log, len(tlx_files), min_tlx_amount)) continue # Compare youngest tlog and oldest tlx timestamp seconds_difference = int(tlog_files[-1][0]) - int( tlx_files[0][0]) if max_age_seconds > seconds_difference: result_handler.success( '{0} should not be collapsed. The oldest tlx is at least {1} days younger than the youngest tlog (actual age: {2})' .format( identifier_log, max_collapse_age, str(timedelta(seconds=seconds_difference))), code=ErrorCodes.collapse_ok) else: result_handler.failure( '{0} should be collapsed. The oldest tlx is currently {1} old' .format( identifier_log, str(timedelta(seconds=seconds_difference))), code=ErrorCodes.collapse_not_ok) @classmethod def _retrieve_stats(cls, result_handler, arakoon_clusters, batch_size=10): """ Retrieve tlog/tlx stat information for a Arakoon cluster concurrently Note: this will mutate the given arakoon_clusters dict :param result_handler: logging object :type result_handler: ovs.extensions.healthcheck.result.HCResults :param arakoon_clusters: Information about all arakoon clusters, sorted by type and given config :type arakoon_clusters: dict :param batch_size: Amount of workers to collect the Arakoon information. 
Every worker means a connection towards a different node :return: Dict with tlog/tlx contents for every node config Example return: {CFG: {ovs.extensions.db.arakooninstaller.ArakoonClusterConfig object: {ovs_extensions.db.arakoon.arakooninstaller.ArakoonNodeConfig object: {'result': {'tlx': [['1513174398', '/opt/OpenvStorage/db/arakoon/config/tlogs/3393.tlx']], 'tlog': [['1513178427', '/opt/OpenvStorage/db/arakoon/config/tlogs/3394.tlog']]}, 'errors': []}, ovs_extensions.db.arakoon.arakooninstaller.ArakoonNodeConfig object: {'result': {'tlx': [['1513166090', '/opt/OpenvStorage/db/arakoon/config/tlogs/3392.tlx'], ['1513174418', '/opt/OpenvStorage/db/arakoon/config/tlogs/3393.tlx']], 'tlog': [['1513178427', '/opt/OpenvStorage/db/arakoon/config/tlogs/3394.tlog']]}, 'errors': []}, <ovs_extensions.db.arakoon.arakooninstaller.ArakoonNodeConfig object at 0x7fb3a84db090>: {'output': {'tlx': [['1513174358', '/opt/OpenvStorage/db/arakoon/config/tlogs/3393.tlx']], 'tlog': [['1513178427', '/opt/OpenvStorage/db/arakoon/config/tlogs/3394.tlog']]}, 'errors': []}}} :rtype: dict """ queue = Queue.Queue() clients = {} # Prep work for cluster_type, clusters in arakoon_clusters.iteritems(): for cluster in clusters: cluster_name = cluster['cluster_name'] arakoon_config = cluster['config'] cluster['collapse_result'] = {} for node_config in arakoon_config.nodes: result = { 'errors': [], 'result': { 'tlx': [], 'tlog': [], 'headDB': [], 'avail_size': None } } cluster['collapse_result'][node_config] = result # Build SSHClients outside the threads to avoid GIL try: client = clients.get(node_config.ip) if client is None: client = SSHClient(node_config.ip, timeout=5) clients[node_config.ip] = client except Exception as ex: result['errors'].append(('build_client', ex)) continue queue.put((cluster_name, node_config, result)) # Limit to one session for every node. # Every process will fork from this one, creating a new session instead of using the already existing channel # There might be an issue issue if a ssh session would take too long causing all workers to connect to that one node # and therefore hitting the MaxSessions again (theory) for _ in xrange(min(len(clients.keys()), batch_size)): thread = Thread(target=cls._collapse_worker, args=(queue, clients, result_handler)) thread.setDaemon( True ) # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly. 
thread.start() # Wait for all results queue.join() return arakoon_clusters @staticmethod def _collapse_worker(queue, clients, result_handler): """ Worker method to retrieve file descriptors :param queue: Queue to use :param clients: SSHClients to choose from :param result_handler: Logging object :return: None :rtype: NoneType """ while not queue.empty(): cluster_name, _node_config, _results = queue.get() errors = _results['errors'] output = _results['result'] identifier = 'Arakoon cluster {0} on node {1}'.format( cluster_name, _node_config.ip) result_handler.info( 'Retrieving collapse information for {0}'.format(identifier), add_to_result=False) try: _client = clients[_node_config.ip] tlog_dir = _node_config.tlog_dir path = os.path.join(tlog_dir, '*') try: # List the contents of the tlog directory and sort by oldest modification date # Example output: (timestamp, name, size (bits) # 01111 file.tlog 101 # 01112 file2.tlog 102 timestamp_files = _client.run( 'stat -c "%Y %n %s" {0}'.format(path), allow_insecure=True) output['avail_size'] = _client.run( "df {0} | tail -1 | awk '{{print $4}}'".format(path), allow_insecure=True) except Exception as _ex: errors.append(('stat_dir', _ex)) raise # Sort and separate the timestamp item files for split_entry in sorted( (timestamp_file.split() for timestamp_file in timestamp_files.splitlines()), key=lambda split: int(split[0])): file_name = split_entry[1] if file_name.endswith('tlx'): output['tlx'].append(split_entry) elif file_name.endswith('tlog'): output['tlog'].append(split_entry) elif file_name.rsplit('/')[-1].startswith('head.db'): output['headDB'].append(split_entry) except Exception as _ex: result_handler.warning( 'Could not retrieve the collapse information for {0} ({1})' .format(identifier, str(_ex)), add_to_result=False) finally: queue.task_done() @classmethod @cluster_check @expose_to_cli( MODULE, 'integrity-test', HealthCheckCLI.ADDON_TYPE, help= 'Verifies that all Arakoon clusters are still responding to client calls', short_help='Test if Arakoon clusters are still responding') def verify_integrity(cls, result_handler): """ Verifies that all Arakoon clusters are still responding to client calls :param result_handler: logging object :type result_handler: ovs.extensions.healthcheck.result.HCResults :return: None :rtype: NoneType """ arakoon_cluster = cls._get_arakoon_clusters(result_handler) result_handler.info('Starting Arakoon integrity test', add_to_result=False) for cluster_type, clusters in arakoon_cluster.iteritems(): result_handler.info( 'Testing the integry of {0} Arakoons'.format(cluster_type), add_to_result=False) for cluster in clusters: arakoon_client = cluster['client'] cluster_name = cluster['cluster_name'] try: arakoon_client.nop() result_handler.success( 'Arakoon {0} responded'.format(cluster_name), code=ErrorCodes.arakoon_responded) except (ArakoonNoMaster, ArakoonNoMasterResult) as ex: result_handler.failure( 'Arakoon {0} cannot find a master. (Message: {1})'. format(cluster_name, str(ex)), code=ErrorCodes.master_none) except Exception as ex: cls.logger.exception( 'Unhandled exception during the integrity check') result_handler.exception( 'Arakoon {0} threw an unhandled exception. 

    @classmethod
    @cluster_check
    @expose_to_cli(MODULE, 'file-descriptors-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify the number of File Descriptors on every Arakoon does not exceed the limit',
                   short_help='Test if #FD does not exceed the limit')
    @expose_to_cli.option('--fd-limit', '-l', type=int, default=30,
                          help='Threshold for the number of TCP connections for which to start logging warnings')
    def check_arakoon_fd(cls, result_handler, fd_limit=30, passed_connections=None):
        """
        Checks all currently open TCP file descriptors for all Arakoon clusters in the OVS cluster
        Will raise warnings when these reach a certain threshold
        :param result_handler: Logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param fd_limit: Threshold for the number of TCP connections for which to start logging warnings
        :type fd_limit: int
        :param passed_connections: TCP connection states that count towards the limit (defaults to ESTABLISHED and TIME_WAIT)
        :type passed_connections: list
        :return: None
        :rtype: NoneType
        """
        if passed_connections is None:
            passed_connections = ['ESTABLISHED', 'TIME_WAIT']
        warning_threshold = fd_limit * 80 / 100  # Integer division: with the default limit of 30 this is 24
        error_threshold = fd_limit * 95 / 100  # Integer division: with the default limit of 30 this is 28
        result_handler.info('Starting Arakoon file descriptor test', add_to_result=False)
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        start = time.time()
        arakoon_fd_results = cls._get_filedescriptors(result_handler, arakoon_clusters)
        result_handler.info('Retrieving all file descriptor information succeeded (duration: {0})'.format(time.time() - start),
                            add_to_result=False)
        for cluster_type, clusters in arakoon_fd_results.iteritems():
            result_handler.info('Checking the file descriptors of {0} Arakoons'.format(cluster_type), add_to_result=False)
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                fd_result = cluster['fd_result']
                # Sort the node configs by IP for deterministic output
                fd_result = OrderedDict(sorted(fd_result.items(),
                                               key=lambda item: ExtensionsToolbox.advanced_sort(item[0].ip, separator='.')))
                for node, stats in fd_result.iteritems():
                    identifier_log = 'Arakoon cluster {0} on node {1}'.format(cluster_name, node.ip)
                    if len(stats['errors']) > 0:
                        # Determine where issues were found
                        for step, exception in stats['errors']:
                            if step == 'build_client':
                                try:
                                    # Re-raise the exception that was thrown while building the client
                                    raise exception
                                except TimeOutException:
                                    result_handler.warning('Connection to {0} has timed out'.format(identifier_log),
                                                           code=ErrorCodes.ssh_connection_time)
                                except (socket.error, UnableToConnectException):
                                    result_handler.failure('Connection to {0} could not be established'.format(identifier_log),
                                                           code=ErrorCodes.ssh_connection_fail)
                                except NotAuthenticatedException:
                                    result_handler.skip('Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'.format(identifier_log),
                                                        code=ErrorCodes.ssh_connection_authentication)
                                except Exception:
                                    message = 'Connection to {0} could not be established due to an unhandled exception.'.format(identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(message, code=ErrorCodes.unhandled_exception)
                            elif step == 'lsof':
                                try:
                                    raise exception
                                except Exception:
                                    message = 'Unable to list the file descriptors for {0}'.format(identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(message, ErrorCodes.unhandled_exception)
                        continue
                    fds = stats['result']['fds']
                    # Keep only the connections in the requested states (last column of the lsof output, wrapped in parentheses)
                    filtered_fds = [i for i in fds if i.split()[-1].strip('(').strip(')') in passed_connections]
                    if len(filtered_fds) >= warning_threshold:
                        if len(filtered_fds) >= error_threshold:
                            result_handler.warning('Number of TCP connections exceeded the 95% warning threshold for {0}, ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit),
                                                   code=ErrorCodes.arakoon_fd_95)
                        else:
                            result_handler.warning('Number of TCP connections exceeded the 80% warning threshold for {0}, ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit),
                                                   code=ErrorCodes.arakoon_fd_80)
                    else:
                        result_handler.success('Number of TCP connections for {0} is healthy ({1}/{2})'.format(identifier_log, len(filtered_fds), fd_limit),
                                               code=ErrorCodes.arakoon_fd_ok)

    @classmethod
    def _get_filedescriptors(cls, result_handler, arakoon_clusters, batch_size=10):
        """
        Retrieve the open file descriptors for all Arakoon clusters concurrently
        Note: this will mutate the given arakoon_clusters dict
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param arakoon_clusters: Information about all Arakoon clusters, sorted by type and given config
        :type arakoon_clusters: dict
        :param batch_size: Amount of workers to collect the Arakoon information.
        Every worker means a connection towards a different node
        :type batch_size: int
        :return: Dict with file descriptor contents for every node config
        :rtype: dict
        """
        queue = Queue.Queue()
        clients = {}
        # Prep work
        for cluster_type, clusters in arakoon_clusters.iteritems():
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                arakoon_config = cluster['config']
                cluster['fd_result'] = {}
                for node_config in arakoon_config.nodes:
                    result = {'errors': [], 'result': {'fds': []}}
                    # Store the result up front so errors raised while building the client are reported as well
                    cluster['fd_result'][node_config] = result
                    # Build SSHClients outside the threads so connection setup happens only once per node
                    try:
                        client = clients.get(node_config.ip)
                        if client is None:
                            client = SSHClient(node_config.ip, timeout=5)
                            clients[node_config.ip] = client
                    except Exception as ex:
                        result['errors'].append(('build_client', ex))
                        continue
                    queue.put((cluster_name, node_config, result))
        service_manager = ServiceFactory.get_manager()
        # Limit to one session for every node.
        # Every process will fork from this one, creating a new session instead of using the already existing channel.
        # There might be an issue if an SSH session takes too long, causing all workers to connect to that one node
        # and therefore hitting MaxSessions again (theory).
        for _ in xrange(min(len(clients.keys()), batch_size)):
            thread = Thread(target=cls._fd_worker, args=(queue, clients, result_handler, service_manager))
            thread.setDaemon(True)  # Setting threads as "daemon" allows the main program to exit eventually even if these don't finish correctly
            thread.start()
        # Wait for all results
        queue.join()
        return arakoon_clusters

    @staticmethod
    def _fd_worker(queue, clients, result_handler, service_manager):
        """
        Worker method to retrieve file descriptors
        :param queue: Queue to use
        :param clients: SSHClients to choose from
        :param result_handler: Logging object
        :param service_manager: Service manager instance
        :return: None
        :rtype: NoneType
        """
        while not queue.empty():
            cluster_name, _node_config, _results = queue.get(False)
            errors = _results['errors']
            output = _results['result']
            identifier = 'Arakoon cluster {0} on node {1}'.format(cluster_name, _node_config.ip)
            result_handler.info('Retrieving file descriptor information for {0}'.format(identifier), add_to_result=False)
            try:
                client = clients[_node_config.ip]
                try:
                    # Handle the config Arakoon: the 'cacc' cluster its service is registered under the name 'config'
                    cluster_name = cluster_name if cluster_name != 'cacc' else 'config'
                    service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name)
                    pid = service_manager.get_service_pid(service_name, client)
                    # lsof -i -a -p <pid>: list only the internet sockets opened by this process, skipping the header line
                    file_descriptors = client.run(['lsof', '-i', '-a', '-p', str(pid)]).splitlines()[1:]
                except Exception as _ex:
                    errors.append(('lsof', _ex))
                    raise
                output['fds'] = file_descriptors
            except Exception as _ex:
                result_handler.warning('Could not retrieve the file descriptor information for {0} ({1})'.format(identifier, str(_ex)),
                                       add_to_result=False)
            finally:
                queue.task_done()
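
# The pair of methods above (_get_filedescriptors/_fd_worker, and likewise the
# collapse variant) follows one pattern: queue all work up front, build one SSH
# client per node outside the threads, then drain the queue with a bounded pool
# of daemon threads. A minimal, self-contained sketch of that pattern, standard
# library only; 'hosts' and 'collect' are hypothetical placeholders for
# illustration and are not part of this module:
def _bounded_worker_pool_example(hosts, collect, batch_size=10):
    import Queue
    from threading import Thread

    queue = Queue.Queue()
    results = dict((host, {'errors': [], 'result': None}) for host in hosts)
    for host in hosts:
        queue.put(host)

    def worker():
        while not queue.empty():
            try:
                # Non-blocking get: the queue may drain between empty() and get()
                host = queue.get(False)
            except Queue.Empty:
                return
            try:
                results[host]['result'] = collect(host)
            except Exception as ex:
                results[host]['errors'].append(ex)
            finally:
                queue.task_done()  # Every fetched item must be marked done for join() to return

    # Never start more threads than there is work (or than the batch size allows)
    for _ in xrange(min(len(hosts), batch_size)):
        thread = Thread(target=worker)
        thread.setDaemon(True)  # Daemon threads cannot block interpreter exit
        thread.start()
    queue.join()  # Block until every queued item was marked done
    return results

# Example: _bounded_worker_pool_example(['10.0.0.1', '10.0.0.2'], some_callable)
# returns {'10.0.0.1': {'errors': [...], 'result': ...}, ...}, mirroring the
# per-node result dicts built by the methods above.
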
class CLI(click.Group):
    """
    Click CLI which dynamically loads all possible commands
    Implementations require an entry point
    An entry point is defined as:
    @click.group(cls=CLI)
    def entry_point():
        pass
    if __name__ == '__main__':
        entry_point()
    """
    ADDON_TYPE = 'ovs'  # Type of addon the CLI is
    CACHE_KEY = 'ovs_discover_method'
    CACHE_EXPIRE_HOURS = 2  # Number of hours before the cache expires
    GROUP_MODULE_CLASS = click.Group
    CMD_FOLDER = os.path.join(os.path.dirname(__file__))  # Folder to query for commands

    logger = Logger("ovs_clirunner")
    _volatile_client = VolatileFactory.get_client()
    _discovery_cache = {}

    def __init__(self, *args, **kwargs):
        # type: (*any, **any) -> None
        super(CLI, self).__init__(*args, **kwargs)

    def list_commands(self, ctx):
        # type: (click.Context) -> list[str]
        """
        Lists all possible commands found within the directory of this file
        All modules are retrieved
        :param ctx: Passed context
        :return: List of command names (one per discovered module)
        """
        _ = ctx
        sub_commands = self._discover_methods().keys()  # Returns all underlying modules
        sub_commands.sort()
        return sub_commands

    def get_command(self, ctx, name):
        # type: (click.Context, str) -> callable
        """
        Retrieves a command to execute
        :param ctx: Passed context
        :param name: Name of the command
        :return: Function pointer to the command or None when no import could happen
        :rtype: callable
        """
        cmd = self.commands.get(name)
        if cmd:
            return cmd
        # More extensive - build the command and register it
        discovery_data = self._discover_methods()
        if name in discovery_data.keys():
            # The currently passed name is a module. Wrap it up in a group and add all commands under it dynamically
            module_commands = {}
            for function_name, function_data in discovery_data[name].iteritems():
                # Register the decorated function as callback to click
                # Prefix the module name to avoid name collisions with other modules; collisions might lead to unexpected results
                mod = imp.load_source('ovs_cli_{0}'.format(function_data['module_name']), function_data['location'])
                cl = getattr(mod, function_data['class'])()
                module_commands[function_name] = click.Command(function_name, callback=getattr(cl, function_data['function']))
            ret = self.GROUP_MODULE_CLASS(name, module_commands)
            self.add_command(ret)
            return ret

    @classmethod
    def _discover_methods(cls):
        # type: () -> dict
        """
        Discovers all methods with the expose_to_cli decorator
        :return: dict that contains the required info based on module_name and method_name
        :rtype: dict
        """
        version_id = 1
        start_path = cls.CMD_FOLDER
        addon_type = cls.ADDON_TYPE

        def discover():
            """
            Build a dict listing all discovered methods with @expose_to_cli
            :return: Dict with all discovered items
            :rtype: dict
            """
            # Build cache
            found_items = {'expires': time.time() + cls.CACHE_EXPIRE_HOURS * 60 ** 2}
            path = start_path
            for root, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    if not (filename.endswith('.py') and filename != '__init__.py'):
                        continue
                    file_path = os.path.join(root, filename)
                    module_name = 'ovs_cli_{0}'.format(filename.replace('.py', ''))
                    # Import the file, making the module name relative to the start path to avoid name collisions.
                    # Without it, the module contents would be merged (e.g. alba.py and testing/alba.py would be merged, overriding the paths).
                    # imp.load_source is different from importing: using the relative-joined name is therefore safe
                    try:
                        mod = imp.load_source(module_name, file_path)
                    except ImportError:
                        cls.logger.exception('Unable to import module at {0}'.format(file_path))
                        continue
                    for member_name, member_value in inspect.getmembers(mod):
                        if not (inspect.isclass(member_value) and member_value.__module__ == module_name and 'object' in [base.__name__ for base in member_value.__bases__]):
                            continue
                        for submember_name, submember_value in inspect.getmembers(member_value):
                            if not hasattr(submember_value, expose_to_cli.attribute):
                                continue
                            exposed_data = getattr(submember_value, expose_to_cli.attribute)
                            method_module_name = exposed_data['module_name']
                            method_name = exposed_data['method_name']
                            method_addon_type = exposed_data['addon_type'] if 'addon_type' in exposed_data else None
                            if method_module_name not in found_items:
                                found_items[method_module_name] = {}
                            # Only register the method when the addon type matches
                            if method_addon_type == addon_type:
                                function_metadata = {'function': submember_value.__name__,
                                                     'class': member_value.__name__,
                                                     'location': file_path,
                                                     'version': version_id}
                                function_metadata.update(exposed_data)  # Add all exposed data for further re-use
                                found_items[method_module_name][method_name] = function_metadata
            return found_items

        def get_and_cache():
            found_items = cls._volatile_client.get(cls.CACHE_KEY)
            if found_items:
                cls._discovery_cache.update(found_items)
            return found_items

        try:
            exposed_methods = copy.deepcopy(cls._discovery_cache) or get_and_cache()
            if exposed_methods and exposed_methods['expires'] > time.time():
                # Able to use the cache, it has not expired yet
                del exposed_methods['expires']
                return exposed_methods
        except Exception:
            cls.logger.exception('Unable to retrieve the exposed resources from cache')
        exposed_methods = discover()
        try:
            # Cache a copy: 'expires' is deleted from the returned dict below,
            # and that deletion must not propagate into the cached version
            cls._discovery_cache = copy.deepcopy(exposed_methods)
            cls._volatile_client.set(cls.CACHE_KEY, exposed_methods)
        except Exception:
            cls.logger.exception('Unable to cache the exposed resources')
        del exposed_methods['expires']
        return exposed_methods

    @classmethod
    def clear_cache(cls):
        # type: () -> None
        """
        Clear all cache related to discovering methods
        :return: None
        :rtype: NoneType
        """
        cls._discovery_cache = {}  # Drop the in-process cache as well, not just the volatile one
        cls._volatile_client.delete(cls.CACHE_KEY)
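
# _discover_methods above layers an in-process dict over a volatile store and
# piggybacks the expiry timestamp inside the cached payload itself. A minimal
# sketch of that expiry pattern in isolation, assuming nothing beyond the
# standard library; 'cache' and 'compute' are hypothetical stand-ins for
# _discovery_cache and discover():
def _expiring_cache_example(cache, compute, expire_seconds=2 * 60 ** 2):
    """
    Return the cached data while it has not expired yet; recompute and re-cache otherwise.
    'cache' is a plain dict shared between calls.
    """
    import copy
    import time

    cached = copy.deepcopy(cache)
    if cached and cached.get('expires', 0) > time.time():
        del cached['expires']  # Only the deep copy is mutated, the shared cache stays intact
        return cached
    fresh = compute()
    fresh['expires'] = time.time() + expire_seconds
    cache.clear()
    cache.update(copy.deepcopy(fresh))  # Store a copy so the deletion below cannot leak into the cache
    del fresh['expires']
    return fresh

# Example: _expiring_cache_example(shared_cache, lambda: {'commands': ['dtl-test']})
# computes once, then serves the cached copy for the next two hours.
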
class IPMIHealthCheck(object):
    """
    Healthcheck file to execute multiple IPMI tests
    """
    MODULE = 'ipmi'
    logger = Logger("healthcheck-healthcheck_ipmi")

    @classmethod
    @expose_to_cli(MODULE, 'ipmi-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that AlbaNodes can be controlled through IPMI',
                   short_help='Test if AlbaNodes their IPMI info is correct')
    def ipmi_check(cls, result_handler):
        """
        Verify that every AlbaNode with an IPMI configuration can be reached and queried through IPMI
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        for albanode in AlbaNodeList.get_albanodes():
            node_id = albanode.node_id
            ipmi_config_loc = '/ovs/alba/asdnodes/{0}/config/ipmi'.format(node_id)
            if not Configuration.exists(ipmi_config_loc):
                result_handler.skip('No IPMI info found on AlbaNode with ID {0}'.format(node_id))
                continue
            ipmi_config = Configuration.get(ipmi_config_loc)
            ip = ipmi_config.get('ip')
            try:
                controller = IPMIController(ip=ip,
                                            username=ipmi_config.get('username'),
                                            password=ipmi_config.get('password'),
                                            client=SSHClient(System.get_my_storagerouter()))
            except Exception:
                result_handler.failure('IPMI settings are not valid for AlbaNode with ID {0}'.format(node_id))
                continue
            try:
                status = controller.status_node().get(ip)
                if status == IPMIController.IPMI_POWER_ON:
                    result_handler.success('IPMI AlbaNode with ID {0} status is POWER ON'.format(node_id))
                elif status == IPMIController.IPMI_POWER_OFF:
                    result_handler.warning('IPMI AlbaNode with ID {0} status is POWER OFF'.format(node_id))
                else:
                    result_handler.warning('IPMI AlbaNode with ID {0} returned an unrecognized status: {1}'.format(node_id, status))
            except IPMITimeOutException as ex:
                result_handler.failure("IPMI AlbaNode with ID {0} timed out: '{1}'".format(node_id, ex))
            except IPMICallException as ex:
                result_handler.failure("IPMI AlbaNode with ID {0} call failed: '{1}'".format(node_id, ex))
            except Exception:
                msg = 'Could not retrieve info through IPMI for AlbaNode with ID {0}'.format(node_id)
                cls.logger.exception(msg)
                result_handler.exception(msg)
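
# ipmi_check above assumes each AlbaNode's IPMI config provides 'ip', 'username'
# and 'password'; that key set is inferred from the IPMIController call above,
# not from a published schema. A small validation sketch under that assumption,
# which could flag unusable configs before attempting a connection:
def _validate_ipmi_config_example(ipmi_config):
    """Return the list of missing or empty IPMI settings; an empty list means the config looks usable."""
    required = ('ip', 'username', 'password')
    return [key for key in required if not ipmi_config.get(key)]

# Example: _validate_ipmi_config_example({'ip': '10.0.0.1', 'username': 'admin', 'password': ''})
# returns ['password'], which ipmi_check would surface as invalid IPMI settings.
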