Example #1
class HealthCheckShared(object):
    """
    Constants for the HealthcheckCLI
    """

    ADDON_TYPE = 'healthcheck'
    CACHE_KEY = 'ovs_healthcheck_discover_method'

    logger = Logger("healthcheck-ovs_clirunner")
    CMD_FOLDER = os.path.join(os.path.dirname(__file__), 'suites')  # Folder to query for commands

    CONTEXT_SETTINGS_KEY = '/ovs/healthcheck/default_arguments'
    _context_settings = {}  # Cache

    @staticmethod
    def get_healthcheck_results(result_handler):
        # type: (HCResults) -> dict
        """
        Output the Healthcheck results
        :param result_handler: HCResults instance
        :type result_handler: HCResults
        :return: dict with information
        :rtype: dict
        """
        recap_executer = 'Health Check'
        result = result_handler.get_results()
        result_handler.info("Recap of {0}!".format(recap_executer))
        result_handler.info("======================")
        recount = []  # Order matters
        for severity in ['SUCCESS', 'FAILED', 'SKIPPED', 'WARNING', 'EXCEPTION']:
            recount.append((severity, result_handler.counter[severity]))
        result_handler.info(' '.join('{0}={1}'.format(s, v) for s, v in recount))
        # returns dict with minimal and detailed information
        return {'result': result, 'recap': dict(recount)}

    @classmethod
    def get_default_arguments(cls):
        if not cls._context_settings:
            cls._context_settings = Configuration.get(cls.CONTEXT_SETTINGS_KEY, default={})
        return cls._context_settings
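
A small usage sketch of the class-level cache (assuming a reachable Configuration store; all names come from the class above). Note that the truthiness check in get_default_arguments means an empty settings dict is never cached and would be fetched again on every call.

# Usage sketch; assumes the OVS Configuration store is reachable.
defaults = HealthCheckShared.get_default_arguments()        # first call queries Configuration
defaults_again = HealthCheckShared.get_default_arguments()  # served from the _context_settings cache
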
class VolumedriverHealthCheck(object):
    """
    A healthcheck for the volumedriver components
    """
    MODULE = 'volumedriver'
    LOCAL_ID = System.get_my_machine_id()
    LOCAL_SR = System.get_my_storagerouter()
    VDISK_CHECK_SIZE = 1024 ** 3  # 1GB in bytes
    VDISK_HALTED_STATES = DataObject.enumerator('Halted_status', ['HALTED', 'FENCED'])
    VDISK_TIMEOUT_BEFORE_DELETE = 0.5
    # Only used to check the status of a fenced volume. This map should not be used to derive the status of a volume that is neither halted nor fenced
    FENCED_HALTED_STATUS_MAP = {'max_redirect': {'status': VDisk.STATUSES.NON_RUNNING,
                                                 'severity': 'failure',
                                                 'halted': ('These volumes are not running: {0}', ErrorCodes.volume_max_redirect),
                                                 'fenced': ('These volumes are fenced but not running on another node: {0}', ErrorCodes.volume_fenced_max_redirect)},
                                'halted': {'status': VDisk.STATUSES.HALTED,
                                           'severity': 'failure',
                                           'halted': ('These volumes are halted: {0}', ErrorCodes.volume_halted),
                                           'fenced': ('These volumes are fenced but halted on another node: {0}', ErrorCodes.volume_fenced_halted)},
                                'connection_fail': {'status': 'UNKNOWN',
                                                    'severity': 'failure',
                                                    'halted': ('These volumes experienced a connectivity/timeout problem: {0}', ErrorCodes.voldrv_connection_problem),
                                                    'fenced': ('These volumes are fenced but experienced a connectivity/timeout problem on another node: {0}', ErrorCodes.voldrv_connection_problem)},
                                'ok': {'status': VDisk.STATUSES.RUNNING,
                                       'severity': 'failure',
                                       'halted': ('These volumes are running: {0}', ErrorCodes.volume_ok),
                                       'fenced': ('These volumes are fenced but running on another node: {0}', ErrorCodes.volume_fenced_ok)},
                                'not_found': {'status': 'NOT_FOUND',
                                              'severity': 'warning',
                                              'halted': ('These volumes could not be queried for information: {0}', ErrorCodes.volume_not_found),
                                              'fenced': ('These volumes are fenced but could not be queried for information on another node: {0}', ErrorCodes.volume_fenced_not_found)}}

    logger = Logger('healthcheck-ovs_volumedriver')

    @staticmethod
    @expose_to_cli(MODULE, 'dtl-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the DTL of all vDisks is running properly',
                   short_help='Test if DTL is properly running')
    def check_dtl(result_handler):
        """
        Checks the dtl for all vdisks on the local node
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        # Fetch vdisks hosted on this machine
        local_sr = System.get_my_storagerouter()
        if len(local_sr.vdisks_guids) == 0:
            return result_handler.skip('No VDisks present in cluster.')
        for vdisk_guid in local_sr.vdisks_guids:
            vdisk = VDisk(vdisk_guid)
            vdisk.invalidate_dynamics(['dtl_status', 'info'])
            if vdisk.dtl_status in ('ok_standalone', 'disabled'):
                result_handler.success("VDisk {0}'s DTL is disabled".format(vdisk.name), code=ErrorCodes.volume_dtl_standalone)
            elif vdisk.dtl_status == 'ok_sync':
                result_handler.success("VDisk {0}'s DTL is enabled and running.".format(vdisk.name), code=ErrorCodes.volume_dtl_ok)
            elif vdisk.dtl_status == 'degraded':
                result_handler.warning("VDisk {0}'s DTL is degraded.".format(vdisk.name), code=ErrorCodes.volume_dtl_degraded)
            elif vdisk.dtl_status == 'checkup_required':
                result_handler.warning("VDisk {0}'s DTL should be configured.".format(vdisk.name), code=ErrorCodes.volume_dtl_checkup_required)
            elif vdisk.dtl_status == 'catch_up':
                result_handler.warning("VDisk {0}'s DTL is enabled but still syncing.".format(vdisk.name), code=ErrorCodes.volume_dtl_catch_up)
            else:
                result_handler.warning("VDisk {0}'s DTL has an unknown status: {1}.".format(vdisk.name, vdisk.dtl_status), code=ErrorCodes.volume_dtl_unknown)
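
The elif chain above pairs each DTL status with a severity and an error code. A table-driven variant keeps those pairings in one place; this is a sketch under the same assumptions as check_dtl (an HCResults handler and the ErrorCodes constants from the surrounding codebase), not part of the original class.

# Sketch: table-driven equivalent of the status chain in check_dtl.
DTL_STATUS_MAP = {'ok_standalone': ('success', "VDisk {0}'s DTL is disabled", ErrorCodes.volume_dtl_standalone),
                  'disabled': ('success', "VDisk {0}'s DTL is disabled", ErrorCodes.volume_dtl_standalone),
                  'ok_sync': ('success', "VDisk {0}'s DTL is enabled and running.", ErrorCodes.volume_dtl_ok),
                  'degraded': ('warning', "VDisk {0}'s DTL is degraded.", ErrorCodes.volume_dtl_degraded),
                  'checkup_required': ('warning', "VDisk {0}'s DTL should be configured.", ErrorCodes.volume_dtl_checkup_required),
                  'catch_up': ('warning', "VDisk {0}'s DTL is enabled but still syncing.", ErrorCodes.volume_dtl_catch_up)}

def report_dtl_status(result_handler, vdisk):
    # Dispatch on severity exactly like the getattr pattern used elsewhere in this class
    if vdisk.dtl_status in DTL_STATUS_MAP:
        severity, message, code = DTL_STATUS_MAP[vdisk.dtl_status]
        getattr(result_handler, severity)(message.format(vdisk.name), code=code)
    else:
        result_handler.warning("VDisk {0}'s DTL has an unknown status: {1}.".format(vdisk.name, vdisk.dtl_status),
                               code=ErrorCodes.volume_dtl_unknown)
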

    @staticmethod
    @timeout_decorator.timeout(30)
    def _check_volumedriver(vdisk_name, storagedriver_guid, logger, vdisk_size=VDISK_CHECK_SIZE):
        """
        Checks if the volumedriver can create a new vdisk
        :param vdisk_name: name of a vdisk (e.g. test.raw)
        :type vdisk_name: str
        :param storagedriver_guid: guid of a storagedriver
        :type storagedriver_guid: str
        :param vdisk_size: size of the volume in bytes (e.g. 10737418240 is 10GB in bytes)
        :type vdisk_size: int
        :param logger: logger instance
        :type logger: ovs.extensions.healthcheck.result.HCResults
        :return: True if the creation succeeds
        :rtype: bool
        """
        try:
            VDiskController.create_new(vdisk_name, vdisk_size, storagedriver_guid)
        except FileExistsException:
            # can be ignored until fixed in framework
            # https://github.com/openvstorage/framework/issues/1247
            return True
        except Exception as ex:
            logger.failure('Creation of the vdisk failed. Got {0}'.format(str(ex)))
            return False
        return True
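
How this helper behaves under the timeout decorator: a call that exceeds the 30 second budget raises TimeoutError (as imported in this module; imports are elided from the excerpt), which check_volumedrivers below turns into a warning. A hypothetical call with placeholder values:

# Hypothetical call; the vdisk name and storagedriver guid are placeholders.
try:
    created = VolumedriverHealthCheck._check_volumedriver('ovs-healthcheck-test-node1.raw',
                                                          'some-storagedriver-guid',
                                                          result_handler)  # assumes an HCResults instance
except TimeoutError:
    created = False  # the create call took longer than the 30s decorator budget
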

    @staticmethod
    @timeout_decorator.timeout(30)
    def _check_volumedriver_remove(vpool_name, vdisk_name, present=True):
        """
        Remove a vdisk from a vpool
        :param vdisk_name: name of a vdisk (e.g. test.raw)
        :type vdisk_name: str
        :param vpool_name: name of a vpool
        :type vpool_name: str
        :param present: should the disk be present?
        :type present: bool
        :return: True if disk is not present anymore
        :rtype: bool
        """
        try:
            vdisk = VDiskHelper.get_vdisk_by_name(vdisk_name=vdisk_name, vpool_name=vpool_name)
            VDiskController.delete(vdisk.guid)
            return True
        except VDiskNotFoundError:
            # not found, if it should be present, re-raise the exception
            if present:
                raise
            else:
                return True

    @staticmethod
    # @expose_to_cli(MODULE, 'volumedrivers-test', HealthCheckCLI.ADDON_TYPE,
    #                help='Verify that the Volumedrivers are responding to events',
    #                short_help='Test if Volumedrivers are responding to events')
    def check_volumedrivers(result_handler):
        """
        Checks if the VOLUMEDRIVERS work on a local machine (compatible with multiple vPools)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking volumedrivers.', add_to_result=False)
        vpools = VPoolList.get_vpools()
        if len(vpools) == 0:
            result_handler.skip('No vPools found!')
            return
        for vp in vpools:
            name = 'ovs-healthcheck-test-{0}.raw'.format(VolumedriverHealthCheck.LOCAL_ID)
            if vp.guid not in VolumedriverHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
                continue
            try:
                # Find the storagedriver of this vPool on the local node
                storagedriver_guid = next((storagedriver.guid for storagedriver in vp.storagedrivers
                                           if storagedriver.storagedriver_id == vp.name +
                                           VolumedriverHealthCheck.LOCAL_ID))
                # create a new one
                volume = VolumedriverHealthCheck._check_volumedriver(name, storagedriver_guid, result_handler)

                if volume is True:
                    # delete the recently created
                    try:
                        VolumedriverHealthCheck._check_volumedriver_remove(vpool_name=vp.name, vdisk_name=name)
                    except Exception as ex:
                        raise RuntimeError('Could not delete the created volume. Got {0}'.format(str(ex)))
                    # Working at this point
                    result_handler.success('Volumedriver of vPool {0} is working fine!'.format(vp.name))
                else:
                    # not working
                    result_handler.failure('Something went wrong during vdisk creation on vpool {0}.'.format(vp.name))

            except TimeoutError:
                # timeout occurred, action took too long
                result_handler.warning('Volumedriver of vPool {0} seems to timeout.'.format(vp.name))
            except IOError as ex:
                # can be input/output error by volumedriver
                result_handler.failure('Volumedriver of vPool {0} seems to have IO problems. Got `{1}` while executing.'.format(vp.name, ex.message))
            except RuntimeError as ex:
                result_handler.failure('Volumedriver of vPool {0} seems to have problems. Got `{1}` while executing.'.format(vp.name, ex))
            except VDiskNotFoundError:
                result_handler.warning('Volume on vPool {0} was not found, please retry.'.format(vp.name))
            except Exception as ex:
                result_handler.failure('Uncaught exception for Volumedriver of vPool {0}. Got {1} while executing.'.format(vp.name, ex))
            finally:
                # Attempt to delete the created vdisk
                try:
                    VolumedriverHealthCheck._check_volumedriver_remove(vpool_name=vp.name, vdisk_name=name, present=False)
                except Exception:  # Cleanup is best-effort
                    pass

    @classmethod
    def _is_volumedriver_timeout(cls, exception):
        """
        Validates whether a given exception is a timeout exception (RuntimeError prior to NodeNotReachableException in volumedriver 6.17)
        :param exception: Exception object to check
        :return: True if it is a timeout or False if it's not
        :rtype: bool
        """
        return isinstance(exception, ClusterNotReachableException) or (isinstance(exception, RuntimeError) and 'failed to send XMLRPC request' in str(exception))
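
Classification examples for this helper (a sketch; any ClusterNotReachableException instance is also treated as a timeout, its constructor arguments are not shown in this excerpt):

VolumedriverHealthCheck._is_volumedriver_timeout(RuntimeError('failed to send XMLRPC request: timeout'))  # True
VolumedriverHealthCheck._is_volumedriver_timeout(RuntimeError('something else entirely'))                  # False
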

    @classmethod
    @expose_to_cli(MODULE, 'halted-volumes-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that there are no halted/fenced volumes within the cluster',
                   short_help='Test if there are no halted/fenced volumes')
    def check_for_halted_volumes(cls, result_handler):
        """
        Checks for halted volumes on a single or multiple vPools
        This will only check the volume states on the current node. If any other volumedriver were down,
        only the HA'd volumes would show up, as they could appear halted here (to be verified by the volumedriver team)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        vpools = VPoolList.get_vpools()
        local_sr = System.get_my_storagerouter()

        if len(vpools) == 0:
            result_handler.skip('No vPools found!', code=ErrorCodes.vpools_none)
            return
        for vpool in vpools:
            log_start = 'Halted volumes test vPool {0}'.format(vpool.name)
            if vpool.guid not in local_sr.vpools_guids:
                result_handler.skip('{0} - Skipping vPool {1} because it is not living here.'.format(log_start, vpool.name),
                                    code=ErrorCodes.vpool_not_local, add_to_result=False)
                continue

            result_handler.info('{0} - Retrieving all information'.format(log_start), add_to_result=False)
            storagedriver = None
            for std in vpool.storagedrivers:
                if std.storagerouter_guid == local_sr.guid:
                    storagedriver = std
                    break

            if storagedriver is None:
                result_handler.failure('{0} - Could not associate a StorageDriver with this StorageRouter'.format(log_start),
                                       code=ErrorCodes.std_no_str)
                continue

            volume_fenced_states = dict((key, []) for key in cls.FENCED_HALTED_STATUS_MAP.keys())
            volume_lists = {cls.VDISK_HALTED_STATES.HALTED: [], cls.VDISK_HALTED_STATES.FENCED: []}
            volume_states = {cls.VDISK_HALTED_STATES.HALTED: {cls.VDISK_HALTED_STATES.HALTED: volume_lists[cls.VDISK_HALTED_STATES.HALTED]},
                             cls.VDISK_HALTED_STATES.FENCED: volume_fenced_states}  # Less loops to write for outputting
            result_handler.info('{0} - Scanning for halted volumes'.format(log_start), add_to_result=False)
            try:
                voldrv_client = vpool.storagedriver_client
                objectregistry_client = vpool.objectregistry_client
            except Exception:
                cls.logger.exception('{0} - Unable to instantiate the required clients'.format(log_start))
                result_handler.exception('{0} - Unable to load the Volumedriver clients'.format(log_start),
                                         code=ErrorCodes.voldr_unknown_problem)
                continue
            try:
                # Listing all halted volumes with the volumedriver client as it detects stolen volumes too (fenced instances)
                volumes = voldrv_client.list_halted_volumes(str(storagedriver.storagedriver_id))
            except Exception as ex:
                cls.logger.exception('{0} - Exception occurred when listing volumes'.format(log_start))
                if cls._is_volumedriver_timeout(ex) is False:
                    # Unhandled exception at this point
                    result_handler.exception('{0} - Unable to list the Volumes due to an unidentified problem. Please check the logging'.format(log_start),
                                             code=ErrorCodes.voldr_unknown_problem)
                else:
                    result_handler.failure('{0} - Could not list the volumes due to a connection problem.'.format(log_start),
                                           code=ErrorCodes.voldrv_connection_problem)
                continue
            # Retrieve the parent of each volume. If its node id does not match the storagedriver we queried,
            # the volume is fenced. The object registry client consults Arakoon.
            # Capture any possible exception to provide a clearer view of what went wrong
            for volume in volumes:
                try:
                    registry_entry = objectregistry_client.find(volume)
                    if registry_entry.node_id() == storagedriver.storagedriver_id:
                        volume_lists[cls.VDISK_HALTED_STATES.HALTED].append(volume)
                    else:
                        # Fenced
                        volume_lists[cls.VDISK_HALTED_STATES.FENCED].append(volume)
                except Exception:
                    msg = '{0} - Unable to consult the object registry client for volume \'{1}\''.format(log_start, volume)
                    cls.logger.exception(msg)
                    result_handler.exception(msg, code=ErrorCodes.voldr_unknown_problem)
            # Include fenced - OTHER state combo
            for volume in volume_lists[cls.VDISK_HALTED_STATES.FENCED]:
                try:
                    _, state = cls._get_volume_issue(voldrv_client, volume, log_start)
                    volume_fenced_states[state].append(volume)
                except Exception:
                    # Only unhandled at this point
                    result_handler.exception('{0} - Unable to retrieve the volume info for volume {1} due to an unidentified problem. Please check the logging'.format(log_start, volume),
                                             code=ErrorCodes.voldr_unknown_problem)
            for halted_state, volume_state_info in volume_states.iteritems():
                for state, volumes in volume_state_info.iteritems():
                    if len(volumes) == 0:
                        continue  # Skip OK/empty lists
                    map_value = cls.FENCED_HALTED_STATUS_MAP[state.lower()]
                    log_func = getattr(result_handler, map_value['severity'])
                    message, code = map_value[halted_state.lower()]
                    log_func('{0} - {1}'.format(log_start, message.format(', '.join(volumes))), code=code)
            # Call success in case nothing is wrong
            if all(len(l) == 0 for l in volume_lists.values()):
                result_handler.success('{0} - No volumes found in halted/fenced state'.format(log_start))

    @classmethod
    def _get_volume_issue(cls, voldrv_client, volume_id, log_start):
        """
        Maps all possible exceptions to a state. These states can be mapped to a status using the FENCED_HALTED_STATUS_MAP
        because the volumedriver does not return a state itself
        :param voldrv_client: Storagedriver client
        :param volume_id: Id of the volume
        :raises: the unhandled exception if one occurs (we try to identify all problems, but one could slip past us)
        :return: The volume_id and state
        :rtype: tuple(str, str)
        """
        state = 'ok'
        try:
            # Check if the information can be retrieved about the volume
            vol_info = voldrv_client.info_volume(volume_id, req_timeout_secs=5)
            if vol_info.halted is True:
                state = 'halted'
        except Exception as ex:
            cls.logger.exception('{0} - Exception occurred when fetching the info for volume \'{1}\''.format(log_start, volume_id))
            if isinstance(ex, ObjectNotFoundException):
                # Ignore invalid ovsdb entries as model consistency will handle them.
                state = 'not_found'
            elif isinstance(ex, MaxRedirectsExceededException):
                # This means the volume is not halted but detached or unreachable for the Volumedriver
                state = 'max_redirect'
            # @todo replace RuntimeError with NodeNotReachableException
            elif any(isinstance(ex, exception) for exception in [ClusterNotReachableException, RuntimeError]):
                if cls._is_volumedriver_timeout(ex) is False:
                    # Unhandled exception at this point
                    raise
                # Timeout / connection problems
                state = 'connection_fail'
            else:
                # Something to be looked at
                raise
        return volume_id, state
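
The states returned here ('ok', 'halted', 'max_redirect', 'connection_fail', 'not_found') are exactly the keys of FENCED_HALTED_STATUS_MAP, which check_for_halted_volumes uses for reporting. A sketch of that lookup for a fenced volume (result_handler is an assumed HCResults instance; the volume id is a placeholder):

volume_id, state = 'volume-guid-example', 'max_redirect'
map_value = VolumedriverHealthCheck.FENCED_HALTED_STATUS_MAP[state]
message, code = map_value['fenced']
getattr(result_handler, map_value['severity'])(message.format(volume_id), code=code)
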

    @staticmethod
    @timeout_decorator.timeout(5)
    def _check_filedriver(vp_name, test_name):
        """
        Async method to check if a FILEDRIVER `touch` works on a vpool
        Always check whether the file exists after calling this method
        :param vp_name: name of the vpool
        :type vp_name: str
        :param test_name: name of the test file (e.g. `ovs-healthcheck-LOCAL_ID`)
        :type test_name: str
        :return: output of the `touch` command (raises CalledProcessError on failure)
        :rtype: str
        """
        return subprocess.check_output('touch /mnt/{0}/{1}.xml'.format(vp_name, test_name), stderr=subprocess.STDOUT, shell=True)

    @staticmethod
    @timeout_decorator.timeout(5)
    def _check_filedriver_remove(vp_name):
        """
        Async method to check if a FILEDRIVER `remove` works on a vpool
        Always check whether the file exists after calling this method
        :param vp_name: name of the vpool
        :type vp_name: str
        :return: True if succeeded, False if failed
        :rtype: bool
        """
        subprocess.check_output('rm -f /mnt/{0}/ovs-healthcheck-test-*.xml'.format(vp_name), stderr=subprocess.STDOUT, shell=True)
        # Note: os.path.exists does not expand wildcards, so use glob instead (requires `import glob`)
        return len(glob.glob('/mnt/{0}/ovs-healthcheck-test-*.xml'.format(vp_name))) == 0

    @staticmethod
    # @expose_to_cli(MODULE, 'filedrivers-test', HealthCheckCLI.ADDON_TYPE,
    #                help='Verify that all Volumedrivers are accessible through FUSE',
    #                short_help='Test if that the FUSE layer is responding')
    # @todo replace fuse test with edge test
    def check_filedrivers(result_handler):
        """
        Checks if the file drivers work on a local machine (compatible with multiple vPools)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        """
        result_handler.info('Checking file drivers.', add_to_result=False)
        vpools = VPoolList.get_vpools()
        # perform tests
        if len(vpools) == 0:
            result_handler.skip('No vPools found!')
            return
        for vp in vpools:
            name = 'ovs-healthcheck-test-{0}'.format(VolumedriverHealthCheck.LOCAL_ID)
            if vp.guid not in VolumedriverHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
                continue
            try:
                VolumedriverHealthCheck._check_filedriver(vp.name, name)
                if os.path.exists('/mnt/{0}/{1}.xml'.format(vp.name, name)):
                    # working
                    VolumedriverHealthCheck._check_filedriver_remove(vp.name)
                    result_handler.success('Filedriver for vPool {0} is working fine!'.format(vp.name))
                else:
                    # not working
                    result_handler.failure('Filedriver for vPool {0} seems to have problems!'.format(vp.name))
            except TimeoutError:
                # timeout occurred, action took too long
                result_handler.warning('Filedriver of vPool {0} seems to have `timeout` problems'.format(vp.name))
            except subprocess.CalledProcessError:
                # can be input/output error by filedriver
                result_handler.failure('Filedriver of vPool {0} seems to have `input/output` problems'.format(vp.name))

    @staticmethod
    @expose_to_cli(MODULE, 'volume-potential-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the Volumedrivers have enough VDisk potential left',
                   short_help='Test if the Volumedrivers can create enough VDisks')
    @expose_to_cli.option('--critical-vol-number', '-c', type=int, default=25, help='Minimum number of volumes left to create')
    def check_volume_potential(result_handler, critical_vol_number=25):
        """
        Checks all local storagedrivers of a volumedriver. Reports a success if enough volumes can still be created,
        a warning if the remaining volume potential drops below the threshold (critical_vol_number) and a failure if it reaches 0
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param critical_vol_number: Minimal number of volumes that can still be created before a warning is thrown
        :type critical_vol_number: int
        """
        result_handler.info('Checking volume potential of storagedrivers')

        if not isinstance(critical_vol_number, int) or critical_vol_number < 0:
            raise ValueError('Critical volume number should be a positive integer')

        for std in VolumedriverHealthCheck.LOCAL_SR.storagedrivers:
            try:
                std_config = StorageDriverConfiguration(std.vpool_guid, std.storagedriver_id)
                client = LocalStorageRouterClient(std_config.remote_path)
                vol_potential = client.volume_potential(str(std.storagedriver_id))
                if vol_potential >= critical_vol_number:
                    log_level = 'success'
                elif critical_vol_number > vol_potential > 0:
                    log_level = 'warning'
                else:
                    log_level = 'failure'
                getattr(result_handler, log_level)('Volume potential of local storage driver: {0}: {1} (potential at: {2})'.format(std.storagedriver_id, log_level.upper(), vol_potential))
            except RuntimeError:
                result_handler.exception('Unable to retrieve configuration for storagedriver {0}'.format(std.storagedriver_id))

    @staticmethod
    @expose_to_cli(MODULE, 'sco-cache-mountpoint-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that sco-cache mountpoints are up and running',
                   short_help='Test if sco-cache mountpoints are up and running')
    def check_sco_cache_mountpoints(result_handler):
        """
        Iterates over the StorageDrivers of the local StorageRouter and checks all their SCO cache mount points.
        Logs a warning if a mount point is in offline state
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        """
        result_handler.info('Checking sco cache mount points on all local storagedrivers')
        for std in VolumedriverHealthCheck.LOCAL_SR.storagedrivers:
            try:
                std_config = StorageDriverConfiguration(std.vpool_guid, std.storagedriver_id)
                client = LocalStorageRouterClient(std_config.remote_path)
                for std_info in client.sco_cache_mount_point_info(str(std.storagedriver_id)):
                    if std_info.offlined is True:
                        result_handler.warning('Mountpoint at location {0} of storagedriver {1} is in offline state'.format(std_info.path, std.storagedriver_id))
                    else:
                        result_handler.success('Mountpoint at location {0} of storagedriver {1} is in online state'.format(std_info.path, std.storagedriver_id))
            except RuntimeError:
                result_handler.exception('Unable to check sco cache mountpoint of storagedriver {0}'.format(std.storagedriver_id))
Example #3
    def run(command,
            config=None,
            named_params=None,
            extra_params=None,
            client=None,
            debug=False,
            to_json=True):
        """
        Executes a command on ALBA
        When --to-json is NOT passed:
            * An error occurs --> exitcode != 0
            * It worked --> exitcode == 0

        When --to-json is passed:
            * An error occurs during verification of the passed parameters --> exitcode != 0
            * An error occurs while executing the command --> exitcode == 0 (error in json output)
            * It worked --> exitcode == 0

        :param command: The command to execute, eg: 'list-namespaces'
        :type command: str
        :param config: The configuration location to be used, eg: 'arakoon://config/ovs/arakoon/ovsdb/config?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini'
        :type config: str
        :param named_params: Named parameters to pass to the command as --key=value pairs, eg: {'long-id': ','.join(asd_ids)}
        :type named_params: dict
        :param extra_params: Positional parameters to append to the command, eg: [name]
        :type extra_params: list
        :param client: A client on which to execute the command
        :type client: ovs.extensions.generic.sshclient.SSHClient
        :param debug: Log additional output
        :type debug: bool
        :param to_json: Parse the output as json
        :type to_json: bool
        :return: The output of the command
        :rtype: dict
        """
        if named_params is None:
            named_params = {}
        if extra_params is None:
            extra_params = []

        logger = Logger('healthcheck-alba_cli')
        if os.environ.get('RUNNING_UNITTESTS') == 'True':
            # For the unittest, all commands are passed to a mocked Alba
            from ovs.extensions.plugins.tests.alba_mockups import VirtualAlbaBackend
            named_params.update({'config': config})
            named_params.update({'extra_params': extra_params})
            return getattr(VirtualAlbaBackend,
                           command.replace('-', '_'))(**named_params)

        debug_log = []
        try:
            if to_json is True:
                extra_options = ["--to-json"]
            else:
                extra_options = []
            cmd_list = ['/usr/bin/alba', command] + extra_options
            if config is not None:
                cmd_list.append('--config={0}'.format(config))
            for key, value in named_params.iteritems():
                cmd_list.append('--{0}={1}'.format(key, value))
            cmd_list.extend(extra_params)
            cmd_string = ' '.join(cmd_list)
            debug_log.append('Command: {0}'.format(cmd_string))

            start = time.time()
            try:
                if client is None:
                    try:
                        if not hasattr(select, 'poll'):
                            import subprocess
                            subprocess._has_poll = False  # Damn 'monkey patching'
                        channel = Popen(cmd_list,
                                        stdout=PIPE,
                                        stderr=PIPE,
                                        universal_newlines=True)
                    except OSError as ose:
                        raise CalledProcessError(1, cmd_string, str(ose))
                    output, stderr = channel.communicate()
                    output = re.sub(r'[^\x00-\x7F]+', '', output)
                    stderr_debug = 'stderr: {0}'.format(stderr)
                    stdout_debug = 'stdout: {0}'.format(output)
                    if debug is True:
                        logger.debug(stderr_debug)
                    debug_log.append(stdout_debug)
                    exit_code = channel.returncode
                    if exit_code != 0:  # Raise same error as check_output
                        raise CalledProcessError(exit_code, cmd_string, output)
                else:
                    if debug is True:
                        output, stderr = client.run(cmd_list, debug=True)
                        debug_log.append('stderr: {0}'.format(stderr))
                    else:
                        output = client.run(cmd_list, debug=False).strip()
                    debug_log.append('stdout: {0}'.format(output))

                if to_json is True:
                    output = json.loads(output)
                else:
                    return output
                duration = time.time() - start
                if duration > 0.5:
                    logger.warning('AlbaCLI call {0} took {1}s'.format(
                        command, round(duration, 2)))
            except CalledProcessError as cpe:
                try:
                    output = json.loads(cpe.output)
                except Exception:
                    raise RuntimeError(
                        'Executing command {0} failed with output {1}'.format(
                            cmd_string, cpe.output))

            if output['success'] is True:
                return output['result']
            raise RuntimeError(output['error']['message'])

        except Exception as ex:
            logger.exception('Error: {0}'.format(ex))
            # In case there's an exception, we always log
            for debug_line in debug_log:
                logger.debug(debug_line)
            raise AlbaException(str(ex), command)
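
The enclosing class is not part of this excerpt; assuming run is exposed on an AlbaCLI-style wrapper, a call could look like the sketch below. With to_json left at its default, a failing command raises RuntimeError carrying the JSON error message, while to_json=False returns the raw stdout string.

# Hypothetical call; the AlbaCLI name is an assumption (the class is not shown above).
namespaces = AlbaCLI.run(command='list-namespaces',
                         config='arakoon://config/ovs/arakoon/ovsdb/config?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini')
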
Example #4
class CLIRunner(object):
    """
    Runs a method exposed by the expose_to_cli decorator. Serves as a base for all extensions using expose_to_cli
    """
    logger = Logger("healthcheck-ovs_clirunner")
    START_PATH = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir))
    CACHE_KEY = 'ovs_discover_method'
    _WILDCARD = 'X'

    def __init__(self):
        pass

    @classmethod
    def _get_methods(cls,
                     module_name=_WILDCARD,
                     method_name=_WILDCARD,
                     addon_type=None):
        """
        Gets method by the specified values
        :param module_name: module to which the method belong
        :type module_name: str
        :param method_name: name of the method
        :type method_name: str
        :param addon_type: type of the method, distinguishes different addons
        :type addon_type: str
        :return: list of all found functions
        :rtype: list[function]
        """
        result = []
        discovered_data = cls._discover_methods()
        module_names = discovered_data.keys() if module_name == cls._WILDCARD else [module_name]
        for module_name in module_names:
            if module_name not in discovered_data:
                raise ModuleNotRecognizedException()
            for function_data in discovered_data[module_name]:
                if addon_type != function_data['addon_type'] or (
                        method_name != cls._WILDCARD
                        and method_name != function_data['method_name']):
                    continue
                mod = imp.load_source(function_data['module_name'],
                                      function_data['location'])
                cl = getattr(mod, function_data['class'])()
                result.append(getattr(cl, function_data['function']))
                if method_name == function_data['method_name']:
                    break
        return result

    @classmethod
    def extract_arguments(cls, *args):
        """
        Extracts arguments from the CLI
        Always expects a module_name and a method_name (the wildcard is X)
        :param args: arguments passed on by bash
        :return: tuple of module_name, method_name, whether --help was requested, and the remaining arguments
        :rtype: tuple(str, str, bool, list)
        """
        args = list(args)
        help_requested = False
        # Always expect at least X X
        if len(args) < 2:
            raise ValueError('Expecting at least {0} {0} as arguments.'.format(
                cls._WILDCARD))
        if '--help' in args[0:3]:
            args.remove('--help')
            help_requested = True
        return args.pop(0), args.pop(0), help_requested, args
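
A worked example of the extraction (names from the class above); '--help' is only honoured within the first three positions:

module, method, help_requested, rest = CLIRunner.extract_arguments('arakoon', 'X', '--help', '--foo')
# module == 'arakoon', method == 'X', help_requested is True, rest == ['--foo']
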

    @classmethod
    def run_method(cls, *args):
        """
        Executes the given method
        :return: None
        :rtype: NoneType
        """
        module_name, method_name, help_requested, args = cls.extract_arguments(
            *args)
        try:
            found_method_pointers = cls._get_methods(module_name, method_name)
        except ModuleNotRecognizedException:
            cls.print_help(cls._get_methods(), error_help=True)
            return
        if len(found_method_pointers) == 0:  # Module found but no methods -> print help
            cls.print_help(cls._get_methods(module_name), error_help=True)
            return
        if help_requested is True:
            cls.print_help(found_method_pointers)
            return
        try:
            for found_method in found_method_pointers:
                found_method(*args)
        except KeyboardInterrupt:
            cls.logger.warning(
                'Caught keyboard interrupt. Output may be incomplete!')

    @classmethod
    def _discover_methods(cls):
        """
        Discovers all methods with the expose_to_cli decorator
        :return: dict that contains the required info based on module_name and method_name
        :rtype: dict
        """
        time_format = "%Y-%m-%d %H:%M:%S"
        version_id = 1
        start_path = cls.START_PATH
        client = VolatileFactory.get_client()
        cache_expiry_hours = 2  # Number of hours before the cache expires

        def build_cache():
            """
            Build a dict listing all discovered methods with @expose_to_cli
            :return:  None
            :rtype: NoneType
            """
            # Build cache
            # Executed from lib, want to go to extensions/healthcheck
            found_items = {'expires': (datetime.now() + timedelta(hours=cache_expiry_hours)).strftime(time_format)}
            path = start_path
            for root, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    if not (filename.endswith('.py')
                            and filename != '__init__.py'):
                        continue
                    name = filename.replace('.py', '')
                    file_path = os.path.join(root, filename)
                    # Import file
                    mod = imp.load_source(name, file_path)
                    for member in inspect.getmembers(mod):
                        if not (inspect.isclass(member[1]) and
                                member[1].__module__ == name and 'object' in [
                                    base.__name__
                                    for base in member[1].__bases__
                                ]):
                            continue
                        for submember in inspect.getmembers(member[1]):
                            if not hasattr(submember[1], 'expose_to_cli'):
                                continue
                            exposed_data = submember[1].expose_to_cli
                            method_module_name = exposed_data['module_name']
                            method_name = exposed_data['method_name']
                            method_addon_type = exposed_data.get('addon_type')
                            if method_module_name not in found_items:
                                found_items[method_module_name] = []
                            # noinspection PyUnresolvedReferences
                            found_items[method_module_name].append({'method_name': method_name,
                                                                    'module_name': name,
                                                                    'function': submember[1].__name__,
                                                                    'class': member[1].__name__,
                                                                    'location': file_path,
                                                                    'version': version_id,
                                                                    'addon_type': method_addon_type})
            client.set(cls.CACHE_KEY, found_items)

        exposed_methods = client.get(cls.CACHE_KEY)
        # Reuse the cached discovery for as long as it has not expired yet
        if exposed_methods and datetime.strptime(exposed_methods['expires'], time_format) > datetime.now():
            del exposed_methods['expires']
            return exposed_methods
        build_cache()
        exposed_methods = client.get(cls.CACHE_KEY)
        del exposed_methods['expires']
        return exposed_methods
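
A sketch of the structure build_cache stores (field names from the code above; the file path is hypothetical, the method and class names are taken from the Arakoon example further down):

# {'expires': '2017-01-01 12:00:00',   # stripped before the dict is returned
#  'arakoon': [{'method_name': 'nodes-test',
#               'module_name': 'arakoon',
#               'function': 'check_node_status',
#               'class': 'ArakoonHealthCheck',
#               'location': '/path/to/arakoon.py',
#               'version': 1,
#               'addon_type': 'healthcheck'}]}
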

    @classmethod
    def print_help(cls, method_pointers=None, error_help=False):
        """
        Prints the possible methods that are exposed to the CLI
        :param method_pointers: list of method pointers
        :type method_pointers: list[function]
        :param error_help: print extra help in case wrong arguments were supplied
        :type error_help: bool
        :return: None
        :rtype: NoneType
        """
        if error_help is True:
            print 'Could not process your arguments.'
        if not method_pointers:  # covers the None default and an empty list
            # Nothing found for the search terms
            print 'Found no methods matching your search terms.'
        elif len(method_pointers) == 1:
            # Found only one method -> search term was module_name + method_name
            print method_pointers[0].__doc__
            return
        print 'Possible optional arguments are:'
        # Multiple entries found means only the module_name was supplied
        print 'ovs healthcheck {0} {0} -- will run all checks'.format(
            CLIRunner._WILDCARD)
        print 'ovs healthcheck MODULE {0} -- will run all checks for module'.format(
            CLIRunner._WILDCARD)
        # Sort based on module_name
        print_dict = {}
        for method_pointer in method_pointers:
            module_name = method_pointer.expose_to_cli['module_name']
            method_name = method_pointer.expose_to_cli['method_name']
            if module_name in print_dict:
                print_dict[module_name].append(method_name)
                continue
            print_dict[module_name] = [method_name]
        for module_name, method_names in print_dict.iteritems():
            for method_name in method_names:
                print "ovs healthcheck {0} {1}".format(module_name,
                                                       method_name)
Example #5
class HealthCheckCLIRunner(CLIRunner):
    """
    Healthcheck adaptation of CLIRunner
    Injects a result_handler instance with shared resources to every test to collect the results.
    """
    logger = Logger("healthcheck-healthcheck_clirunner")
    START_PATH = os.path.join(
        os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)),
        'healthcheck')
    ADDON_TYPE = 'healthcheck'

    @staticmethod
    def _keep_old_argument_style(args):
        """
        Fills in the missing positional arguments with wildcards
        :param args: all arguments passed by bash
        :return: the argument list with wildcards inserted
        """
        args = list(args)
        possible_args = ['--help', '--unattended', '--to-json']
        indexes = [args.index(arg) for arg in args if arg in possible_args]
        if len(indexes) > 0:
            if indexes[0] == 0:
                args.insert(0, HealthCheckCLIRunner._WILDCARD)
                args.insert(1, HealthCheckCLIRunner._WILDCARD)
            elif indexes[0] == 1:
                args.insert(1, HealthCheckCLIRunner._WILDCARD)
        else:
            if len(args) == 0:
                args.insert(0, HealthCheckCLIRunner._WILDCARD)
                args.insert(1, HealthCheckCLIRunner._WILDCARD)
            elif len(args) == 1:
                args.insert(1, HealthCheckCLIRunner._WILDCARD)
        return args
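
Worked examples of the wildcard padding (X stands for CLIRunner._WILDCARD):

HealthCheckCLIRunner._keep_old_argument_style(['--unattended'])
# -> ['X', 'X', '--unattended']    (flag at index 0: module and method wildcards inserted)
HealthCheckCLIRunner._keep_old_argument_style(['arakoon'])
# -> ['arakoon', 'X']              (one positional argument: method wildcard appended)
HealthCheckCLIRunner._keep_old_argument_style(['arakoon', '--to-json'])
# -> ['arakoon', 'X', '--to-json'] (flag at index 1: method wildcard inserted)
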

    @staticmethod
    def run_method(*args):
        """
        Executes the given method
        :return: results & recap
        :rtype: dict
        """
        args = HealthCheckCLIRunner._keep_old_argument_style(args)
        unattended = False
        to_json = False
        if '--unattended' in args:
            args.remove('--unattended')
            unattended = True
        if '--to-json' in args:
            args.remove('--to-json')
            to_json = True
        module_name, method_name, help_requested, args = HealthCheckCLIRunner.extract_arguments(
            *args)
        result_handler = HCResults(unattended, to_json)
        try:
            found_method_pointers = HealthCheckCLIRunner._get_methods(
                module_name, method_name, HealthCheckCLIRunner.ADDON_TYPE)
        except ModuleNotRecognizedException:
            HealthCheckCLIRunner.print_help(HealthCheckCLIRunner._get_methods(
                addon_type=HealthCheckCLIRunner.ADDON_TYPE),
                                            error_help=True)
            return
        if len(found_method_pointers) == 0:  # Module found but no methods -> print help
            HealthCheckCLIRunner.print_help(HealthCheckCLIRunner._get_methods(
                module_name=module_name,
                addon_type=HealthCheckCLIRunner.ADDON_TYPE),
                                            error_help=True)
            return
        if help_requested is True:
            HealthCheckCLIRunner.print_help(found_method_pointers)
            return
        local_settings = Helper.get_local_settings()
        for key, value in local_settings.iteritems():
            result_handler.info('{0}: {1}'.format(
                key.replace('_', ' ').title(), value))
        try:
            result_handler.info(
                'Starting OpenvStorage Healthcheck version {0}'.format(
                    Helper.get_healthcheck_version()))
            result_handler.info("======================")
            for found_method in found_method_pointers:
                test_name = '{0}-{1}'.format(
                    found_method.expose_to_cli['module_name'],
                    found_method.expose_to_cli['method_name'])
                try:
                    node_check(found_method)(
                        result_handler.HCResultCollector(result=result_handler,
                                                         test_name=test_name)
                    )  # Wrapped in nodecheck for callback
                except KeyboardInterrupt:
                    raise
                except Exception as ex:
                    result_handler.exception(
                        'Unhandled exception caught when executing {0}. Got {1}'
                        .format(found_method.__name__, str(ex)))
                    HealthCheckCLIRunner.logger.exception(
                        'Unhandled exception caught when executing {0}'.format(
                            found_method.__name__))
            return HealthCheckCLIRunner.get_results(result_handler,
                                                    module_name, method_name)
        except KeyboardInterrupt:
            HealthCheckCLIRunner.logger.warning(
                'Caught keyboard interrupt. Output may be incomplete!')
            return HealthCheckCLIRunner.get_results(result_handler,
                                                    module_name, method_name)

    @staticmethod
    def get_results(result_handler, module_name, method_name):
        """
        Gets the result of the Open vStorage healthcheck
        :param result_handler: result parser
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param module_name:  module name specified with the cli
        :type module_name: str
        :param method_name: method name specified with the cli
        :type method_name: str
        :return: results & recap
        :rtype: dict
        """
        recap_executer = 'Health Check'
        if module_name != HealthCheckCLIRunner._WILDCARD:
            recap_executer = '{0} module {1}'.format(recap_executer,
                                                     module_name)
        if method_name != HealthCheckCLIRunner._WILDCARD:
            recap_executer = '{0} test {1}'.format(recap_executer, method_name)

        result = result_handler.get_results()

        result_handler.info("Recap of {0}!".format(recap_executer))
        result_handler.info("======================")

        result_handler.info(
            "SUCCESS={0} FAILED={1} SKIPPED={2} WARNING={3} EXCEPTION={4}".
            format(result_handler.counters['SUCCESS'],
                   result_handler.counters['FAILED'],
                   result_handler.counters['SKIPPED'],
                   result_handler.counters['WARNING'],
                   result_handler.counters['EXCEPTION']))
        # returns dict with minimal and detailed information
        return {
            'result': result,
            'recap': {
                'SUCCESS': result_handler.counters['SUCCESS'],
                'FAILED': result_handler.counters['FAILED'],
                'SKIPPED': result_handler.counters['SKIPPED'],
                'WARNING': result_handler.counters['WARNING'],
                'EXCEPTION': result_handler.counters['EXCEPTION']
            }
        }
Example #6
class ArakoonHealthCheck(object):
    """
    A healthcheck for the arakoon persistent store
    """

    logger = Logger("healthcheck-healthcheck_arakoon")
    MODULE = 'arakoon'

    @classmethod
    def _get_arakoon_clusters(cls, result_handler):
        """
        Retrieves all Arakoon clusters registered in this OVSCluster
        :param result_handler: Logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: Dict with the Arakoon cluster types as keys and lists of dicts (cluster name, pyrakoon client and config) as values
        :rtype: dict(str, list[dict])
        """
        result_handler.info('Fetching available arakoon clusters.',
                            add_to_result=False)
        arakoon_clusters = {}
        for cluster_name in list(
                Configuration.list('/ovs/arakoon')) + ['cacc']:
            # Determine Arakoon type
            is_cacc = cluster_name == 'cacc'
            arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name,
                                                  load_config=not is_cacc)
            if is_cacc is True:
                with open(Configuration.CACC_LOCATION) as config_file:
                    contents = config_file.read()
                arakoon_config.read_config(contents=contents)
            try:
                arakoon_client = ArakoonInstaller.build_client(arakoon_config)
            except (ArakoonNoMaster, ArakoonNoMasterResult) as ex:
                result_handler.failure(
                    'Unable to find a master for Arakoon cluster {0}. (Message: {1})'
                    .format(cluster_name, str(ex)),
                    code=ErrorCodes.master_none)
            except Exception as ex:
                msg = 'Unable to connect to Arakoon cluster {0}. (Message: {1})'.format(
                    cluster_name, str(ex))
                result_handler.exception(msg,
                                         code=ErrorCodes.unhandled_exception)
                cls.logger.exception(msg)
                continue
            metadata = json.loads(
                arakoon_client.get(ArakoonInstaller.METADATA_KEY))
            cluster_type = metadata['cluster_type']
            if cluster_type not in arakoon_clusters:
                arakoon_clusters[cluster_type] = []
            arakoon_clusters[cluster_type].append({
                'cluster_name': cluster_name,
                'client': arakoon_client,
                'config': arakoon_config
            })
        return arakoon_clusters
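
A sketch of the returned structure (the concrete cluster-type strings come from the Arakoon metadata and are an assumption here):

# {'FWK': [{'cluster_name': 'ovsdb',
#           'client': <pyrakoon client built by ArakoonInstaller.build_client>,
#           'config': <ArakoonClusterConfig instance>}],
#  'SD': [...]}
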

    @classmethod
    @cluster_check
    @expose_to_cli(
        MODULE,
        'nodes-test',
        HealthCheckCLI.ADDON_TYPE,
        help=
        'Verify if nodes are missing and if nodes are catching up to the master',
        short_help='Test if there are nodes missing/catching up')
    @expose_to_cli.option(
        '--max-transactions-behind',
        '-m',
        type=int,
        default=10,
        help=
        'The number of transactions that a slave can be behind a master before logging a failure'
    )
    def check_node_status(cls, result_handler, max_transactions_behind=10):
        """
        Checks the status of every node within the Arakoon cluster
        This check will report what nodes are currently missing and what nodes are catching up to the master
        :param result_handler: Logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param max_transactions_behind: The number of transactions that a slave can be behind a master before logging a failure
        :type max_transactions_behind: int
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Starting Arakoon nodes test.',
                            add_to_result=False)
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        for cluster_type, clusters in arakoon_clusters.iteritems():
            result_handler.info(
                'Fetching the status of {0} Arakoons'.format(cluster_type),
                add_to_result=False)
            for cluster in clusters:
                arakoon_client = cluster['client']
                cluster_name = cluster['cluster_name']
                arakoon_config = cluster['config']
                # Map the node ids to the object for easier lookups
                node_info = dict(
                    (node.name, node) for node in arakoon_config.nodes)
                identifier = 'Arakoon cluster {0}'.format(cluster_name)
                try:
                    statistics = arakoon_client._client.statistics()
                    node_is = statistics['node_is']
                    # Look for any missing nodes within the cluster
                    missing_ids = list(
                        set(node_info.keys()) - set(node_is.keys()))
                    if len(missing_ids) > 0:
                        for missing_id in missing_ids:
                            node_config = node_info[missing_id]
                            result_handler.failure(
                                '{0} is missing node: {1}'.format(
                                    identifier, '{0} ({1}:{2})'.format(
                                        node_config.name, node_config.ip,
                                        node_config.client_port)),
                                code=ErrorCodes.node_missing)
                    highest_id = max(node_is.iteritems(),
                                     key=operator.itemgetter(1))[0]
                    for node_id, transactions in node_is.iteritems():
                        if node_id == highest_id:
                            continue
                        transactions_behind = node_is[highest_id] - transactions
                        node_config = node_info[node_id]
                        log = 'Node {0} ({1}:{2}) for {3} {{0}} ({4}/{5})'.format(
                            node_config.name, node_config.ip,
                            node_config.client_port, identifier,
                            transactions_behind, max_transactions_behind)
                        if transactions == 0:
                            result_handler.warning(
                                log.format('is catching up'),
                                code=ErrorCodes.slave_catch_up)
                        elif transactions_behind > max_transactions_behind:
                            result_handler.failure(
                                log.format('is behind the master'),
                                code=ErrorCodes.master_behind)
                        else:
                            result_handler.success(
                                log.format('is up to date'),
                                code=ErrorCodes.node_up_to_date)
                except (ArakoonNoMaster, ArakoonNoMasterResult) as ex:
                    result_handler.failure(
                        '{0} cannot find a master. (Message: {1})'.format(
                            identifier, str(ex)),
                        code=ErrorCodes.master_none)
                except Exception as ex:
                    cls.logger.exception(
                        'Unhandled exception during the nodes check')
                    result_handler.exception(
                        'Testing {0} threw an unhandled exception. (Message: {1})'
                        .format(identifier, str(ex)),
                        code=ErrorCodes.unhandled_exception)
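
# A standalone sketch of the lag classification above, using a hypothetical
# 'node_is' mapping (node id -> last applied transaction id) instead of a
# live statistics() call:
node_is = {'node_1': 1500, 'node_2': 1490, 'node_3': 0}
max_transactions_behind = 5

highest_id = max(node_is, key=node_is.get)
for node_id, transactions in node_is.items():
    if node_id == highest_id:
        continue
    transactions_behind = node_is[highest_id] - transactions
    if transactions == 0:
        print('{0} is catching up'.format(node_id))
    elif transactions_behind > max_transactions_behind:
        print('{0} is behind the master ({1})'.format(node_id, transactions_behind))
    else:
        print('{0} is up to date'.format(node_id))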

    @classmethod
    @cluster_check
    @expose_to_cli(
        MODULE,
        'ports-test',
        HealthCheckCLI.ADDON_TYPE,
        help='Verifies that the Arakoon clusters still respond to connections',
        short_help='Test if Arakoons accept connections')
    def check_arakoon_ports(cls, result_handler):
        """
        Verifies that the Arakoon clusters still respond to connections
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        result_handler.info('Starting Arakoon ports test.',
                            add_to_result=False)
        result_handler.info(
            'Retrieving all port connection information. This might take a while',
            add_to_result=False)
        start = time.time()
        arakoon_stats = cls._get_port_connections(result_handler,
                                                  arakoon_clusters)
        result_handler.info(
            'Retrieving all port connection information succeeded (duration: {0})'.
            format(time.time() - start),
            add_to_result=False)
        for cluster_type, clusters in arakoon_stats.iteritems():
            result_handler.info(
                'Testing the port connections of {0} Arakoons'.format(cluster_type),
                add_to_result=False)
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                connection_result = cluster['connection_result']
                connection_result = OrderedDict(
                    sorted(connection_result.items(),
                           key=lambda item: ExtensionsToolbox.advanced_sort(
                               item[0].ip, separator='.')))
                for node, stats in connection_result.iteritems():
                    identifier_log = 'Arakoon cluster {0} on node {1}'.format(
                        cluster_name, node.ip)
                    if len(stats['errors']) > 0:
                        # Determine where issues were found
                        for step, exception in stats['errors']:
                            if step == 'test_connection':
                                try:
                                    # Raise the thrown exception
                                    raise exception
                                except Exception:
                                    message = 'Connection to {0} could not be established due to an unhandled exception.'.format(
                                        identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(
                                        message,
                                        code=ErrorCodes.unhandled_exception)
                        continue
                    if stats['result'] is True:
                        result_handler.success(
                            'Connection established to {0}'.format(
                                identifier_log),
                            code=ErrorCodes.arakoon_connection_ok)
                    else:
                        result_handler.failure(
                            'Connection could not be established to {0}'.
                            format(identifier_log),
                            code=ErrorCodes.arakoon_connection_failure)

    @classmethod
    def _get_port_connections(cls,
                              result_handler,
                              arakoon_clusters,
                              batch_size=10):
        """
        Test the port connection of every Arakoon cluster node concurrently
        Note: this will mutate the given arakoon_clusters dict
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param arakoon_clusters: Information about all arakoon clusters, sorted by type and given config
        :type arakoon_clusters: dict
        :param batch_size: Amount of workers to collect the Arakoon information.
        Every worker will initiate a connection
        :return: Dict with the connection result for every node config
        Example return:
        {cluster_type: [{'cluster_name': ...,
                         'config': ArakoonClusterConfig object,
                         'connection_result': {ArakoonNodeConfig object: {'result': True, 'errors': []},
                                               ArakoonNodeConfig object: {'result': False, 'errors': []}}}]}
        :rtype: dict
        """
        queue = Queue.Queue()
        # Prep work
        for cluster_type, clusters in arakoon_clusters.iteritems():
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                arakoon_config = cluster['config']
                cluster['connection_result'] = {}
                for node_config in arakoon_config.nodes:
                    result = {'errors': [], 'result': False}
                    cluster['connection_result'][node_config] = result
                    queue.put((cluster_name, node_config, result))

        for _ in xrange(batch_size):
            thread = Thread(target=cls._connection_worker,
                            args=(queue, result_handler))
            thread.setDaemon(
                True
            )  # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly.
            thread.start()
        # Wait for all results
        queue.join()
        return arakoon_clusters
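
# The worker methods in this example share one fan-out pattern: pre-fill a
# Queue, start a bounded pool of daemon threads and join the queue to wait
# for all results. A minimal standalone sketch (do_work() is hypothetical;
# the real workers run a port test, an SSH stat or an lsof call):
import Queue  # 'queue' on Python 3
from threading import Thread

def do_work(item):
    return item * 2  # hypothetical unit of work

work_queue = Queue.Queue()
results = {}
for item in range(20):
    work_queue.put(item)

def worker():
    while not work_queue.empty():
        item = work_queue.get()
        try:
            results[item] = do_work(item)
        finally:
            work_queue.task_done()

for _ in range(10):  # batch_size workers
    thread = Thread(target=worker)
    thread.setDaemon(True)  # let the main program exit even if a worker hangs
    thread.start()
work_queue.join()  # blocks until task_done() was called for every item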

    @staticmethod
    def _connection_worker(queue, result_handler):
        """
        Worker method to test the port connection of an Arakoon node
        :param queue: Queue to use
        :param result_handler: Logging object
        :return: None
        :rtype: NoneType
        """
        while not queue.empty():
            cluster_name, _node_config, _results = queue.get()
            errors = _results['errors']
            identifier = 'Arakoon cluster {0} on node {1}'.format(
                cluster_name, _node_config.ip)
            result_handler.info(
                'Testing the connection to {0}'.format(identifier),
                add_to_result=False)
            try:
                _results['result'] = NetworkHelper.check_port_connection(
                    _node_config.client_port, _node_config.ip)
            except Exception as ex:
                errors.append(('test_connection', ex))
                result_handler.warning(
                    'Could not test the connection to {0} ({1})'.format(
                        identifier, str(ex)),
                    add_to_result=False)
            finally:
                queue.task_done()
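
# NetworkHelper.check_port_connection is OVS-internal; a plain-socket
# equivalent of what the worker above relies on could look like this sketch:
import socket

def check_port_connection(port, ip, timeout=5):
    """Return True when a TCP connection to ip:port can be established"""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    try:
        return sock.connect_ex((ip, port)) == 0
    finally:
        sock.close()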

    @classmethod
    @cluster_check
    @expose_to_cli(MODULE,
                   'collapse-test',
                   HealthCheckCLI.ADDON_TYPE,
                   help='Verifies collapsing has occurred for all Arakoons',
                   short_help='Test if Arakoon collapsing is not failing')
    @expose_to_cli.option('--max-collapse-age',
                          '-a',
                          type=int,
                          default=3,
                          help='Maximum age in days for TLX')
    @expose_to_cli.option('--min-tlx-amount',
                          '-t',
                          type=int,
                          default=10,
                          help='Minimum amount of TLX files before testing')
    def check_collapse(cls,
                       result_handler,
                       max_collapse_age=3,
                       min_tlx_amount=10):
        """
        Verifies collapsing has occurred for all Arakoons
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param max_collapse_age: tlx files may not be older than this number of days
        :type max_collapse_age: int
        :param min_tlx_amount: Minimum amount of tlxes before making collapsing mandatory (defaults to 10)
        :type min_tlx_amount: int
        :return: None
        :rtype: NoneType
        """
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        result_handler.info('Starting Arakoon collapse test',
                            add_to_result=False)
        max_age_seconds = timedelta(days=max_collapse_age).total_seconds()
        result_handler.info(
            'Retrieving all collapsing statistics. This might take a while',
            add_to_result=False)
        start = time.time()
        arakoon_stats = cls._retrieve_stats(result_handler, arakoon_clusters)
        result_handler.info(
            'Retrieving all collapsing statistics succeeded (duration: {0})'.
            format(time.time() - start),
            add_to_result=False)
        for cluster_type, clusters in arakoon_stats.iteritems():
            result_handler.info(
                'Testing the collapse of {0} Arakoons'.format(cluster_type),
                add_to_result=False)
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                collapse_result = cluster['collapse_result']
                collapse_result = OrderedDict(
                    sorted(collapse_result.items(),
                           key=lambda item: ExtensionsToolbox.advanced_sort(
                               item[0].ip, separator='.')))
                for node, stats in collapse_result.iteritems():
                    identifier_log = 'Arakoon cluster {0} on node {1}'.format(
                        cluster_name, node.ip)
                    if len(stats['errors']) > 0:
                        # Determine where issues were found
                        for step, exception in stats['errors']:
                            if step == 'build_client':
                                try:
                                    # Raise the thrown exception
                                    raise exception
                                except TimeOutException:
                                    result_handler.warning(
                                        'Connection to {0} has timed out'.
                                        format(identifier_log),
                                        code=ErrorCodes.ssh_connection_time)
                                except (socket.error,
                                        UnableToConnectException):
                                    result_handler.failure(
                                        'Connection to {0} could not be established'
                                        .format(identifier_log),
                                        code=ErrorCodes.ssh_connection_fail)
                                except NotAuthenticatedException:
                                    result_handler.skip(
                                        'Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'
                                        .format(identifier_log),
                                        code=ErrorCodes.
                                        ssh_connection_authentication)
                                except Exception:
                                    message = 'Connection to {0} could not be established due to an unhandled exception.'.format(
                                        identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(
                                        message,
                                        code=ErrorCodes.unhandled_exception)
                            elif step == 'stat_dir':
                                try:
                                    raise exception
                                except Exception:
                                    message = 'Unable to list the contents of the tlog directory ({0}) for {1}'.format(
                                        node.tlog_dir, identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(
                                        message,
                                        code=ErrorCodes.unhandled_exception)
                        continue
                    tlx_files = stats['result']['tlx']
                    tlog_files = stats['result']['tlog']
                    headdb_files = stats['result']['headDB']
                    avail_size = stats['result']['avail_size']

                    if any(item is None
                           for item in [tlx_files, tlog_files, avail_size]):
                        # Exception occurred but no errors were logged
                        result_handler.exception(
                            'Either the tlx or tlog files or the available size could not be found in/of the tlog directory ({0}) for {1}'
                            .format(node.tlog_dir, identifier_log),
                            code=ErrorCodes.tlx_tlog_not_found)
                        continue
                    if len(headdb_files) > 0:
                        headdb_size = sum([int(i[2]) for i in headdb_files])
                        collapse_size_msg = 'Spare space for local collapse is'
                        if avail_size >= headdb_size * 4:
                            result_handler.success(
                                '{0} sufficient (n >= 4x head.db size)'.format(
                                    collapse_size_msg))
                        elif avail_size >= headdb_size * 3:
                            result_handler.warning(
                                '{0} running short (n >= 3x head.db size)'.
                                format(collapse_size_msg))
                        elif avail_size >= headdb_size * 2:
                            result_handler.failure(
                                '{0} just enough (n >= 2x head.db size)'.format(
                                    collapse_size_msg))
                        else:
                            result_handler.failure(
                                '{0} insufficient (n < 2x head.db size)'.format(
                                    collapse_size_msg))

                    if len(tlog_files) == 0:
                        # A tlog should always be present
                        result_handler.failure(
                            '{0} has no open tlog'.format(identifier_log),
                            code=ErrorCodes.tlog_not_found)
                        continue
                    if len(tlx_files) < min_tlx_amount:
                        result_handler.skip(
                            '{0} only has {1} tlx, not worth collapsing (required: {2})'
                            .format(identifier_log, len(tlx_files),
                                    min_tlx_amount))
                        continue
                    # Compare youngest tlog and oldest tlx timestamp
                    seconds_difference = int(tlog_files[-1][0]) - int(
                        tlx_files[0][0])
                    if max_age_seconds > seconds_difference:
                        result_handler.success(
                            '{0} should not be collapsed. The oldest tlx is less than {1} days older than the youngest tlog (actual age: {2})'
                            .format(
                                identifier_log, max_collapse_age,
                                str(timedelta(seconds=seconds_difference))),
                            code=ErrorCodes.collapse_ok)
                    else:
                        result_handler.failure(
                            '{0} should be collapsed. The oldest tlx is currently {1} old'
                            .format(
                                identifier_log,
                                str(timedelta(seconds=seconds_difference))),
                            code=ErrorCodes.collapse_not_ok)
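
# The collapse decision above boils down to one timestamp comparison. A
# standalone sketch with hypothetical (epoch, path) pairs, sorted oldest
# first as _collapse_worker produces them:
from datetime import timedelta

tlx_files = [('1513166090', '/opt/OpenvStorage/db/arakoon/config/tlogs/3392.tlx')]
tlog_files = [('1513178427', '/opt/OpenvStorage/db/arakoon/config/tlogs/3394.tlog')]
max_age_seconds = timedelta(days=3).total_seconds()

# Age spread between the youngest tlog and the oldest tlx
seconds_difference = int(tlog_files[-1][0]) - int(tlx_files[0][0])
if max_age_seconds > seconds_difference:
    print('no collapse needed (spread: {0})'.format(timedelta(seconds=seconds_difference)))
else:
    print('collapse required, the oldest tlx is {0} old'.format(timedelta(seconds=seconds_difference)))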

    @classmethod
    def _retrieve_stats(cls, result_handler, arakoon_clusters, batch_size=10):
        """
        Retrieve tlog/tlx stat information for an Arakoon cluster concurrently
        Note: this will mutate the given arakoon_clusters dict
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param arakoon_clusters: Information about all arakoon clusters, sorted by type and given config
        :type arakoon_clusters: dict
        :param batch_size: Amount of workers to collect the Arakoon information.
        Every worker means a connection towards a different node
        :return: Dict with tlog/tlx contents for every node config
        Example return:
        {cluster_type: [{'cluster_name': ...,
                         'config': ArakoonClusterConfig object,
                         'collapse_result': {ArakoonNodeConfig object: {'result': {'tlx': [['1513166090', '/opt/OpenvStorage/db/arakoon/config/tlogs/3392.tlx']],
                                                                                   'tlog': [['1513178427', '/opt/OpenvStorage/db/arakoon/config/tlogs/3394.tlog']],
                                                                                   'headDB': [],
                                                                                   'avail_size': ...},
                                                                        'errors': []}}}]}
        :rtype: dict
        """
        queue = Queue.Queue()
        clients = {}
        # Prep work
        for cluster_type, clusters in arakoon_clusters.iteritems():
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                arakoon_config = cluster['config']
                cluster['collapse_result'] = {}
                for node_config in arakoon_config.nodes:
                    result = {
                        'errors': [],
                        'result': {
                            'tlx': [],
                            'tlog': [],
                            'headDB': [],
                            'avail_size': None
                        }
                    }
                    cluster['collapse_result'][node_config] = result
                    # Build SSHClients outside the threads to avoid GIL
                    try:
                        client = clients.get(node_config.ip)
                        if client is None:
                            client = SSHClient(node_config.ip, timeout=5)
                            clients[node_config.ip] = client
                    except Exception as ex:
                        result['errors'].append(('build_client', ex))
                        continue
                    queue.put((cluster_name, node_config, result))
        # Limit to one session for every node.
        # Every process will fork from this one, creating a new session instead of using the already existing channel
        # There might be an issue if an SSH session takes too long, causing all workers to connect to that one node
        # and therefore hit the MaxSessions limit again (theory)
        for _ in xrange(min(len(clients.keys()), batch_size)):
            thread = Thread(target=cls._collapse_worker,
                            args=(queue, clients, result_handler))
            thread.setDaemon(
                True
            )  # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly.
            thread.start()
        # Wait for all results
        queue.join()
        return arakoon_clusters

    @staticmethod
    def _collapse_worker(queue, clients, result_handler):
        """
        Worker method to retrieve tlog/tlx file information of an Arakoon node
        :param queue: Queue to use
        :param clients: SSHClients to choose from
        :param result_handler: Logging object
        :return: None
        :rtype: NoneType
        """
        while not queue.empty():
            cluster_name, _node_config, _results = queue.get()
            errors = _results['errors']
            output = _results['result']
            identifier = 'Arakoon cluster {0} on node {1}'.format(
                cluster_name, _node_config.ip)
            result_handler.info(
                'Retrieving collapse information for {0}'.format(identifier),
                add_to_result=False)
            try:
                _client = clients[_node_config.ip]
                tlog_dir = _node_config.tlog_dir
                path = os.path.join(tlog_dir, '*')
                try:
                    # List the contents of the tlog directory and sort by oldest modification date
                    # Example output: (timestamp, name, size (bytes))
                    # 01111 file.tlog 101
                    # 01112 file2.tlog 102
                    timestamp_files = _client.run(
                        'stat -c "%Y %n %s" {0}'.format(path),
                        allow_insecure=True)
                    # df reports available space in 1K blocks while stat reports sizes
                    # in bytes; cast and convert so check_collapse compares like with like
                    output['avail_size'] = int(_client.run(
                        "df {0} | tail -1 | awk '{{print $4}}'".format(path),
                        allow_insecure=True)) * 1024
                except Exception as _ex:
                    errors.append(('stat_dir', _ex))
                    raise
                # Sort and separate the timestamp item files
                for split_entry in sorted(
                    (timestamp_file.split()
                     for timestamp_file in timestamp_files.splitlines()),
                        key=lambda split: int(split[0])):
                    file_name = split_entry[1]
                    if file_name.endswith('tlx'):
                        output['tlx'].append(split_entry)
                    elif file_name.endswith('tlog'):
                        output['tlog'].append(split_entry)
                    elif file_name.rsplit('/')[-1].startswith('head.db'):
                        output['headDB'].append(split_entry)
            except Exception as _ex:
                result_handler.warning(
                    'Could not retrieve the collapse information for {0} ({1})'
                    .format(identifier, str(_ex)),
                    add_to_result=False)
            finally:
                queue.task_done()
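
# The parsing step of _collapse_worker in isolation: split the
# 'stat -c "%Y %n %s"' output into tlx/tlog/head.db buckets, sorted by
# modification time (sample lines are hypothetical; fields are mtime, path
# and size in bytes):
timestamp_files = '\n'.join([
    '1513178427 /opt/OpenvStorage/db/arakoon/config/tlogs/3394.tlog 101',
    '1513166090 /opt/OpenvStorage/db/arakoon/config/tlogs/3392.tlx 102',
])
output = {'tlx': [], 'tlog': [], 'headDB': []}
for split_entry in sorted((line.split() for line in timestamp_files.splitlines()),
                          key=lambda split: int(split[0])):
    file_name = split_entry[1]
    if file_name.endswith('tlx'):
        output['tlx'].append(split_entry)
    elif file_name.endswith('tlog'):
        output['tlog'].append(split_entry)
    elif file_name.rsplit('/')[-1].startswith('head.db'):
        output['headDB'].append(split_entry)
# output['tlx'][0] is now the oldest tlx, output['tlog'][-1] the youngest tlog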

    @classmethod
    @cluster_check
    @expose_to_cli(
        MODULE,
        'integrity-test',
        HealthCheckCLI.ADDON_TYPE,
        help=
        'Verifies that all Arakoon clusters are still responding to client calls',
        short_help='Test if Arakoon clusters are still responding')
    def verify_integrity(cls, result_handler):
        """
        Verifies that all Arakoon clusters are still responding to client calls
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        arakoon_cluster = cls._get_arakoon_clusters(result_handler)
        result_handler.info('Starting Arakoon integrity test',
                            add_to_result=False)
        for cluster_type, clusters in arakoon_cluster.iteritems():
            result_handler.info(
                'Testing the integrity of {0} Arakoons'.format(cluster_type),
                add_to_result=False)
            for cluster in clusters:
                arakoon_client = cluster['client']
                cluster_name = cluster['cluster_name']
                try:
                    arakoon_client.nop()
                    result_handler.success(
                        'Arakoon {0} responded'.format(cluster_name),
                        code=ErrorCodes.arakoon_responded)
                except (ArakoonNoMaster, ArakoonNoMasterResult) as ex:
                    result_handler.failure(
                        'Arakoon {0} cannot find a master. (Message: {1})'.
                        format(cluster_name, str(ex)),
                        code=ErrorCodes.master_none)
                except Exception as ex:
                    cls.logger.exception(
                        'Unhandled exception during the integrity check')
                    result_handler.exception(
                        'Arakoon {0} threw an unhandled exception. (Message: {1})'
                        .format(cluster_name, str(ex)),
                        code=ErrorCodes.unhandled_exception)

    @classmethod
    @cluster_check
    @expose_to_cli(
        MODULE,
        'file-descriptors-test',
        HealthCheckCLI.ADDON_TYPE,
        help=
        'Verify the number of File Descriptors on every Arakoon does not exceed the limit',
        short_help='Test if #FD does not exceed the limit')
    @expose_to_cli.option(
        '--fd-limit',
        '-l',
        type=int,
        default=30,
        help=
        'Threshold for the number of TCP connections for which to start logging warnings'
    )
    def check_arakoon_fd(cls,
                         result_handler,
                         fd_limit=30,
                         passed_connections=None):
        """
        Checks all current open tcp file descriptors for all Arakoon clusters in the OVS cluster
        Will raise warnings when these reach a certain threshold
        :param result_handler: Logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param fd_limit: Threshold for the number of TCP connections for which to start logging warnings
        :type fd_limit: int
        :param passed_connections: checked TCP connections
        :type passed_connections: list
        :return: None
        :rtype: NoneType
        """
        if passed_connections is None:
            passed_connections = ['ESTABLISHED', 'TIME_WAIT']
        warning_threshold = fd_limit * 80 / 100
        error_threshold = fd_limit * 95 / 100

        result_handler.info('Starting Arakoon file descriptor test',
                            add_to_result=False)
        arakoon_clusters = cls._get_arakoon_clusters(result_handler)
        start = time.time()
        arakoon_fd_results = cls._get_filedescriptors(result_handler,
                                                      arakoon_clusters)
        result_handler.info(
            'Retrieving all file descriptor information succeeded (duration: {0})'
            .format(time.time() - start),
            add_to_result=False)
        for cluster_type, clusters in arakoon_fd_results.iteritems():
            result_handler.info(
                'Checking the file descriptors of {0} Arakoons'.format(
                    cluster_type),
                add_to_result=False)
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                fd_result = cluster['fd_result']
                fd_result = OrderedDict(
                    sorted(fd_result.items(),
                           key=lambda item: ExtensionsToolbox.advanced_sort(
                               item[0].ip, separator='.')))
                for node, stats in fd_result.iteritems():
                    identifier_log = 'Arakoon cluster {0} on node {1}'.format(
                        cluster_name, node.ip)
                    if len(stats['errors']) > 0:
                        # Determine where issues were found
                        for step, exception in stats['errors']:
                            if step == 'build_client':
                                try:
                                    # Raise the thrown exception
                                    raise exception
                                except TimeOutException:
                                    result_handler.warning(
                                        'Connection to {0} has timed out'.
                                        format(identifier_log),
                                        code=ErrorCodes.ssh_connection_time)
                                except (socket.error,
                                        UnableToConnectException):
                                    result_handler.failure(
                                        'Connection to {0} could not be established'
                                        .format(identifier_log),
                                        code=ErrorCodes.ssh_connection_fail)
                                except NotAuthenticatedException:
                                    result_handler.skip(
                                        'Connection to {0} could not be authenticated. This node has no access to the Arakoon node.'
                                        .format(identifier_log),
                                        code=ErrorCodes.
                                        ssh_connection_authentication)
                                except Exception:
                                    message = 'Connection to {0} could not be established due to an unhandled exception.'.format(
                                        identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(
                                        message,
                                        code=ErrorCodes.unhandled_exception)
                            elif step == 'lsof':
                                try:
                                    raise exception
                                except Exception:
                                    message = 'Unable to list the file descriptors for {0}'.format(
                                        identifier_log)
                                    cls.logger.exception(message)
                                    result_handler.exception(
                                        message,
                                        ErrorCodes.unhandled_exception)
                        continue
                    fds = stats['result']['fds']
                    filtered_fds = [
                        i for i in fds if i.split()[-1].strip('(').strip(')')
                        in passed_connections
                    ]
                    if len(filtered_fds) >= warning_threshold:
                        if len(filtered_fds) >= error_threshold:
                            result_handler.warning(
                                'Number of TCP connections exceeded the 95% warning threshold for {0}, ({1}/{2})'
                                .format(identifier_log, len(filtered_fds),
                                        fd_limit),
                                code=ErrorCodes.arakoon_fd_95)
                        else:
                            result_handler.warning(
                                'Number of TCP connections exceeded the 80% warning threshold for {0}, ({1}/{2})'
                                .format(identifier_log, len(filtered_fds),
                                        fd_limit),
                                code=ErrorCodes.arakoon_fd_80)
                    else:
                        result_handler.success(
                            'Number of TCP connections for {0} is healthy ({1}/{2})'
                            .format(identifier_log, len(filtered_fds),
                                    fd_limit),
                            code=ErrorCodes.arakoon_fd_ok)
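
# The threshold logic above in isolation, with hypothetical lsof lines
# ('lsof -i -a -p <pid>' prints the TCP state as a parenthesised last column):
fds = [
    'arakoon 123 ovs 10u IPv4 0t0 TCP 10.0.0.1:8870->10.0.0.2:43210 (ESTABLISHED)',
    'arakoon 123 ovs 11u IPv4 0t0 TCP *:8870 (LISTEN)',
]
passed_connections = ['ESTABLISHED', 'TIME_WAIT']
fd_limit = 30
filtered_fds = [i for i in fds
                if i.split()[-1].strip('(').strip(')') in passed_connections]
if len(filtered_fds) >= fd_limit * 95 / 100:
    print('error threshold reached ({0}/{1})'.format(len(filtered_fds), fd_limit))
elif len(filtered_fds) >= fd_limit * 80 / 100:
    print('warning threshold reached ({0}/{1})'.format(len(filtered_fds), fd_limit))
else:
    print('healthy ({0}/{1})'.format(len(filtered_fds), fd_limit))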

    @classmethod
    def _get_filedescriptors(cls,
                             result_handler,
                             arakoon_clusters,
                             batch_size=10):
        """
        Retrieve file descriptor information for every Arakoon cluster concurrently
        Note: this will mutate the given arakoon_clusters dict
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param arakoon_clusters: Information about all Arakoon clusters, sorted by type and given config
        :type arakoon_clusters: dict
        :param batch_size: Amount of workers to collect the Arakoon information.
        Every worker means a connection towards a different node
        :return: Dict with the open file descriptors for every node config
        :rtype: dict
        """
        queue = Queue.Queue()
        clients = {}
        # Prep work
        for cluster_type, clusters in arakoon_clusters.iteritems():
            for cluster in clusters:
                cluster_name = cluster['cluster_name']
                arakoon_config = cluster['config']
                cluster['fd_result'] = {}
                for node_config in arakoon_config.nodes:
                    result = {'errors': [], 'result': {'fds': []}}
                    # Register the result up front so build_client errors are reported too
                    cluster['fd_result'][node_config] = result
                    # Build SSHClients outside the threads to avoid GIL
                    try:
                        client = clients.get(node_config.ip)
                        if client is None:
                            client = SSHClient(node_config.ip, timeout=5)
                            clients[node_config.ip] = client
                    except Exception as ex:
                        result['errors'].append(('build_client', ex))
                        continue
                    queue.put((cluster_name, node_config, result))
        service_manager = ServiceFactory.get_manager()
        # Limit to one session for every node.
        # Every process will fork from this one, creating a new session instead of using the already existing channel
        # There might be an issue if an SSH session takes too long, causing all workers to connect to that one node
        # and therefore hit the MaxSessions limit again (theory)
        for _ in xrange(min(len(clients.keys()), batch_size)):
            thread = Thread(target=cls._fd_worker,
                            args=(queue, clients, result_handler,
                                  service_manager))
            thread.setDaemon(
                True
            )  # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly.
            thread.start()
        # Wait for all results
        queue.join()
        return arakoon_clusters

    @staticmethod
    def _fd_worker(queue, clients, result_handler, service_manager):
        """
        Worker method to retrieve file descriptors
        :param queue: Queue to use
        :param clients: SSHClients to choose from
        :param result_handler: Logging object
        :param service_manager: Service manager instance
        :return: None
        :rtype: NoneType
        """
        while not queue.empty():
            cluster_name, _node_config, _results = queue.get(False)
            errors = _results['errors']
            output = _results['result']
            identifier = 'Arakoon cluster {0} on node {1}'.format(
                cluster_name, _node_config.ip)
            result_handler.info(
                'Retrieving file descriptor information for {0}'.format(
                    identifier),
                add_to_result=False)
            try:
                client = clients[_node_config.ip]
                try:
                    # Handle config Arakoon
                    cluster_name = cluster_name if cluster_name != 'cacc' else 'config'
                    service_name = ArakoonInstaller.get_service_name_for_cluster(
                        cluster_name)
                    pid = service_manager.get_service_pid(service_name, client)
                    file_descriptors = client.run(
                        ['lsof', '-i', '-a', '-p', pid]).splitlines()[1:]
                except Exception as _ex:
                    errors.append(('lsof', _ex))
                    raise
                output['fds'] = file_descriptors
            except Exception as _ex:
                result_handler.warning(
                    'Could not retrieve the file descriptor information for {0} ({1})'
                    .format(identifier, str(_ex)),
                    add_to_result=False)
            finally:
                queue.task_done()
Example #7
class CLI(click.Group):
    """
    Click CLI which dynamically loads all possible commands
    Implementations require an entry point
    An entry point is defined as:
    @click.group(cls=CLI)
    def entry_point():
        pass

    if __name__ == '__main__':
        entry_point()
    """
    ADDON_TYPE = 'ovs'  # Type of addon the CLI is
    CACHE_KEY = 'ovs_discover_method'
    CACHE_EXPIRE_HOURS = 2  # Number of hours before the cache expires
    GROUP_MODULE_CLASS = click.Group
    CMD_FOLDER = os.path.join(os.path.dirname(__file__))  # Folder to query for commands

    logger = Logger("ovs_clirunner")
    _volatile_client = VolatileFactory.get_client()
    _discovery_cache = {}

    def __init__(self, *args, **kwargs):
        # type: (*any, **any) -> None
        super(CLI, self).__init__(*args, **kwargs)

    def list_commands(self, ctx):
        # type: (click.Context) -> list[str]
        """
        Lists all possible commands found within the directory of this file
        All modules are retrieved
        :param ctx: Passed context
        :return: List of files to look for commands
        """
        _ = ctx
        sub_commands = self._discover_methods().keys()  # Returns all underlying modules
        sub_commands.sort()
        return sub_commands

    def get_command(self, ctx, name):
        # type: (click.Context, str) -> callable
        """
        Retrieves a command to execute
        :param ctx: Passed context
        :param name: Name of the command
        :return: Function pointer to the command or None when no import could happen
        :rtype: callable
        """
        cmd = self.commands.get(name)
        if cmd:
            return cmd
        # More extensive - build the command and register
        discovery_data = self._discover_methods()
        if name in discovery_data.keys():
            # The current passed name is a module. Wrap it up in a group and add all commands under it dynamically
            module_commands = {}
            for function_name, function_data in discovery_data[name].iteritems():
                # Register the decorated function as callback to click
                # Try to avoid name collision with other modules. Might lead to unexpected results
                mod = imp.load_source('ovs_cli_{0}'.format(function_data['module_name']), function_data['location'])
                cl = getattr(mod, function_data['class'])()
                module_commands[function_name] = click.Command(function_name, callback=getattr(cl, function_data['function']))
            ret = self.GROUP_MODULE_CLASS(name, module_commands)
            self.add_command(ret)
            return ret

    @classmethod
    def _discover_methods(cls):
        # type: () -> dict
        """
        Discovers all methods with the expose_to_cli decorator
        :return: dict that contains the required info based on module_name and method_name
        :rtype: dict
        """
        version_id = 1
        start_path = cls.CMD_FOLDER
        addon_type = cls.ADDON_TYPE

        def discover():
            """
            Build a dict listing all discovered methods with @expose_to_cli
            :return: Dict with all discovered items
            :rtype: dict
            """
            # Build cache
            found_items = {'expires': time.time() + cls.CACHE_EXPIRE_HOURS * 60 ** 2}
            path = start_path
            for root, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    if not (filename.endswith('.py') and filename != '__init__.py'):
                        continue
                    file_path = os.path.join(root, filename)
                    module_name = 'ovs_cli_{0}'.format(filename.replace('.py', ''))
                    # Import file, making it relative to the start path to avoid name collision.
                    # Without it, the module contents would be merged (e.g. alba.py and testing/alba.py would be merged, overriding the paths)
                    # imp.load_source is different from importing. Therefore using the relative-joined name is safe
                    try:
                        mod = imp.load_source(module_name, file_path)
                    except ImportError:
                        cls.logger.exception('Unable to import module at {0}'.format(file_path))
                        continue
                    for member_name, member_value in inspect.getmembers(mod):
                        if not (inspect.isclass(member_value) and member_value.__module__ == module_name and 'object' in [base.__name__ for base in member_value.__bases__]):
                            continue
                        for submember_name, submember_value in inspect.getmembers(member_value):
                            if not hasattr(submember_value, expose_to_cli.attribute):
                                continue
                            exposed_data = getattr(submember_value, expose_to_cli.attribute)
                            method_module_name = exposed_data['module_name']
                            method_name = exposed_data['method_name']
                            method_addon_type = exposed_data['addon_type'] if 'addon_type' in exposed_data else None
                            if method_module_name not in found_items:
                                found_items[method_module_name] = {}
                            # Only return when the addon type matches
                            if method_addon_type == addon_type:
                                function_metadata = {'function': submember_value.__name__,
                                                     'class': member_value.__name__,
                                                     'location': file_path,
                                                     'version': version_id}
                                function_metadata.update(exposed_data)  # Add all exposed data for further re-use
                                found_items[method_module_name][method_name] = function_metadata
            return found_items

        def get_and_cache():
            found_items = cls._volatile_client.get(cls.CACHE_KEY)
            if found_items:
                cls._discovery_cache.update(found_items)
            return found_items

        try:
            exposed_methods = copy.deepcopy(cls._discovery_cache) or get_and_cache()
            if exposed_methods and exposed_methods['expires'] > time.time():
                # Able to use the cache, has not expired yet
                del exposed_methods['expires']
                return exposed_methods
        except Exception:
            cls.logger.exception('Unable to retrieve the exposed resources from cache')
        exposed_methods = discover()
        try:
            # Cache a copy: 'expires' is removed from the returned dict below
            # and would otherwise disappear from the in-memory cache as well
            cls._discovery_cache = copy.deepcopy(exposed_methods)
            cls._volatile_client.set(cls.CACHE_KEY, exposed_methods)
        except Exception:
            cls.logger.exception('Unable to cache the exposed resources')
        del exposed_methods['expires']
        return exposed_methods

    @classmethod
    def clear_cache(cls):
        # type: () -> None
        """
        Clear all cache related to discovering methods
        :return: None
        :rtype: NoneType
        """
        cls._volatile_client.delete(cls.CACHE_KEY)
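
# The same list_commands/get_command hooks over a static command table
# instead of filesystem discovery; a minimal sketch of the pattern the
# class above implements (the 'hello' command is hypothetical):
import click

class StaticCLI(click.Group):
    _commands = {'hello': click.Command('hello', callback=lambda: click.echo('hello'))}

    def list_commands(self, ctx):
        return sorted(self._commands)

    def get_command(self, ctx, name):
        return self._commands.get(name)

@click.group(cls=StaticCLI)
def entry_point():
    pass

if __name__ == '__main__':
    entry_point()  # 'python this_file.py hello' prints 'hello'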
Example #8
class IPMIHealthCheck(object):
    """
    Healthcheck file to execute multiple IPMI tests
    """
    MODULE = 'ipmi'
    logger = Logger("healthcheck-healthcheck_ipmi")

    @classmethod
    @expose_to_cli(MODULE,
                   'ipmi-test',
                   HealthCheckCLI.ADDON_TYPE,
                   help='Verify that AlbaNodes can be controlled through IPMI',
                   short_help="Test if the AlbaNodes' IPMI info is correct")
    def ipmi_check(cls, result_handler):
        """
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return:
        """
        for albanode in AlbaNodeList.get_albanodes():
            node_id = albanode.node_id
            ipmi_config_loc = '/ovs/alba/asdnodes/{0}/config/ipmi'.format(
                node_id)
            if not Configuration.exists(ipmi_config_loc):
                result_handler.skip(
                    'No IPMI info found on AlbaNode with ID {0}'.format(
                        node_id))
                continue
            ipmi_config = Configuration.get(ipmi_config_loc)
            ip = ipmi_config.get('ip')
            try:
                controller = IPMIController(
                    ip=ip,
                    username=ipmi_config.get('username'),
                    password=ipmi_config.get('password'),
                    client=SSHClient(System.get_my_storagerouter()))
            except Exception:
                result_handler.failure(
                    'IPMI settings are not valid for AlbaNode with ID {0}'.
                    format(node_id))
                continue
            try:
                status = controller.status_node().get(ip)
                if status == IPMIController.IPMI_POWER_ON:
                    result_handler.success(
                        'IPMI AlbaNode with ID {0} status is POWER ON'.format(
                            node_id))
                elif status == IPMIController.IPMI_POWER_OFF:
                    result_handler.warning(
                        'IPMI AlbaNode with ID {0} status is POWER OFF'.format(
                            node_id))
            except IPMITimeOutException as ex:
                result_handler.failure(
                    "IPMI AlbaNode with ID {0} timed out: '{1}'".format(
                        node_id, ex))
            except IPMICallException as ex:
                result_handler.failure(
                    "IPMI AlbaNode with ID {0} call failed: '{1}'".format(
                        node_id, ex))
            except Exception:
                msg = 'Could not retrieve info through IPMI for AlbaNode with ID {0}'.format(
                    node_id)
                cls.logger.exception(msg)
                result_handler.exception(msg)