def check_dtl(result_handler):
     """
     Checks the dtl for all vdisks on the local node
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     # Fetch vdisks hosted on this machine
     local_sr = System.get_my_storagerouter()
     if len(local_sr.vdisks_guids) == 0:
         return result_handler.skip('No VDisks present in cluster.')
     for vdisk_guid in local_sr.vdisks_guids:
         vdisk = VDisk(vdisk_guid)
         vdisk.invalidate_dynamics(['dtl_status', 'info'])
         if vdisk.dtl_status in ('ok_standalone', 'disabled'):
             result_handler.success("VDisk {0}'s DTL is disabled.".format(vdisk.name), code=ErrorCodes.volume_dtl_standalone)
         elif vdisk.dtl_status == 'ok_sync':
             result_handler.success("VDisk {0}'s DTL is enabled and running.".format(vdisk.name), code=ErrorCodes.volume_dtl_ok)
         elif vdisk.dtl_status == 'degraded':
             result_handler.warning("VDisk {0}'s DTL is degraded.".format(vdisk.name), code=ErrorCodes.volume_dtl_degraded)
         elif vdisk.dtl_status == 'checkup_required':
             result_handler.warning("VDisk {0}'s DTL should be configured.".format(vdisk.name), code=ErrorCodes.volume_dtl_checkup_required)
         elif vdisk.dtl_status == 'catch_up':
             result_handler.warning("VDisk {0}'s DTL is enabled but still syncing.".format(vdisk.name), code=ErrorCodes.volume_dtl_catch_up)
         else:
             result_handler.warning("VDisk {0}'s DTL has an unknown status: {1}.".format(vdisk.name, vdisk.dtl_status), code=ErrorCodes.volume_dtl_unknown)
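The if/elif chain above simply maps each DTL status to a result handler call. Purely as an illustration, the same mapping can be expressed as a lookup table; the sketch below assumes the result_handler and ErrorCodes objects from the example above and nothing else.

# Sketch: table-driven variant of the status dispatch above (same behaviour, fewer branches).
DTL_STATUS_MAP = {
    'ok_standalone': ('success', "VDisk {0}'s DTL is disabled.", ErrorCodes.volume_dtl_standalone),
    'disabled': ('success', "VDisk {0}'s DTL is disabled.", ErrorCodes.volume_dtl_standalone),
    'ok_sync': ('success', "VDisk {0}'s DTL is enabled and running.", ErrorCodes.volume_dtl_ok),
    'degraded': ('warning', "VDisk {0}'s DTL is degraded.", ErrorCodes.volume_dtl_degraded),
    'checkup_required': ('warning', "VDisk {0}'s DTL should be configured.", ErrorCodes.volume_dtl_checkup_required),
    'catch_up': ('warning', "VDisk {0}'s DTL is enabled but still syncing.", ErrorCodes.volume_dtl_catch_up)}

def report_dtl_status(vdisk, result_handler):
    # Look up the severity, message template and error code for the current status.
    entry = DTL_STATUS_MAP.get(vdisk.dtl_status)
    if entry is None:
        result_handler.warning("VDisk {0}'s DTL has an unknown status: {1}.".format(vdisk.name, vdisk.dtl_status),
                               code=ErrorCodes.volume_dtl_unknown)
        return
    level, message, code = entry
    getattr(result_handler, level)(message.format(vdisk.name), code=code)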
Example #2
 def wrapped(*args, **kwargs):
     if lock_type == 'local':
         _mutex = file_mutex(key)
     elif lock_type == 'cluster':
         _mutex = volatile_mutex(key)
     else:
         raise ValueError(
             'Lock type {0} is not supported!'.format(lock_type))
     try:
         _mutex.acquire(wait=0.005)
         local_sr = System.get_my_storagerouter()
         CacheHelper.set(key=key,
                         item={
                             'ip': local_sr.ip,
                             'hostname': local_sr.name
                         },
                         expire_time=60)
         return func(*args, **kwargs)
     except (NoFileLockAvailableException,
             NoVolatileLockAvailableException):
         if callback is None:
             return
         else:
             executor_info = None
             start = time.time()
             while executor_info is None:
                 # Calculated guesswork: if a callback is expected, the acquire has already happened for another executor, so the volatile key should be set eventually
                 # However, since the key is set after the acquire, the callback executor and the original method executor can race between fetch and set
                 # A better implementation would rely on the framework's ensure_single decorator, as it checks for these races itself
                 # This is just a poor man's, temporary implementation
                 if time.time() - start > 5:
                     raise ValueError(
                         'Timed out after 5 seconds while fetching the information about the executor.'
                     )
                 try:
                     executor_info = CacheHelper.get(key=key)
                 except:
                     pass
             callback_func = callback.__func__ if isinstance(
                 callback, staticmethod) else callback
             argnames = inspect.getargspec(callback_func)[0]
             arguments = list(args)
             kwargs.update({'test_name': func.__name__})
             if executor_info is not None:
                 kwargs.update(executor_info)
                 if 'result_handler' in argnames:
                     result_handler = kwargs.get('result_handler')
                     for index, arg in enumerate(arguments):
                         if isinstance(arg,
                                       HCResults.HCResultCollector):
                             result_handler = arguments.pop(index)
                             break
                     if result_handler is None:
                         raise TypeError(
                             'Expected an instance of {0}'.format(
                                 HCResults.HCResultCollector))
                     kwargs['result_handler'] = result_handler
             return callback_func(*tuple(arguments), **kwargs)
     finally:
         _mutex.release()
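The wrapped closure above depends on key, lock_type, callback and func being provided by an enclosing decorator factory that is not shown in this example. Below is a minimal sketch of what such a factory could look like; the factory name and default arguments are assumptions, only the closure variables are taken from the code above.

import functools

def ensure_single_instance(key, lock_type='local', callback=None):  # hypothetical name and defaults
    """
    Decorator factory supplying the 'key', 'lock_type', 'callback' and 'func'
    closure variables that the wrapped() function above expects.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            pass  # body as shown in the example above
        return wrapped
    return decorator

A health check could then be decorated with, for instance, @ensure_single_instance(key='ovs-healthcheck-some-check', lock_type='cluster'), where the key is again only illustrative.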
Example #3
 def get_local_storagerouter():
     """
     Fetches the details of a local storagerouter
     :return: a StorageRouter
     :rtype: ovs.dal.hybrids.storagerouter.StorageRouter
     """
     return System.get_my_storagerouter()
    def _create_vpool(self):
        """
        Needed to actually run tests on
        This is not actually a test of "Add Vpool to OVS",
        so any failure here will be reported as a setUp error and no tests will run
        """
        pmachine = System.get_my_storagerouter().pmachine
        mgmt_center = MgmtCenter(
            data={
                'name': 'Openstack',
                'description': 'test',
                'username': OVSPluginTestCase.CINDER_USER,
                'password': OVSPluginTestCase.CINDER_PASS,
                'ip': OVSPluginTestCase.CINDER_CONTROLLER,
                'port': 80,
                'type': 'OPENSTACK',
                'metadata': {
                    'integratemgmt': True
                }
            })
        mgmt_center.save()
        pmachine.mgmtcenter = mgmt_center
        pmachine.save()
        self._debug('Creating vpool')

        parameters = {
            'storagerouter_ip': OVSPluginTestCase.ip,
            'vpool_name': OVSPluginTestCase.VPOOL_NAME,
            'type': 'local',
            'storage_ip': '127.0.0.1',  # KVM
            'vrouter_port': OVSPluginTestCase.VPOOL_PORT,
            'integrate_vpool': True,
            'connection_host': OVSPluginTestCase.ip,
            'connection_port': OVSPluginTestCase.VPOOL_PORT,
            'connection_username': '',
            'connection_password': '',
            'connection_backend': {},
            'readcache_size': 50,
            'writecache_size': 50
        }
        StorageRouterController.add_vpool(parameters)
        attempt = 0
        while attempt < 10:
            vpool = VPoolList.get_vpool_by_name(OVSPluginTestCase.VPOOL_NAME)
            if vpool is not None:
                self._debug('vpool {0} created'.format(
                    OVSPluginTestCase.VPOOL_NAME))
                try:
                    os.listdir(OVSPluginTestCase.VPOOL_MOUNTPOINT)
                    return vpool
                except Exception as ex:
                    # either it doesn't exist, or we don't have permission
                    self._debug('vpool not ready yet {0}'.format(str(ex)))
                    pass
            attempt += 1
            time.sleep(2)
        raise RuntimeError(
            'Vpool {0} was not modeled correctly or did not start.'.format(
                OVSPluginTestCase.VPOOL_NAME))
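The `while attempt < 10` loop above is a plain poll-until-ready pattern (10 attempts, 2 seconds apart) that recurs in the other _create_vpool variants below. A small generic helper could factor it out; the helper name and defaults below are assumptions, not part of the test code.

import time

def poll_until(condition, attempts=10, delay=2):
    # Call condition() until it returns a truthy value or the attempts run out; return None on timeout.
    for _ in range(attempts):
        result = condition()
        if result:
            return result
        time.sleep(delay)
    return None

_create_vpool could then pass a closure that looks up the vPool by name and verifies that the mountpoint is listable.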
Example #5
    def _create_vpool(self):
        """
        Needed to actually run tests on
        This is not actually a test of "Add Vpool to OVS",
        so any failure here will be reported as a setUp error and no tests will run
        """
        pmachine = System.get_my_storagerouter().pmachine
        mgmt_center = MgmtCenter(data={'name':'Openstack',
                                       'description':'test',
                                       'username':CINDER_USER,
                                       'password':CINDER_PASS,
                                       'ip':CINDER_CONTROLLER,
                                       'port':80,
                                       'type':'OPENSTACK',
                                       'metadata':{'integratemgmt':True}})
        mgmt_center.save()
        pmachine.mgmtcenter = mgmt_center
        pmachine.save()
        self._debug('Creating vpool')
        backend_type = 'local'
        fields = ['storage_ip', 'vrouter_port']

        parameters = {'storagerouter_ip': IP,
                      'vpool_name': VPOOL_NAME,
                      'type': 'local',
                      'mountpoint_bfs': VPOOL_BFS,
                      'mountpoint_temp': VPOOL_TEMP,
                      'mountpoint_md': VPOOL_MD,
                      'mountpoint_readcaches': [VPOOL_READCACHE],
                      'mountpoint_writecaches': [VPOOL_WRITECACHE],
                      'mountpoint_foc': VPOOL_FOC,
                      'storage_ip': '127.0.0.1', #KVM
                      'vrouter_port': VPOOL_PORT,
                      'integrate_vpool': True,
                      'connection_host': IP,
                      'connection_port': VPOOL_PORT,
                      'connection_username': '',
                      'connection_password': '',
                      'connection_backend': {},
                      }
        StorageRouterController.add_vpool(parameters)
        attempt = 0
        while attempt < 10:
            vpool = VPoolList.get_vpool_by_name(VPOOL_NAME)
            if vpool is not None:
                self._debug('vpool %s created' % VPOOL_NAME)
                try:
                    os.listdir(VPOOL_MOUNTPOINT)
                    return vpool
                except Exception as ex:
                    #either it doesn't exist, or we don't have permission
                    self._debug('vpool not ready yet %s' % (str(ex)))
                    pass
            attempt += 1
            time.sleep(2)
        raise RuntimeError('Vpool %s was not modeled correctly or did not start.' % VPOOL_NAME)
Example #6
    def _get_test_name():
        """
        Retrieve a structured environment test name

        :returns: a structured environment based test name
        :rtype: str
        """
        number_of_nodes = len(StoragerouterHelper.get_storagerouters())
        split_ip = System.get_my_storagerouter().ip.split('.')
        return str(number_of_nodes) + 'N-' + split_ip[2] + '.' + split_ip[3]
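For illustration: on a hypothetical cluster with 3 StorageRouters where the local StorageRouter IP is 10.100.12.34, the helper above builds the test name as follows (both values are made up).

# Hypothetical values: 3 StorageRouters, local IP 10.100.12.34
number_of_nodes = 3
split_ip = '10.100.12.34'.split('.')
test_name = str(number_of_nodes) + 'N-' + split_ip[2] + '.' + split_ip[3]  # -> '3N-12.34'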
Example #7
 def update_components(components):
     """
     Initiate the update through commandline for all StorageRouters
     This is called upon by the API
     :return: None
     """
     components = [component.strip() for component in components]
     root_client = SSHClient(endpoint=System.get_my_storagerouter(),
                             username='******')
     root_client.run(['ovs', 'update', ','.join(components)])
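A hedged usage sketch for update_components above; the component names are illustrative, and the call ends up running the `ovs update` command as root on the local StorageRouter.

# Illustrative component list; surrounding whitespace is stripped before joining.
update_components(['framework', ' storagedriver'])
# Equivalent to: root_client.run(['ovs', 'update', 'framework,storagedriver'])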
Example #8
    def _create_vpool(self):
        """
        Needed to actually run tests on
        This is not actually a test of "Add Vpool to OVS",
        so any failure here will be reported as a setUp error and no tests will run
        """
        pmachine = System.get_my_storagerouter().pmachine
        mgmt_center = MgmtCenter(
            data={
                "name": "Openstack",
                "description": "test",
                "username": CINDER_USER,
                "password": CINDER_PASS,
                "ip": CINDER_CONTROLLER,
                "port": 80,
                "type": "OPENSTACK",
                "metadata": {"integratemgmt": True},
            }
        )
        mgmt_center.save()
        pmachine.mgmtcenter = mgmt_center
        pmachine.save()
        self._debug("Creating vpool")

        parameters = {
            "storagerouter_ip": IP,
            "vpool_name": VPOOL_NAME,
            "type": "local",
            "storage_ip": "127.0.0.1",  # KVM
            "vrouter_port": VPOOL_PORT,
            "integrate_vpool": True,
            "connection_host": IP,
            "connection_port": VPOOL_PORT,
            "connection_username": "",
            "connection_password": "",
            "connection_backend": {},
            "readcache_size": 50,
            "writecache_size": 50,
        }
        StorageRouterController.add_vpool(parameters)
        attempt = 0
        while attempt < 10:
            vpool = VPoolList.get_vpool_by_name(VPOOL_NAME)
            if vpool is not None:
                self._debug("vpool {0} created".format(VPOOL_NAME))
                try:
                    os.listdir(VPOOL_MOUNTPOINT)
                    return vpool
                except Exception as ex:
                    # either it doesn't exist, or we don't have permission
                    self._debug("vpool not ready yet {0}".format(str(ex)))
                    pass
            attempt += 1
            time.sleep(2)
        raise RuntimeError("Vpool {0} was not modeled correctly or did not start.".format(VPOOL_NAME))
class Helper(object):
    """
    Helper module
    """
    MODULE = "utils"
    SETTINGS_LOC = "/opt/OpenvStorage/config/healthcheck/settings.json"
    RAW_INIT_MANAGER = str(
        subprocess.check_output('cat /proc/1/comm', shell=True)).strip()
    LOCAL_SR = System.get_my_storagerouter()
    LOCAL_ID = System.get_my_machine_id()

    with open(SETTINGS_LOC) as settings_file:
        settings = json.load(settings_file)

    debug_mode = settings["healthcheck"]["debug_mode"]
    enable_logging = settings["healthcheck"]["logging"]["enable"]
    max_log_size = settings["healthcheck"]["max_check_log_size"]
    packages = settings["healthcheck"]["package_list"]
    extra_ports = settings["healthcheck"]["extra_ports"]
    rights_dirs = settings["healthcheck"]["rights_dirs"]
    owners_files = settings["healthcheck"]["owners_files"]
    max_hours_zero_disk_safety = settings["healthcheck"][
        "max_hours_zero_disk_safety"]

    @staticmethod
    def get_healthcheck_version():
        """
        Gets the installed healthcheck version
        :return: version number of the installed healthcheck
        :rtype: str
        """
        client = SSHClient(System.get_my_storagerouter())
        package_name = 'openvstorage-health-check'
        package_manager = PackageFactory.get_manager()
        packages = package_manager.get_installed_versions(
            client=client, package_names=[package_name])
        return packages.get(package_name, 'unknown')

    @staticmethod
    def get_local_settings():
        """
        Fetch settings of the local Open vStorage node
        :return: local settings of the node
        :rtype: dict
        """
        # Fetch all details
        local_settings = {
            'cluster_id': Configuration.get("/ovs/framework/cluster_id"),
            'hostname': socket.gethostname(),
            'storagerouter_id': Helper.LOCAL_ID,
            'storagerouter_type': Helper.LOCAL_SR.node_type,
            'environment os': ' '.join(platform.linux_distribution())
        }
        return local_settings
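The Helper class above loads its configuration from settings.json. The structure it expects can be inferred from the keys it reads; the sketch below shows that layout as the Python dict json.load() would produce, with all values being placeholders rather than the shipped defaults.

# Illustrative structure of /opt/OpenvStorage/config/healthcheck/settings.json (placeholder values only).
settings = {
    'healthcheck': {
        'debug_mode': False,
        'logging': {'enable': True},
        'max_check_log_size': 50,
        'package_list': ['openvstorage', 'alba'],
        'extra_ports': {},
        'rights_dirs': {},
        'owners_files': {},
        'max_hours_zero_disk_safety': 24
    }
}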
 def get_healthcheck_version():
     """
     Gets the installed healthcheck version
     :return: version number of the installed healthcheck
     :rtype: str
     """
     client = SSHClient(System.get_my_storagerouter())
     package_name = 'openvstorage-health-check'
     package_manager = PackageFactory.get_manager()
     packages = package_manager.get_installed_versions(client=client, package_names=[package_name])
     return packages.get(package_name, 'unknown')
    def tdr_0001_add_append_remove_role_test(ip=None, configuration=None, number_of_roles_to_remain=0):
        """
        This test will add a DB role to the sda disk of the storage router with the given IP
        :param ip: IP address of a storage router. (Example:
        :type ip: str

        :param configuration: Dict that determines layout
        :type configuration: dict
        :return: None

        :param number_of_roles_to_remain: how many roles may still be defined on the partition; the first
        'number_of_roles_to_remain' roles will remain.
        :type number_of_roles_to_remain: int
        """

        if not ip:
            ip = System.get_my_storagerouter().ip

        # Start input validation
        GeneralNetwork.validate_ip(ip)
        # End input validation

        # Start setup
        if configuration:
            config = configuration
        else:
            # Will use first unused disk
            config = {
                ip: {
                    "disks": [{
                        "disk_name": TestDiskRoles.get_first_unused_disk(),
                        "roles": ["WRITE", "SCRUB"]
                    }]
                }
            }
        collection = TestDiskRoles.set_roles_from_config(config, 'SET')
        # End setup

        # Start validation
        assert TestDiskRoles.validate_roles(collection), "Roles were not set according to the configuration!"
        # End validation

        collection = TestDiskRoles.set_roles_from_config(config, 'APPEND')
        # End setup
        # Start validation
        assert TestDiskRoles.validate_roles(collection), "Roles were not set according to the configuration!"
        # End validation
        # Remove disk roles
        collection = TestDiskRoles.remove_roles_from_config(config, number_of_roles_to_remain)
        # End remove disk roles

        # Start remove validation
        assert TestDiskRoles.validate_roles(collection), "Roles were not removed!"
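For reference, an explicit `configuration` argument for the test above could look like the sketch below; the IP, disk name and roles are made-up values that only mirror the shape constructed by the default branch.

# Hypothetical explicit configuration for tdr_0001_add_append_remove_role_test
configuration = {
    '10.100.1.2': {                        # StorageRouter IP (made up)
        'disks': [{'disk_name': 'sdb',     # disk to assign roles to (made up)
                   'roles': ['WRITE', 'SCRUB']}]
    }
}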
Example #12
    def install_plugins():
        """
        (Re)load plugins
        """
        if ServiceManager.has_service('ovs-watcher-framework',
                                      SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = []
            try:
                for storagerouter in StorageRouterList.get_storagerouters():
                    clients.append(SSHClient(storagerouter, username='******'))
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')

            for client in clients:
                for service_name in ['watcher-framework', 'memcached']:
                    ServiceManager.stop_service(service_name, client=client)
                    wait = 30
                    while wait > 0:
                        if ServiceManager.get_service_status(
                                service_name, client=client) is False:
                            break
                        time.sleep(1)
                        wait -= 1
                    if wait == 0:
                        raise RuntimeError(
                            'Could not stop service: {0}'.format(service_name))

            for client in clients:
                for service_name in ['memcached', 'watcher-framework']:
                    ServiceManager.start_service(service_name, client=client)
                    wait = 30
                    while wait > 0:
                        if ServiceManager.get_service_status(
                                service_name, client=client) is True:
                            break
                        time.sleep(1)
                        wait -= 1
                    if wait == 0:
                        raise RuntimeError(
                            'Could not start service: {0}'.format(
                                service_name))

            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            for function in functions:
                function(ip=ip)
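The stop and start loops above both poll ServiceManager.get_service_status for up to 30 seconds. A small helper could remove that duplication; below is a sketch using the same ServiceManager API as the example, with the helper name and timeout default being assumptions.

import time

def wait_for_service_status(service_name, client, expected_status, timeout=30):
    # Poll the service status once per second until it matches expected_status or the timeout expires.
    for _ in range(timeout):
        if ServiceManager.get_service_status(service_name, client=client) is expected_status:
            return
        time.sleep(1)
    raise RuntimeError('Service {0} did not reach status {1} within {2}s'.format(service_name, expected_status, timeout))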
 def get_healthcheck_version():
     """
     Gets the installed healthcheck version
     :return: version number of the installed healthcheck
     :rtype: str
     """
     client = SSHClient(System.get_my_storagerouter())
     package_name = 'openvstorage-health-check'
     package_manager = PackageFactory.get_manager()
     packages = package_manager.get_installed_versions(
         client=client, package_names=[package_name])
     return packages.get(package_name, 'unknown')
Example #14
    def __init__(self, path=None, client=None):
        """

        :param path: path of the fstab file
        :type path: str
        """
        if path:
            self._path = path
        else:
            self._path = self.DEFAULT_PATH
        if client is None:
            client = SSHClient(System.get_my_storagerouter(), username='******')
        self.client = client
    def __init__(self, utility=Utils(False)):
        self.module = 'openvstorage'
        self.utility = utility
        self.service_manager = self.utility.serviceManager
        self.machine_details = System.get_my_storagerouter()
        self.machine_id = self.machine_details.machine_id
        self.max_logsize = 500  # in MB

        # list of packages on your local system
        self.openvstorageTotalPackageList = ["openvstorage", "openvstorage-backend", "openvstorage-backend-core",
                                             "openvstorage-backend-webapps", "openvstorage-core", "openvstorage-hc",
                                             "openvstorage-sdm", "openvstorage-webapps", "openvstorage-test",
                                             "alba", "volumedriver-base", "volumedriver-server", "nginx", "memcached",
                                             "rabbitmq-server", "qemu-kvm", "virtinst", "openvpn", "ntp"
                                             ]
        # 1. key -> service name (string)
        #
        # 2. value -> ports (list)
        self.req_side_ports = {'nginx': ['80', '443'], 'memcached': ['11211']}

        # 1. key -> absolute directory name (string)
        #
        # 2. value -> rights in linux style format (string)
        self.req_map_rights = {'/tmp': '777', '/var/tmp': '777'}

        # 1. key -> absolute directory or log name (string)
        #
        # 2. value -> required user and group (dict)
        self.req_map_owners = {'/var/log/syslog': {'user': '******', 'group': 'adm'},
                               '/var/log/auth.log': {'user': '******', 'group': 'adm'},
                               '/var/log/kern.log': {'user': '******', 'group': 'adm'},
                               '/var/log/wtmp': {'user': '******', 'group': 'utmp'},
                               '/var/log/btmp': {'user': '******', 'group': 'utmp'},
                               '/etc/gshadow': {'user': '******', 'group': 'shadow'},
                               '/var/cache/man': {'user': '******', 'group': 'root'},
                               '/etc/shadow': {'user': '******', 'group': 'shadow'}
                               }

        # 1. for dir required options: AS key -> prefix (string)
        #    AS value -> list, substring of prefix (string) , type -> string (dir)
        #    contains_nested -> Boolean (contains nested dirs and files)
        #
        # 2. for file required options: type -> string (file)
        self.logging = {'/var/log/upstart': {'prefix': ['ovs', 'asd'], 'type': 'dir', 'contains_nested': False},
                        '/var/log/ovs': {'prefix': None, 'type': 'dir', 'contains_nested': True},
                        '/var/log/gunicorn': {'prefix': None, 'type': 'dir', 'contains_nested': False},
                        '/var/log/rabbitmq': {'prefix': None, 'type': 'dir', 'contains_nested': False},
                        '/var/log/nginx': {'prefix': None, 'type': 'dir', 'contains_nested': False},
                        '/var/log/arakoon': {'prefix': None, 'type': 'dir', 'contains_nested': True},
                        '/var/log/memcached.log': {'type': 'file'}
                        }
Example #16
 def ipmi_check(cls, result_handler):
     """
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return:
     """
     for albanode in AlbaNodeList.get_albanodes():
         node_id = albanode.node_id
         ipmi_config_loc = '/ovs/alba/asdnodes/{0}/config/ipmi'.format(
             node_id)
         if not Configuration.exists(ipmi_config_loc):
             result_handler.skip(
                 'No IPMI info found on AlbaNode with ID {0}'.format(
                     node_id))
             continue
         ipmi_config = Configuration.get(ipmi_config_loc)
         ip = ipmi_config.get('ip')
         try:
             controller = IPMIController(
                 ip=ip,
                 username=ipmi_config.get('username'),
                 password=ipmi_config.get('password'),
                 client=SSHClient(System.get_my_storagerouter()))
         except:
             result_handler.failure(
                 'IPMI settings are not valid for AlbaNode with ID {0}'.
                 format(node_id))
             continue
         try:
             status = controller.status_node().get(ip)
             if status == IPMIController.IPMI_POWER_ON:
                 result_handler.success(
                     'IPMI AlbaNode with ID {0} status is POWER ON'.format(
                         node_id))
             elif status == IPMIController.IPMI_POWER_OFF:
                 result_handler.warning(
                     'IPMI AlbaNode with ID {0} status is POWER OFF'.format(
                         node_id))
         except IPMITimeOutException as ex:
             result_handler.failure(
                 "IPMI AlbaNode with ID {0} timed out: '{1}'".format(
                     node_id, ex))
         except IPMICallException as ex:
             result_handler.failure(
                 "IPMI AlbaNode with ID {0} call failed: '{1}'".format(
                     node_id, ex))
         except Exception:
             msg = 'Could not retrieve info through IPMI for AlbaNode with ID {0}'.format(
                 node_id)
             cls.logger.exception(msg)
             result_handler.exception(msg)
Example #17
    def _create_vpool(self):
        """
        Needed to actually run tests on
        This is not actually a test of "Add Vpool to OVS",
        so any failure here will be reported as a setUp error and no tests will run
        """
        pmachine = System.get_my_storagerouter().pmachine
        mgmt_center = MgmtCenter(data={'name': 'Openstack',
                                       'description': 'test',
                                       'username': OVSPluginTestCase.CINDER_USER,
                                       'password': OVSPluginTestCase.CINDER_PASS,
                                       'ip': OVSPluginTestCase.CINDER_CONTROLLER,
                                       'port': 80,
                                       'type': 'OPENSTACK',
                                       'metadata': {'integratemgmt': True}})
        mgmt_center.save()
        pmachine.mgmtcenter = mgmt_center
        pmachine.save()
        self._debug('Creating vpool')

        parameters = {'storagerouter_ip': OVSPluginTestCase.ip,
                      'vpool_name': OVSPluginTestCase.VPOOL_NAME,
                      'type': 'local',
                      'storage_ip': '127.0.0.1',  # KVM
                      'vrouter_port': OVSPluginTestCase.VPOOL_PORT,
                      'integrate_vpool': True,
                      'connection_host': OVSPluginTestCase.ip,
                      'connection_port': OVSPluginTestCase.VPOOL_PORT,
                      'connection_username': '',
                      'connection_password': '',
                      'connection_backend': {},
                      'readcache_size': 50,
                      'writecache_size': 50
                      }
        StorageRouterController.add_vpool(parameters)
        attempt = 0
        while attempt < 10:
            vpool = VPoolList.get_vpool_by_name(OVSPluginTestCase.VPOOL_NAME)
            if vpool is not None:
                self._debug('vpool {0} created'.format(OVSPluginTestCase.VPOOL_NAME))
                try:
                    os.listdir(OVSPluginTestCase.VPOOL_MOUNTPOINT)
                    return vpool
                except Exception as ex:
                    # either it doesn't exist, or we don't have permission
                    self._debug('vpool not ready yet {0}'.format(str(ex)))
                    pass
            attempt += 1
            time.sleep(2)
        raise RuntimeError('Vpool {0} was not modeled correctly or did not start.'.format(OVSPluginTestCase.VPOOL_NAME))
Example #18
 def _set_storagerouter(self):
     """
     Set the clients storagerouter if the storagerouter is None.
     :return: Value for StorageRouter (either None or the StorageRouter object)
     :rtype: NoneType or ovs.dal.hybrids.storagerouter.StorageRouter
     """
     if self._storagerouter is None:
         try:
             # Will fail when Arakoon is down
             self._storagerouter = System.get_my_storagerouter()
         except Exception:
             self.logger.exception(
                 'Unable to set the storagerouter. Heartbeat will be affected.'
             )
     return self._storagerouter
Example #19
 def _umount(mountpoint, client=None):
     """
     Unmount the given partition
     :param mountpoint: Location where the mountpoint is mounted
     :type mountpoint: str
     :return:
     """
     if client is None:
         client = SSHClient(System.get_my_storagerouter(), username='******')
     try:
         client.run(['umount', mountpoint])
     except Exception:
         RoleRemover.LOGGER.exception(
             'Unable to umount mountpoint {0}'.format(mountpoint))
         raise RuntimeError('Could not unmount {0}'.format(mountpoint))
Example #20
    def install_plugins():
        """
        (Re)load plugins
        """
        manager = ServiceFactory.get_manager()
        if manager.has_service('ovs-watcher-framework',
                               SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            print 'Installing plugin into Open vStorage'
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = {}
            masters = StorageRouterList.get_masters()
            slaves = StorageRouterList.get_slaves()
            try:
                for sr in masters + slaves:
                    clients[sr] = SSHClient(sr, username='******')
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')
            memcached = 'memcached'
            watcher = 'watcher-framework'
            for sr in masters + slaves:
                if manager.has_service(watcher, clients[sr]):
                    print '- Stopping watcher on {0} ({1})'.format(
                        sr.name, sr.ip)
                    manager.stop_service(watcher, clients[sr])
            for sr in masters:
                print '- Restarting memcached on {0} ({1})'.format(
                    sr.name, sr.ip)
                manager.restart_service(memcached, clients[sr])
            for sr in masters + slaves:
                if manager.has_service(watcher, clients[sr]):
                    print '- Starting watcher on {0} ({1})'.format(
                        sr.name, sr.ip)
                    manager.start_service(watcher, clients[sr])

            print '- Execute model migrations'
            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            if len(functions) > 0:
                print '- Execute post installation scripts'
            for fct in functions:
                fct(ip=ip)
            print 'Installing plugin into Open vStorage: Completed'
 def wrapped(*args, **kwargs):
     if lock_type == 'local':
         _mutex = file_mutex(key)
     elif lock_type == 'cluster':
         _mutex = volatile_mutex(key)
     else:
         raise ValueError('Lock type {0} is not supported!'.format(lock_type))
     try:
         _mutex.acquire(wait=0.005)
         local_sr = System.get_my_storagerouter()
         CacheHelper.set(key=key, item={'ip': local_sr.ip, 'hostname': local_sr.name}, expire_time=60)
         return func(*args, **kwargs)
     except (NoFileLockAvailableException, NoVolatileLockAvailableException):
         if callback is None:
             return
         else:
             executor_info = None
             start = time.time()
             while executor_info is None:
                 # Calculated guesswork: if a callback is expected, the acquire has already happened for another executor, so the volatile key should be set eventually
                 # However, since the key is set after the acquire, the callback executor and the original method executor can race between fetch and set
                 # A better implementation would rely on the framework's ensure_single decorator, as it checks for these races itself
                 # This is just a poor man's, temporary implementation
                 if time.time() - start > 5:
                     raise ValueError('Timed out after 5 seconds while fetching the information about the executor.')
                 try:
                     executor_info = CacheHelper.get(key=key)
                 except:
                     pass
             callback_func = callback.__func__ if isinstance(callback, staticmethod) else callback
             argnames = inspect.getargspec(callback_func)[0]
             arguments = list(args)
             kwargs.update({'test_name': func.__name__})
             if executor_info is not None:
                 kwargs.update(executor_info)
                 if 'result_handler' in argnames:
                     result_handler = kwargs.get('result_handler')
                     for index, arg in enumerate(arguments):
                         if isinstance(arg, HCResults.HCResultCollector):
                             result_handler = arguments.pop(index)
                             break
                     if result_handler is None:
                         raise TypeError('Expected an instance of {0}'.format(HCResults.HCResultCollector))
                     kwargs['result_handler'] = result_handler
             return callback_func(**kwargs)
     finally:
         _mutex.release()
Example #22
    def install_plugins():
        """
        (Re)load plugins
        """
        if ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = []
            try:
                for storagerouter in StorageRouterList.get_storagerouters():
                    clients.append(SSHClient(storagerouter, username='******'))
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')

            for client in clients:
                for service_name in ['watcher-framework', 'memcached']:
                    ServiceManager.stop_service(service_name, client=client)
                    wait = 30
                    while wait > 0:
                        if ServiceManager.get_service_status(service_name, client=client) is False:
                            break
                        time.sleep(1)
                        wait -= 1
                    if wait == 0:
                        raise RuntimeError('Could not stop service: {0}'.format(service_name))

            for client in clients:
                for service_name in ['memcached', 'watcher-framework']:
                    ServiceManager.start_service(service_name, client=client)
                    wait = 30
                    while wait > 0:
                        if ServiceManager.get_service_status(service_name, client=client) is True:
                            break
                        time.sleep(1)
                        wait -= 1
                    if wait == 0:
                        raise RuntimeError('Could not start service: {0}'.format(service_name))

            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            for function in functions:
                function(ip=ip)
    def _check_available_space(hosts, username, password):
        required_space = 0
        output_dir = LogFileTimeParser.FILE_PATH_REMOTE.strip('/').rsplit('/', 1)[0]
        get_space_command = 'df /tmp | cut -d " " -f 10 | grep -Eo [0-9]*'
        local_client = SSHClient(System.get_my_storagerouter().ip, username=username, password=password)
        available_space = int(local_client.run(get_space_command, allow_insecure=True))

        for host in hosts:
            ssh_client = SSHClient(host, username=username, password=password)
            for file_path in LogFileTimeParser.STANDARD_SEARCH_LOCATIONS:
                command = 'ls -al {0} | cut -d " " -f 5'.format(file_path)
                try:
                    required_space += int(ssh_client.run(command, allow_insecure=True))
                except ValueError:
                    pass
        if required_space > available_space:
            raise OverflowError('Would not be able to allocate {0} in {1}'.format(required_space, output_dir))
Example #25
    def __init__(self, port):
        """
        Init
        """
        signal.signal(signal.SIGTERM, self.SIGTERM)

        from ovs.extensions.generic.system import System
        my_storagerouter = System.get_my_storagerouter()
        self.host = my_storagerouter.ip
        self.port = port

        self.persistent = PersistentFactory.get_client()
        self.users = self.get_users()
        # Load from model
        self.assigned_oids = {}
        self.instance_oid = 0
        # Book-keeping
        self.model_oids = set()
Example #26
def gather_facts():
    """
    Gather facts from a node

    :returns: dictionary with information about Open vStorage on the target node
    :rtype: dict
    """

    facts = {}

    # fetch present information from 'ovs setup'
    setup_information = {
        'ovs_installed': ovs_present,
        'ovs_setup_completed': ovs_configured,
        'alba_installed': asdmanager_present,
        'alba_setup_completed': asdmanager_configured
    }

    facts.update({'general': setup_information})

    # fetch ovs information if ovs is installed and configured
    if ovs_present and ovs_configured:

        # pre-fetch data
        with open('/etc/openvstorage_id', 'r') as openvstorage_id:
            node_id = openvstorage_id.read().strip()
        support = EtcdConfiguration.get('/ovs/framework/support')
        grid_ip = str(EtcdConfiguration.get('/ovs/framework/hosts/{0}/ip'.format(node_id)))

        ovs_cluster_information = {
            'cluster_id': str(EtcdConfiguration.get('/ovs/framework/cluster_id')),
            'node_id': str(node_id),
            'grid_ip': grid_ip,
            'node_type': str(EtcdConfiguration.get('/ovs/framework/hosts/{0}/type'.format(node_id))),
            'base_dir': str(EtcdConfiguration.get('/ovs/framework/paths').get('basedir')),
            'heartbeat_enabled': str(support.get('enabled')),
            'remote_support_enabled': str(support.get('enablesupport')),
            'etcd_proxy': '{0}=http://{1}:2380'.format(node_id, grid_ip),
            'partition_config': System.get_my_storagerouter().partition_config
        }
        facts.update({'ovs': ovs_cluster_information})

    return facts
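For a node where both Open vStorage and the ASD manager are installed and configured, the dict returned by gather_facts has the shape sketched below; every value is a placeholder, not an actual default.

# Illustrative return value of gather_facts(); all values are placeholders.
example_facts = {
    'general': {'ovs_installed': True,
                'ovs_setup_completed': True,
                'alba_installed': True,
                'alba_setup_completed': True},
    'ovs': {'cluster_id': '<cluster id>',
            'node_id': '<node id>',
            'grid_ip': '10.100.1.2',
            'node_type': 'MASTER',
            'base_dir': '<base dir>',
            'heartbeat_enabled': 'True',
            'remote_support_enabled': 'False',
            'etcd_proxy': '<node id>=http://10.100.1.2:2380',
            'partition_config': {}}
}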
Example #27
 def get_worker_contexts(self):
     # type: () -> dict
     """
     Retrieves information about all the workers (where they are executed and under which PID)
     This information is later used to check which data can be discarded (because of interrupted workers)
     :return: Information about the current workers
     :rtype: dict
     """
     workers_context = {}
     for storagerouter, client in self._clients.iteritems():
         if storagerouter not in self._worker_contexts_cache:
             worker_pid = 0
             worker_start = None
             try:
                 # Retrieve the current start time of the process (used to create a unique key)
                 # Output of the command:
                 #                  STARTED   PID
                 # Mon Jan 22 11:49:04 2018 22287
                 worker_pid = self._service_manager.get_service_pid(
                     name='ovs-workers', client=client)
                 if worker_pid == 0:
                     self._logger.warning(
                         'The workers are down on StorageRouter {0}'.format(
                             storagerouter.guid))
                 else:
                     worker_start = self._service_manager.get_service_start_time(
                         name='ovs-workers', client=client)
             except Exception:
                 self._logger.exception(
                     self._format_message(
                         'Unable to retrieve information about the worker'))
             self._worker_contexts_cache[storagerouter] = {
                 'storagerouter_guid': storagerouter.guid,
                 'worker_pid': worker_pid,
                 'worker_start': worker_start
             }
         workers_context[storagerouter] = self._worker_contexts_cache[
             storagerouter]
     if System.get_my_storagerouter() not in workers_context:
         raise ValueError(
             self._format_message(
                 'The context about the workers on this machine should be known'
             ))
     return workers_context
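The docstring above mentions that this context is later used to decide which data belongs to interrupted workers. Below is a minimal, purely illustrative comparison (the helper name is an assumption): a cached context is considered stale once the PID or start time of the ovs-workers service no longer matches.

def is_worker_context_current(cached_context, current_context):  # illustrative helper
    # True while the worker kept the same PID and start time; a mismatch means it was restarted (or is down).
    return (cached_context['worker_pid'] == current_context['worker_pid'] and
            cached_context['worker_start'] == current_context['worker_start'])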
Example #28
    def install_plugins():
        """
        (Re)load plugins
        """
        if ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
            # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
            # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
            print 'Installing plugin into Open vStorage'
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            clients = {}
            masters = StorageRouterList.get_masters()
            slaves = StorageRouterList.get_slaves()
            try:
                for sr in masters + slaves:
                    clients[sr] = SSHClient(sr, username='******')
            except UnableToConnectException:
                raise RuntimeError('Not all StorageRouters are reachable')
            memcached = 'memcached'
            watcher = 'watcher-framework'
            for sr in masters + slaves:
                if ServiceManager.has_service(watcher, clients[sr]):
                    print '- Stopping watcher on {0} ({1})'.format(sr.name, sr.ip)
                    ServiceManager.stop_service(watcher, clients[sr])
            for sr in masters:
                print '- Restarting memcached on {0} ({1})'.format(sr.name, sr.ip)
                ServiceManager.restart_service(memcached, clients[sr])
            for sr in masters + slaves:
                if ServiceManager.has_service(watcher, clients[sr]):
                    print '- Starting watcher on {0} ({1})'.format(sr.name, sr.ip)
                    ServiceManager.start_service(watcher, clients[sr])

            print '- Execute model migrations'
            from ovs.dal.helpers import Migration
            Migration.migrate()

            from ovs.lib.helpers.toolbox import Toolbox
            ip = System.get_my_storagerouter().ip
            functions = Toolbox.fetch_hooks('plugin', 'postinstall')
            if len(functions) > 0:
                print '- Execute post installation scripts'
            for function in functions:
                function(ip=ip)
            print 'Installing plugin into Open vStorage: Completed'
 def check_ovs_processes(result_handler):
     """
     Checks the availability of processes for Open vStorage
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     result_handler.info('Checking local ovs services.')
     client = SSHClient(System.get_my_storagerouter())
     service_manager = ServiceFactory.get_manager()
     services = [service for service in service_manager.list_services(client=client) if service.startswith(OpenvStorageHealthCheck.MODULE)]
     if len(services) == 0:
         result_handler.warning('Found no local ovs services.')
     for service_name in services:
         if service_manager.get_service_status(service_name, client) == 'active':
             result_handler.success('Service {0} is running!'.format(service_name), code=ErrorCodes.process_fwk)
         else:
             result_handler.failure('Service {0} is not running, please check this.'.format(service_name), code=ErrorCodes.process_fwk)
Example #30
    def get_logfiles(local_storagerouter_guid):
        """
        Collects logs, moves them to a web-accessible location and returns log tgz's filename
        :param local_storagerouter_guid: StorageRouter guid to retrieve log files on
        :type local_storagerouter_guid: str
        :return: Name of tgz containing the logs
        :rtype: str
        """
        this_storagerouter = System.get_my_storagerouter()
        this_client = SSHClient(this_storagerouter, username='******')
        logfile = this_client.run(['ovs', 'collect', 'logs']).strip()
        logfilename = logfile.split('/')[-1]

        storagerouter = StorageRouter(local_storagerouter_guid)
        webpath = '/opt/OpenvStorage/webapps/frontend/downloads'
        client = SSHClient(storagerouter, username='******')
        client.dir_create(webpath)
        client.file_upload('{0}/{1}'.format(webpath, logfilename), logfile)
        client.run(['chmod', '666', '{0}/{1}'.format(webpath, logfilename)])
        return logfilename
 def check_alba_processes(result_handler):
     """
     Checks the availability of processes for Alba
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return: None
     :rtype: NoneType
     """
     result_handler.info('Checking LOCAL ALBA services: ', add_to_result=False)
     client = SSHClient(System.get_my_storagerouter())
     service_manager = ServiceFactory.get_manager()
     services = [service for service in service_manager.list_services(client=client) if service.startswith(AlbaHealthCheck.MODULE)]
     if len(services) == 0:
         result_handler.skip('Found no LOCAL ALBA services.')
         return
     for service_name in services:
         if service_manager.get_service_status(service_name, client) == 'active':
             result_handler.success('Service {0} is running!'.format(service_name),
                                    code=ErrorCodes.alba_service_running)
         else:
             result_handler.failure('Service {0} is NOT running! '.format(service_name),
                                    code=ErrorCodes.alba_service_down)
Example #32
    def _remove_filesystem(device, alias_part_label, client=None):
        """

        :param alias_part_label: eg /dev/disk/by-partlabel/ata-QEMU_HARDDISK_QM00011
        :type alias_part_label: str
        :return:
        """
        if client is None:
            client = SSHClient(System.get_my_storagerouter(), username='******')
        try:
            partition_cmd = "udevadm info --name={0} | awk -F '=' '/ID_PART_ENTRY_NUMBER/{{print $NF}}'".format(
                alias_part_label)
            partition_number = client.run(partition_cmd, allow_insecure=True)
            if partition_number:
                format_cmd = 'parted {0} rm {1}'.format(
                    device, partition_number)
                client.run(format_cmd.split())
        except Exception:
            RoleRemover.LOGGER.exception(
                'Unable to remove filesystem of {0}'.format(alias_part_label))
            raise RuntimeError(
                'Could not remove filesystem of {0}'.format(alias_part_label))
Example #33
    def __init__(self, vdisk_guid):
        # type: (str) -> None
        """
        Initializes a new MDSCatchUp
        An instance populates some caches. These caches are cleared once the instance is garbage collected.
        When running MDSCatchup in bulk: add them to a list to speed up the process
        :param vdisk_guid: Guid of the vDisk to catch up for
        :type vdisk_guid: str
        """
        self.id = str(uuid.uuid4())
        self.vdisk = VDisk(vdisk_guid)
        self.mds_key = self._CATCH_UP_VDISK_KEY.format(self.vdisk.guid)
        self.tlog_threshold = Configuration.get(
            'ovs/volumedriver/mds|tlogs_behind', default=100)
        self.volumedriver_service_name = 'ovs-volumedriver_{0}'.format(
            self.vdisk.vpool.name)
        self.mds_client_timeout = Configuration.get(
            'ovs/vpools/{0}/mds_config|mds_client_connection_timeout'.format(
                self.vdisk.vpool_guid),
            default=120)
        self.mds_clients = {}
        self.dry_run = False
        self.catch_up_threads = []
        self.errors = []

        self._service_manager = ServiceFactory.get_manager()
        self._persistent = PersistentFactory.get_client()
        self._log = 'MDS catchup {0} - vDisk {1} (volume id: {2})'.format(
            self.id, self.vdisk.guid, self.vdisk.volume_id)

        self._clients = self.build_clients()
        self._volumedriver_contexts = self.get_volumedriver_contexts()
        self._worker_contexts = self.get_worker_contexts()
        self._worker_context = self._worker_contexts[
            System.get_my_storagerouter()]
        self._relevant_contexts = self._get_all_relevant_contexts()  # All possible contexts (by mixing volumedriver ones with workers)
 def ipmi_check(cls, result_handler):
     """
     :param result_handler: logging object
     :type result_handler: ovs.extensions.healthcheck.result.HCResults
     :return:
     """
     for albanode in AlbaNodeList.get_albanodes():
         node_id = albanode.node_id
         ipmi_config_loc = '/ovs/alba/asdnodes/{0}/config/ipmi'.format(node_id)
         if not Configuration.exists(ipmi_config_loc):
             result_handler.skip('No IPMI info found on AlbaNode with ID {0}'.format(node_id))
             continue
         ipmi_config = Configuration.get(ipmi_config_loc)
         ip = ipmi_config.get('ip')
         try:
             controller = IPMIController(ip=ip,
                                         username=ipmi_config.get('username'),
                                         password=ipmi_config.get('password'),
                                         client=SSHClient(System.get_my_storagerouter()))
         except:
             result_handler.failure('IPMI settings are not valid for AlbaNode with ID {0}'.format(node_id))
             continue
         try:
             status = controller.status_node().get(ip)
             if status == IPMIController.IPMI_POWER_ON:
                 result_handler.success('IPMI AlbaNode with ID {0} status is POWER ON'.format(node_id))
             elif status == IPMIController.IPMI_POWER_OFF:
                 result_handler.warning('IPMI AlbaNode with ID {0} status is POWER OFF'.format(node_id))
         except IPMITimeOutException as ex:
             result_handler.failure("IPMI AlbaNode with ID {0} timed out: '{1}'".format(node_id, ex))
         except IPMICallException as ex:
             result_handler.failure("IPMI AlbaNode with ID {0} call failed: '{1}'".format(node_id, ex))
         except Exception:
             msg = 'Could not retrieve info through IPMI for AlbaNode with ID {0}'.format(node_id)
             cls.logger.exception(msg)
             result_handler.exception(msg)
Example #35
 def _get_my_ip(self):
     try:
         return System.get_my_storagerouter().ip
     except:
         return '127.0.0.1'
Example #36
    def post_update_core(client, components):
        """
        Execute functionality after the openvstorage core packages have been updated
        For framework:
            * Restart support-agent on every client
            * Restart arakoon-ovsdb on every client (if present and required)
        For storagedriver:
            * ALBA proxies on every client
            * Restart arakoon-voldrv on every client (if present and required)
        :param client: Client on which to execute this post update functionality
        :type client: SSHClient
        :param components: Update components which have been executed
        :type components: list
        :return: None
        """
        if 'framework' not in components and 'storagedriver' not in components:
            return

        update_information = UpdateController.get_update_information_core({})
        services_to_restart = set()
        if 'storagedriver' in components:
            services_to_restart.update(update_information.get('storagedriver', {}).get('services_post_update', set()))
        if 'framework' in components:
            services_to_restart.update(update_information.get('framework', {}).get('services_post_update', set()))
            services_to_restart.add('support-agent')

        if services_to_restart:
            UpdateController._logger.debug('{0}: Executing hook {1}'.format(client.ip, inspect.currentframe().f_code.co_name))
            for service_name in sorted(services_to_restart):
                if not service_name.startswith('ovs-arakoon-'):
                    UpdateController.change_services_state(services=[service_name], ssh_clients=[client], action='restart')
                else:
                    cluster_name = ArakoonClusterConfig.get_cluster_name(ExtensionToolbox.remove_prefix(service_name, 'ovs-arakoon-'))
                    if cluster_name == 'config':
                        arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name='cacc', filesystem=True, ip=System.get_my_storagerouter().ip)
                    else:
                        arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
                    if arakoon_metadata['internal'] is True:
                        UpdateController._logger.debug('{0}: Restarting arakoon node {1}'.format(client.ip, cluster_name))
                        ArakoonInstaller.restart_node(cluster_name=cluster_name,
                                                      client=client)
            UpdateController._logger.debug('{0}: Executed hook {1}'.format(client.ip, inspect.currentframe().f_code.co_name))
    def check_for_halted_volumes(cls, result_handler):
        """
        Checks for halted volumes on a single or multiple vPools
        This will only check the volume states on the current node. If any other volumedriver were down,
        only the HA'd volumes would pop up, as they could appear halted here (should be verified by the volumedriver team)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        vpools = VPoolList.get_vpools()
        local_sr = System.get_my_storagerouter()

        if len(vpools) == 0:
            result_handler.skip('No vPools found!', code=ErrorCodes.vpools_none)
            return
        for vpool in vpools:
            log_start = 'Halted volumes test vPool {0}'.format(vpool.name)
            if vpool.guid not in local_sr.vpools_guids:
                result_handler.skip('{0} - Skipping vPool {1} because it is not living here.'.format(log_start, vpool.name),
                                    code=ErrorCodes.vpool_not_local, add_to_result=False)
                continue

            result_handler.info('{0} - Retrieving all information'.format(log_start), add_to_result=False)
            storagedriver = None
            for std in vpool.storagedrivers:
                if std.storagerouter_guid == local_sr.guid:
                    storagedriver = std
                    break

            if storagedriver is None:
                result_handler.failure('{0} - Could not associate a StorageDriver with this StorageRouter'.format(log_start),
                                       code=ErrorCodes.std_no_str)
                continue

            volume_fenced_states = dict((key, []) for key in cls.FENCED_HALTED_STATUS_MAP.keys())
            volume_lists = {cls.VDISK_HALTED_STATES.HALTED: [], cls.VDISK_HALTED_STATES.FENCED: []}
            volume_states = {cls.VDISK_HALTED_STATES.HALTED: {cls.VDISK_HALTED_STATES.HALTED: volume_lists[cls.VDISK_HALTED_STATES.HALTED]},
                             cls.VDISK_HALTED_STATES.FENCED: volume_fenced_states}  # Less loops to write for outputting
            result_handler.info('{0} - Scanning for halted volumes'.format(log_start), add_to_result=False)
            try:
                voldrv_client = vpool.storagedriver_client
                objectregistry_client = vpool.objectregistry_client
            except Exception:
                cls.logger.exception('{0} - Unable to instantiate the required clients'.format(log_start))
                result_handler.exception('{0} - Unable to load the Volumedriver clients'.format(log_start),
                                         code=ErrorCodes.voldr_unknown_problem)
                continue
            try:
                # Listing all halted volumes with the volumedriver client as it detects stolen volumes too (fenced instances)
                volumes = voldrv_client.list_halted_volumes(str(storagedriver.storagedriver_id))
            except Exception as ex:
                cls.logger.exception('{0} - Exception occurred when listing volumes'.format(log_start))
                if cls._is_volumedriver_timeout(ex) is False:
                    # Unhandled exception at this point
                    result_handler.exception('{0} - Unable to list the Volumes due to an unidentified problem. Please check the logging'.format(log_start),
                                             code=ErrorCodes.voldr_unknown_problem)
                else:
                    result_handler.failure('{0} - Could not list the volumes due to a connection problem.'.format(log_start),
                                           code=ErrorCodes.voldrv_connection_problem)
                continue
            # Retrieve the owner of the current volume. If this ID is not identical to the ID of the StorageDriver we queried, the volume is fenced
            # The object registry client queries Arakoon
            # Capturing any possible exception that could occur to provide a clearer view of what went wrong
            for volume in volumes:
                try:
                    registry_entry = objectregistry_client.find(volume)
                    if registry_entry.node_id() == storagedriver.storagedriver_id:
                        volume_lists[cls.VDISK_HALTED_STATES.HALTED].append(volume)
                    else:
                        # Fenced
                        volume_lists[cls.VDISK_HALTED_STATES.FENCED].append(volume)
                except Exception:
                    msg = '{0} - Unable to consult the object registry client for volume \'{1}\''.format(log_start, volume)
                    cls.logger.exception(msg)
                    result_handler.exception(msg, code=ErrorCodes.voldr_unknown_problem)
            # Include fenced - OTHER state combo
            for volume in volume_lists[cls.VDISK_HALTED_STATES.FENCED]:
                try:
                    _, state = cls._get_volume_issue(voldrv_client, volume, log_start)
                    volume_fenced_states[state].append(volume)
                except Exception:
                    # Only unhandled at this point
                    result_handler.exception('{0} - Unable to retrieve the volume info for volume {1} due to an unidentified problem. Please check the logging'.format(log_start, volume),
                                             code=ErrorCodes.voldr_unknown_problem)
            for halted_state, volume_state_info in volume_states.iteritems():
                for state, volumes in volume_state_info.iteritems():
                    if len(volumes) == 0:
                        continue  # Skip OK/empty lists
                    map_value = cls.FENCED_HALTED_STATUS_MAP[state.lower()]
                    log_func = getattr(result_handler, map_value['severity'])
                    message, code = map_value[halted_state.lower()]
                    log_func('{0} - {1}'.format(log_start, message.format(', '.join(volumes))), code=code)
            # Call success in case nothing is wrong
            if all(len(l) == 0 for l in volume_lists.values()):
                result_handler.success('{0} - No volumes found in halted/fenced state'.format(log_start))
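# Illustrative standalone sketch (names are hypothetical, not the OVS implementation): the check above
# asks the object registry which node owns each halted volume; volumes still owned by this
# StorageDriver are 'halted', volumes owned elsewhere are 'fenced'. A minimal version of that split:
def split_halted_and_fenced(volumes, local_node_id, lookup_owner_node_id):
    """Split volumes into (halted, fenced) based on the owning node id returned by the registry lookup."""
    halted, fenced = [], []
    for volume in volumes:
        owner = lookup_owner_node_id(volume)
        (halted if owner == local_node_id else fenced).append(volume)
    return halted, fenced

assert split_halted_and_fenced(['v1', 'v2'], 'node_a', lambda v: 'node_a' if v == 'v1' else 'node_b') == (['v1'], ['v2'])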
Example #38
    def get(self, request, *args, **kwargs):
        """
        Handles the OAuth2 redirect and requests an access token
        """
        _ = args, kwargs
        html_endpoint = EtcdConfiguration.get('/ovs/framework/webapps|html_endpoint')
        if 'code' not in request.GET:
            OAuth2RedirectView._logger.error('Got OAuth2 redirection request without code')
            return HttpResponseRedirect(html_endpoint)
        code = request.GET['code']
        if 'state' not in request.GET:
            OAuth2RedirectView._logger.error('Got OAuth2 redirection request without state')
            return HttpResponseRedirect(html_endpoint)
        state = request.GET['state']
        if 'error' in request.GET:
            error = request.GET['error']
            description = request.GET['error_description'] if 'error_description' in request.GET else ''
            OAuth2RedirectView._logger.error('Error {0} during OAuth2 redirection request: {1}'.format(error, description))
            return HttpResponseRedirect(html_endpoint)

        base_url = EtcdConfiguration.get('/ovs/framework/webapps|oauth2.token_uri')
        client_id = EtcdConfiguration.get('/ovs/framework/webapps|oauth2.client_id')
        client_secret = EtcdConfiguration.get('/ovs/framework/webapps|oauth2.client_secret')
        parameters = {'grant_type': 'authorization_code',
                      'redirect_url': 'https://{0}/api/oauth2/redirect/'.format(System.get_my_storagerouter().ip),
                      'client_id': client_id,
                      'code': code}
        url = '{0}?{1}'.format(base_url, urllib.urlencode(parameters))
        headers = {'Accept': 'application/json',
                   'Authorization': 'Basic {0}'.format(base64.b64encode('{0}:{1}'.format(client_id, client_secret)).strip())}
        raw_response = requests.post(url=url, headers=headers, verify=False)
        response = raw_response.json()
        if 'error' in response:
            error = response['error']
            description = response['error_description'] if 'error_description' in response else ''
            OAuth2RedirectView._logger.error('Error {0} during OAuth2 redirection access token: {1}'.format(error, description))
            return HttpResponseRedirect(html_endpoint)

        token = response['access_token']
        expires_in = response['expires_in']

        clients = ClientList.get_by_types('INTERNAL', 'CLIENT_CREDENTIALS')
        client = None
        for current_client in clients:
            if current_client.user.group.name == 'administrators':
                client = current_client
                break
        if client is None:
            OAuth2RedirectView._logger.error('Could not find INTERNAL CLIENT_CREDENTIALS client in administrator group.')
            return HttpResponseRedirect(html_endpoint)

        roles = RoleList.get_roles_by_codes(['read', 'write', 'manage'])
        access_token, _ = Toolbox.generate_tokens(client, generate_access=True, scopes=roles)
        access_token.expiration = int(time.time() + expires_in)
        access_token.access_token = token
        access_token.save()

        expires = datetime.datetime.now() + datetime.timedelta(minutes=2)
        response = HttpResponseRedirect(html_endpoint)
        response.set_cookie('state', state, expires=expires, secure=True)
        response.set_cookie('accesstoken', token, expires=expires, secure=True)

        return response
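# Illustrative standalone sketch (Python 2; client_id/client_secret are placeholders): the token
# request above authenticates with an HTTP Basic header built from the OAuth2 client credentials.
import base64

def basic_auth_headers(client_id, client_secret):
    """Build the 'Accept' and 'Authorization' headers used for the OAuth2 token request."""
    credentials = base64.b64encode('{0}:{1}'.format(client_id, client_secret)).strip()
    return {'Accept': 'application/json',
            'Authorization': 'Basic {0}'.format(credentials)}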
Example #39
 def get_local_storagerouter():
     """
     Retrieve the local Storage Router
     :return: Storage Router DAL object
     """
     return System.get_my_storagerouter()
Example #40
class VolumeDriverUpdater(ComponentUpdater):
    """
    Responsible for updating the volumedriver of a single node
    """

    logger = Logger('update-volumedriver')

    COMPONENT = 'volumedriver'
    # List with tuples: [(package_name, binary_name, binary_location, [service_prefix_0])]
    BINARIES = [(PACKAGES_EE, VOLUMEDRIVER_CMD_NAME, VOLUMEDRIVER_BIN_PATH,
                 [STORAGEDRIVER_SERVICE_BASE])
                ]  # type: List[Tuple[List[str], str, str, List[str]]]
    LOCAL_SR = System.get_my_storagerouter()
    EDGE_SYNC_TIME = 5 * 60

    @classmethod
    def restart_services(cls):
        """
        Override the service restart. The volumedrivers should be prepared for shutdown
        """
        cls.logger.info("Preparing to restart the related services")
        initial_run_steps = True
        try:
            run_number = 0
            while True:
                cls.logger.info(
                    'Attempt {0} to prepare the restart'.format(run_number))
                # Get the migration plans for every volume on this host. If there are no plans for certain volumes, it will raise
                balances_by_vpool = cls.get_vpool_balances_for_evacuating_storagerouter(
                    cls.LOCAL_SR)
                if initial_run_steps:
                    cls.logger.info(
                        'Offloading a MDS catchup to celery. This will ensure all slaves will be caught up to avoid deadlocking'
                    )
                    MDSServiceController.mds_catchup.apply_async()
                    # Plan to execute the migration. Prevent this StorageRouter from being an HA target
                    cls.mark_storagerouter_unreachable_for_ha(cls.LOCAL_SR)
                    initial_run_steps = False
                try:
                    cls.migrate_away(balances_by_vpool, cls.LOCAL_SR)
                    cls.migrate_master_mds(cls.LOCAL_SR)
                    all_prefixes = tuple(
                        itertools.chain.from_iterable(b[3]
                                                      for b in cls.BINARIES))
                    cls.logger.info("Restarting all related services")
                    return cls.restart_services_by_prefixes(all_prefixes)
                except LocalMastersRemaining:
                    # Swallow and retry
                    cls.logger.warning(
                        'Local masters still found on the machine. Will try to migrate them away'
                    )
                run_number += 1
        finally:
            if not initial_run_steps:
                cls.mark_storagerouter_reachable_for_ha(cls.LOCAL_SR)

    @staticmethod
    def get_vpool_balances_for_evacuating_storagerouter(storagerouter):
        # type: (StorageRouter) -> Dict[VPool, List[VDiskBalance]]
        """
        Retrieve the balances for every vpool on the local machine
        :param storagerouter: Storagerouter to migrate away from
        :type storagerouter: StorageRouter
        :return: Dict with vpool and balances
        :rtype: Dict[VPool, List[VDiskBalance]]
        :raises: FailedToMigrateException if not all vdisks can be moved away from this StorageRouter
        """
        errors = []
        evacuate_srs = [storagerouter.guid]
        balances_by_vpool = {}
        for storagedriver in storagerouter.storagedrivers:
            vpool = storagedriver.vpool
            try:
                balances = VDiskRebalancer.get_rebalanced_layout(
                    storagedriver.vpool_guid,
                    ignore_domains=False,
                    excluded_storagerouters=None,
                    evacuate_storagerouters=evacuate_srs,
                    base_on_volume_potential=True)
                balances_sorted = sorted(balances,
                                         key=lambda b: b.storagedriver.
                                         storagerouter_guid in evacuate_srs,
                                         reverse=True)
                balances_by_vpool[vpool] = balances_sorted
            except Exception as ex:
                errors.append((vpool, ex))
        if errors:
            formatted_errors = '\n - {0}'.format('\n - '.join(
                'VPool {0}: {1}'.format(vpool.name, error)
                for vpool, error in errors))
            raise FailedToMigrateException(
                'Unable to migrate all volumes away from this machine: {}'.
                format(formatted_errors))
        return balances_by_vpool

    @classmethod
    def mark_storagerouter_unreachable_for_ha(cls, storagerouter):
        """
        Update the node distance maps so that this StorageRouter is no longer an HA target
        Current code paths that update the node distance map on the volumedriver side are:
        - Update of domains
        - Update of vpool layout (extend/shrink)
        - cluster registry checkup (ran periodically)
        :return: None
        :rtype: NoneType
        """
        cls.logger.info(
            "Marking Storagerouter {} as unavailable for HA".format(
                storagerouter.name))
        # Set the value used in the storagedriver cluster node config path
        # This holds for all mentioned paths in the docstrings
        Configuration.set(os.path.join(VPOOL_UPDATE_KEY, storagerouter.guid),
                          0)
        # Trigger a complete reload of node distance maps
        StorageDriverController.cluster_registry_checkup()
        # Wait a few moments for the edge clients to catch up on all the configs
        sleep_time = cls.get_edge_sync_time()
        cls.logger.info(
            "Waiting {} seconds to sync up all edge clients".format(sleep_time))
        time.sleep(sleep_time)

    @classmethod
    def mark_storagerouter_reachable_for_ha(cls, storagerouter):
        # type: (StorageRouter) -> None
        """
        Update the node distance map to add the storagerouter back into the HA pool
        :param storagerouter: Storagerouter to put back into the distance map
        :type storagerouter: StorageRouter
        :return: None
        """
        cls.logger.info("Marking Storagerouter {} as available for HA".format(
            storagerouter.name))
        Configuration.delete(os.path.join(VPOOL_UPDATE_KEY,
                                          storagerouter.guid))
        # Trigger a complete reload of node distance maps
        StorageDriverController.cluster_registry_checkup()
        # Wait a few moments for the edge clients to catch up on all the configs
        sleep_time = cls.get_edge_sync_time()
        cls.logger.info(
            "Waiting {} seconds to sync up all edge clients".format(sleep_time))
        time.sleep(sleep_time)

    @classmethod
    def migrate_away(cls, balances_by_vpool, storagerouter):
        # type: (Dict[VPool, List[VDiskBalance]], StorageRouter) -> None
        """
        Migrate all volumes away
        :param balances_by_vpool: Dict with VPool as key and List of vdisk balances to execute
        :type balances_by_vpool: Dict[VPool, List[VDiskBalance]]
        :param storagerouter: Storagerouter to move away from
        :type storagerouter: StorageRouter
        :return: None
        :raises: FailureDuringMigrateException if any volumes failed to move
        """
        tasks = []
        signatures = []
        for vpool, balances in balances_by_vpool.iteritems():
            # Serialize to offload to celery. DataObjects can't be serialized yet
            serialized_balances = [b.to_dict() for b in balances]
            signature = VPoolController.execute_balance_change.si(
                vpool.guid, serialized_balances, [storagerouter.guid])
            # Freeze freezes the task into its final form. This will net the async result object we'd normally get from delaying it
            tasks.append(signature.freeze())
            signatures.append(signature)
        if signatures:
            cls.logger.info('Adding migration group with tasks {}'.format(
                ', '.join(t.id for t in tasks)))
            # Add all chain signatures to a group for parallel execution
            task_group = group(signatures)
            # Wait for the group result
            async_result = task_group.apply_async()
            cls.logger.info('Waiting for all tasks of group {}'.format(
                async_result.id))
            # Timeout similar to migrate_master_mds does not make a lot of sense. All tasks are executed in parallel
            _ = async_result.get()
        cls.logger.info("MDS migration finished")

    @classmethod
    def migrate_master_mds(cls,
                           storagerouter,
                           max_chain_size=100,
                           group_timeout=10 * 60):
        # type: (StorageRouter, Optional[int], Optional[int]) -> None
        """
        Migrate away all master mds from the given storagerouter
        :param storagerouter: Storagerouter to migrate away from
        :type storagerouter: StorageRouter
        :param max_chain_size: Maximum number of tasks within a chain. Set because https://github.com/celery/celery/issues/1078
        :type max_chain_size: int
        :param group_timeout: Timeout for the complete group. Will abort all pending tasks afterwards. Defaults to 10 mins
        :type group_timeout: int
        :return: None
        :rtype: NoneType
        """
        cls.logger.info("Starting MDS migrations")
        while True:
            hosted_vdisk_guids = storagerouter._vdisks_guids()
            vpool_mds_master_vdisks = cls.get_vdisks_mds_masters_on_storagerouter(
                storagerouter)
            all_masters_gone = sum(
                len(vds) for vds in vpool_mds_master_vdisks.values()) == 0
            if all_masters_gone:
                break
            all_tasks = []
            chains = []
            for vpool_guid, vdisk_guids in vpool_mds_master_vdisks.iteritems():
                signatures = []
                tasks = []
                for vdisk_guid in vdisk_guids[0:max_chain_size]:
                    if vdisk_guid in hosted_vdisk_guids:
                        cls.logger.warning(
                            'Skipping vDisk {} as it is still hosted on Storagerouter {}'
                            .format(vdisk_guid, storagerouter.name))
                        continue
                    cls.logger.info(
                        'Ensuring safety for {}'.format(vdisk_guid))
                    # Ensure safety is a common task. Let's timeout on the ensure single quickly to avoid worker lockups
                    signature = MDSServiceController.ensure_safety.si(
                        vdisk_guid, ensure_single_timeout=5)
                    # Freeze freezes the task into its final form. This will net the async result object we'd normally get from delaying it
                    tasks.append(signature.freeze())
                    signatures.append(signature)
                all_tasks.extend(tasks)
                if signatures:
                    cls.logger.info(
                        'Adding chain for VPool {} with tasks {}'.format(
                            vpool_guid, ', '.join(t.id for t in tasks)))
                    chains.append(chain(signatures))
            # Add all chain signatures to a group for parallel execution
            task_group = group(chains)
            # Wait for the group result
            async_result = task_group.apply_async()
            cls.logger.info('Waiting for all tasks of group {}'.format(
                async_result.id))
            try:
                _ = async_result.get(timeout=group_timeout)
            except TimeoutError:
                cls.logger.warning(
                    'Migration took longer than expected. Revoking all non-started tasks'
                )
                revoked_tasks = []
                for task in all_tasks:
                    if task.state == 'PENDING':
                        # Certain PENDING tasks cannot be revoked. It appears they're non-existent. Not even the workers know about them
                        # @todo build a new result chain and wait for that
                        task.revoke()
                        revoked_tasks.append(task)
                if revoked_tasks:
                    cls.logger.warning('Revoked migration tasks: {}'.format(
                        ', '.join(t.id for t in revoked_tasks)))
                cls.logger.warning(
                    'Waiting for the execution on the running migrations')
                _ = async_result.get()
        cls.logger.info("MDS migration finished")
        if len(hosted_vdisk_guids) > 0:
            raise LocalMastersRemaining(
                'vDisks are still hosted on Storagerouter {0} to migrate from: {1}'.format(
                    storagerouter.name, ', '.join(hosted_vdisk_guids)))

    @staticmethod
    def get_vdisks_mds_masters_on_storagerouter(storagerouter):
        # type: (StorageRouter) -> Dict[str, List[str]]
        """
        Retrieve all vdisks with the MDS master on the given storagerouter
        :param storagerouter: Storagerouter to list MDS masters on
        :type storagerouter: StorageRouter
        :return: Dict with the vpool name as key and the guids of the vdisks whose MDS master is on the storagerouter as value
        :rtype: Dict[str, List[str]]
        """
        mds_masters = {}
        vpools = set(sd.vpool for sd in storagerouter.storagedrivers)
        for vpool in sorted(vpools, key=lambda k: k.name):
            masters = []
            for mds_service in sorted(vpool.mds_services,
                                      key=lambda k: k.number):
                if mds_service.service.storagerouter_guid == storagerouter.guid:
                    for junction in mds_service.vdisks:
                        if junction.is_master:
                            masters.append(junction.vdisk_guid)
            mds_masters[vpool.name] = masters
        return mds_masters

    @staticmethod
    def get_persistent_client():
        return PersistentFactory.get_client()

    @classmethod
    def get_edge_sync_time(cls):
        # type: () -> int
        """
        Get the time required for all edge clients to do a complete sync
        :return: Time for a complete edge sync
        :rtype: int
        """
        return 2 * cls.EDGE_SYNC_TIME
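# Illustrative standalone sketch (generic names, not the OVS implementation): migrate_master_mds
# above drains work in bounded batches of at most max_chain_size items per pass and loops until
# nothing is pending, to keep individual celery chains small. The bare pattern looks like this:
def drain_in_batches(get_pending, handle_batch, max_chain_size=100):
    """Repeatedly fetch pending items and process at most max_chain_size of them per pass."""
    while True:
        pending = get_pending()
        if not pending:
            break
        handle_batch(pending[:max_chain_size])

# Example: consume a list ten items at a time.
_work = list(range(25))

def _consume(batch):
    for item in batch:
        _work.remove(item)

drain_in_batches(lambda: _work, _consume, max_chain_size=10)
assert _work == []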
Example #41
def run_kvm_watcher():
    """
    Check whether to run the KVM file watcher
    """
    return System.get_my_storagerouter().pmachine.hvtype == 'KVM'
Example #42
 def detectOvsType(self):
     return System.get_my_storagerouter().node_type
Example #43
    def get(self, request, *args, **kwargs):
        """
        Handles the OAuth2 redirect and requests an access token
        """
        _ = args, kwargs
        html_endpoint = Configuration.get(
            '/ovs/framework/webapps|html_endpoint')
        if 'code' not in request.GET:
            OAuth2RedirectView._logger.error(
                'Got OAuth2 redirection request without code')
            return HttpResponseRedirect(html_endpoint)
        code = request.GET['code']
        if 'state' not in request.GET:
            OAuth2RedirectView._logger.error(
                'Got OAuth2 redirection request without state')
            return HttpResponseRedirect(html_endpoint)
        state = request.GET['state']
        if 'error' in request.GET:
            error = request.GET['error']
            description = request.GET[
                'error_description'] if 'error_description' in request.GET else ''
            OAuth2RedirectView._logger.error(
                'Error {0} during OAuth2 redirection request: {1}'.format(
                    error, description))
            return HttpResponseRedirect(html_endpoint)

        base_url = Configuration.get('/ovs/framework/webapps|oauth2.token_uri')
        client_id = Configuration.get(
            '/ovs/framework/webapps|oauth2.client_id')
        client_secret = Configuration.get(
            '/ovs/framework/webapps|oauth2.client_secret')
        parameters = {'grant_type': 'authorization_code',
                      'redirect_url': 'https://{0}/api/oauth2/redirect/'.format(System.get_my_storagerouter().ip),
                      'client_id': client_id,
                      'code': code}
        url = '{0}?{1}'.format(base_url, urllib.urlencode(parameters))
        headers = {'Accept': 'application/json',
                   'Authorization': 'Basic {0}'.format(base64.b64encode('{0}:{1}'.format(client_id, client_secret)).strip())}
        raw_response = requests.post(url=url, headers=headers, verify=False)
        response = raw_response.json()
        if 'error' in response:
            error = response['error']
            description = response[
                'error_description'] if 'error_description' in response else ''
            OAuth2RedirectView._logger.error(
                'Error {0} during OAuth2 redirection access token: {1}'.format(
                    error, description))
            return HttpResponseRedirect(html_endpoint)

        token = response['access_token']
        expires_in = response['expires_in']

        clients = ClientList.get_by_types('INTERNAL', 'CLIENT_CREDENTIALS')
        client = None
        for current_client in clients:
            if current_client.user.group.name == 'administrators':
                client = current_client
                break
        if client is None:
            OAuth2RedirectView._logger.error(
                'Could not find INTERNAL CLIENT_CREDENTIALS client in administrator group.'
            )
            return HttpResponseRedirect(html_endpoint)

        roles = RoleList.get_roles_by_codes(['read', 'write', 'manage'])
        access_token, _ = Toolbox.generate_tokens(client,
                                                  generate_access=True,
                                                  scopes=roles)
        access_token.expiration = int(time.time() + expires_in)
        access_token.access_token = token
        access_token.save()

        expires = datetime.datetime.now() + datetime.timedelta(minutes=2)
        response = HttpResponseRedirect(html_endpoint)
        response.set_cookie('state', state, expires=expires, secure=True)
        response.set_cookie('accesstoken', token, expires=expires, secure=True)

        return response
Example #44
    def _create_vpool(self):
        """
        Needed to actually run tests on
        This is not actually a test of "Add Vpool to OVS",
        so any failure here will be reported as a setUp error and no tests will run
        """
        pmachine = System.get_my_storagerouter().pmachine
        mgmt_center = MgmtCenter(
            data={
                'name': 'Openstack',
                'description': 'test',
                'username': CINDER_USER,
                'password': CINDER_PASS,
                'ip': CINDER_CONTROLLER,
                'port': 80,
                'type': 'OPENSTACK',
                'metadata': {
                    'integratemgmt': True
                }
            })
        mgmt_center.save()
        pmachine.mgmtcenter = mgmt_center
        pmachine.save()
        self._debug('Creating vpool')
        backend_type = 'local'
        fields = ['storage_ip', 'vrouter_port']

        parameters = {
            'storagerouter_ip': IP,
            'vpool_name': VPOOL_NAME,
            'type': 'local',
            'mountpoint_bfs': VPOOL_BFS,
            'mountpoint_temp': VPOOL_TEMP,
            'mountpoint_md': VPOOL_MD,
            'mountpoint_readcaches': [VPOOL_READCACHE],
            'mountpoint_writecaches': [VPOOL_WRITECACHE],
            'mountpoint_foc': VPOOL_FOC,
            'storage_ip': '127.0.0.1',  #KVM
            'vrouter_port': VPOOL_PORT,
            'integrate_vpool': True,
            'connection_host': IP,
            'connection_port': VPOOL_PORT,
            'connection_username': '',
            'connection_password': '',
            'connection_backend': {},
        }
        StorageRouterController.add_vpool(parameters)
        attempt = 0
        while attempt < 10:
            vpool = VPoolList.get_vpool_by_name(VPOOL_NAME)
            if vpool is not None:
                self._debug('vpool %s created' % VPOOL_NAME)
                try:
                    os.listdir(VPOOL_MOUNTPOINT)
                    return vpool
                except Exception as ex:
                    #either it doesn't exist, or we don't have permission
                    self._debug('vpool not ready yet %s' % (str(ex)))
                    pass
            attempt += 1
            time.sleep(2)
        raise RuntimeError(
            'Vpool %s was not modeled correctly or did not start.' %
            VPOOL_NAME)
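# Illustrative standalone sketch (generic names): _create_vpool above polls the model until the
# vpool appears and its mountpoint is accessible, retrying a fixed number of times with a delay.
import time

def wait_until_ready(fetch, is_ready, attempts=10, delay=2):
    """Poll fetch() until is_ready(result) is True or the attempts run out."""
    for _ in range(attempts):
        result = fetch()
        if result is not None and is_ready(result):
            return result
        time.sleep(delay)
    raise RuntimeError('Resource did not become ready in time')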
Example #45
class OpenvStorageHealthCheck(object):
    """
    A healthcheck for the Open vStorage framework
    """
    MODULE = 'ovs'
    LOCAL_SR = System.get_my_storagerouter()
    LOCAL_ID = System.get_my_machine_id()

    CELERY_CHECK_TIME = 7

    @staticmethod
    @expose_to_cli(MODULE, 'log-files-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that all log files are not too big',
                   short_help='Test if log files are not too big')
    @expose_to_cli.option('--max-log-size', '-m', type=float, default=Helper.max_log_size, help='Maximum size of the file (in MB)')
    def check_size_of_log_files(result_handler, max_log_size=Helper.max_log_size):
        """
        Checks the size of the initialized log files
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param max_log_size: maximum log size of a log file (in MB)
        :type max_log_size: float
        :return: None
        :rtype: NoneType
        """
        def get_log_files_by_path(start_path, recursive=True):
            files_to_check = []
            for entry in os.listdir(start_path):
                entry_path = '{0}/{1}'.format(start_path, entry)
                if os.path.isdir(entry_path) and recursive is True:
                    files_to_check.extend(get_log_files_by_path(entry_path))
                elif entry.endswith('.log'):
                    files_to_check.append(entry_path)
            return files_to_check

        good_size = []
        too_big = []
        result_handler.info('Checking that log file sizes do not exceed {0} MB.'.format(max_log_size), add_to_result=False)

        for c_files in get_log_files_by_path('/var/log/'):
            # check if logfile is larger than max_size
            if os.stat(c_files).st_size < 1024 ** 2 * max_log_size:
                good_size.append(c_files)
                result_handler.success('Logfile {0} size is fine!'.format(c_files), code=ErrorCodes.log_file_size)
            else:
                too_big.append(c_files)
                result_handler.warning('Logfile {0} is larger than {1} MB!'.format(c_files, max_log_size), code=ErrorCodes.log_file_size)

        if len(too_big) != 0:
            result_handler.warning('The following log files are too big: {0}.'.format(', '.join(too_big)), code=ErrorCodes.log_file_size)
        else:
            result_handler.success('All log files are ok!', code=ErrorCodes.log_file_size)

    @staticmethod
    @expose_to_cli(MODULE, 'port-ranges-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that there are enough ports remaining for OVS use',
                   short_help='Test if there are enough ports remaining')
    @expose_to_cli.option('--minimal_port_amount', '-m', type=int, default=20, help='Minimal number of ports')
    def check_port_ranges(result_handler, minimal_port_amount=20):
        """
        Checks whether the expected amount of ports is available for the requested amount of ports
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param minimal_port_amount: minimal number of ports without warning
        :type minimal_port_amount: int
        :return: None
        :rtype: NoneType
        """
        # @todo: check other port ranges too
        port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(OpenvStorageHealthCheck.LOCAL_ID))
        expected_ports = System.get_free_ports(selected_range=port_range, amount=0)
        result_handler.info('Checking if enough ports are still available for OpenvStorage')
        if len(expected_ports) >= minimal_port_amount:
            result_handler.success('{} ports free'.format(len(expected_ports)))
        else:
            result_handler.warning('{} ports found, less than {}'.format(len(expected_ports), minimal_port_amount))

    @staticmethod
    @expose_to_cli(MODULE, 'nginx-ports-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that NGINX is reachable',
                   short_help='Test if NGINX is reachable')
    def check_nginx_ports(result_handler):
        """
        Checks if this node can connect to its own Nginx
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        return OpenvStorageHealthCheck._check_extra_ports(result_handler, 'nginx')

    @staticmethod
    @expose_to_cli(MODULE, 'memcached-ports-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that memcached is reachable',
                   short_help='Test if memcached is reachable')
    def check_memcached_ports(result_handler):
        """
        Checks the connection of this node to all Memcached endpoints
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        memcached_ips = [endpoint.rsplit(':')[0] for endpoint in Configuration.get('ovs/framework/memcache|endpoints', default=[])]
        return OpenvStorageHealthCheck._check_extra_ports(result_handler, 'memcached', ips=memcached_ips)

    @staticmethod
    def _check_extra_ports(result_handler, key, ips=None):
        """
        Checks the extra ports for key specified in the settings.json
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param key: check all ports for this key
        :type key: string
        :param ips: IPs to check the port on. Default the local IP
        :type ips: list
        :return: None
        :rtype: NoneType
        """
        if ips is None:
            ips = [OpenvStorageHealthCheck.LOCAL_SR.ip]
        result_handler.info('Checking {0} ports'.format(key), add_to_result=False)
        if key not in Helper.extra_ports:
            raise RuntimeError('Settings.json is incorrect! The extra ports to check do not include {0}'.format(key))
        for port in Helper.extra_ports[key]:
            for ip in ips:
                result_handler.info('Checking socket {0}:{1} of service {2}.'.format(ip, port, key), add_to_result=False)
                result = NetworkHelper.check_port_connection(port, ip)
                if result:
                    result_handler.success('Connection successfully established to service {0} on {1}:{2}'.format(key, ip, port), code=getattr(ErrorCodes, 'port_{0}'.format(key)))
                else:
                    result_handler.failure('Connection FAILED to service {0} on {1}:{2}'.format(key, ip, port), code=getattr(ErrorCodes, 'port_{0}'.format(key)))

    @staticmethod
    @expose_to_cli(MODULE, 'celery-ports-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that RabbitMQ is reachable',
                   short_help='Test if RabbitMQ is reachable')
    def check_rabbitmq_ports(result_handler):
        """
        Checks all ports of Open vStorage components rabbitMQ and celery
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        # Check Celery and RabbitMQ
        if OpenvStorageHealthCheck.LOCAL_SR.node_type != 'MASTER':
            result_handler.skip('RabbitMQ is not running/active on this server!')
            return
        result_handler.info('Checking Celery.', add_to_result=False)
        from errno import errorcode
        try:
            # noinspection PyUnresolvedReferences
            from celery.task.control import inspect
            stats = inspect().stats()
            if stats:
                result_handler.success('Successfully connected to Celery on all nodes.', code=ErrorCodes.port_celery)
            else:
                result_handler.failure('No running Celery workers were found.', code=ErrorCodes.port_celery)
        except IOError as ex:
            msg = 'Could not connect to Celery. Got {0}.'.format(ex)
            if len(ex.args) > 0 and errorcode.get(ex.args[0]) == 'ECONNREFUSED':
                msg += ' Check that the RabbitMQ server is running.'
            result_handler.failure(msg, code=ErrorCodes.port_celery)
        except ImportError as ex:
            result_handler.failure('Could not import the celery module. Got {}'.format(str(ex)), code=ErrorCodes.port_celery)

    @staticmethod
    @expose_to_cli(MODULE, 'packages-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that all required packages are installed',
                   short_help='Test if all required packages are installed')
    def check_ovs_packages(result_handler):
        """
        Checks the availability of packages for Open vStorage
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking OVS packages: ', add_to_result=False)
        client = SSHClient(OpenvStorageHealthCheck.LOCAL_SR)
        package_manager = PackageFactory.get_manager()
        # Get all base packages
        base_packages = set()
        for names in package_manager.package_info['names'].itervalues():
            base_packages = base_packages.union(names)
        base_packages = list(base_packages)
        extra_packages = Helper.packages
        installed = package_manager.get_installed_versions(client=client, package_names=base_packages)
        installed.update(package_manager.get_installed_versions(client=client, package_names=Helper.packages))
        for package in base_packages + extra_packages:
            version = installed.get(package)
            if version:
                version = str(version)
                result_handler.success('Package {0} is installed with version {1}'.format(package, version),
                                       code=ErrorCodes.package_required)
            else:
                if package in package_manager.package_info['mutually_exclusive']:
                    # Mutually exclusive package, so ignore
                    continue
                if package in base_packages:
                    result_handler.warning('Package {0} is not installed.'.format(package),
                                           code=ErrorCodes.package_required)
                elif package in extra_packages:
                    result_handler.skip('Package {0} is not installed.'.format(package))

    @staticmethod
    @expose_to_cli(MODULE, 'processes-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that all OVS related processes are running',
                   short_help='Test if OVS processes are running')
    def check_ovs_processes(result_handler):
        """
        Checks the availability of processes for Open vStorage
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking local ovs services.')
        client = SSHClient(System.get_my_storagerouter())
        service_manager = ServiceFactory.get_manager()
        services = [service for service in service_manager.list_services(client=client) if service.startswith(OpenvStorageHealthCheck.MODULE)]
        if len(services) == 0:
            result_handler.warning('Found no local ovs services.')
        for service_name in services:
            if service_manager.get_service_status(service_name, client) == 'active':
                result_handler.success('Service {0} is running!'.format(service_name), code=ErrorCodes.process_fwk)
            else:
                result_handler.failure('Service {0} is not running, please check this.'.format(service_name), code=ErrorCodes.process_fwk)

    @staticmethod
    @timeout(CELERY_CHECK_TIME)
    def _check_celery():
        """
        Preliminary/Simple check for Celery and RabbitMQ component
        """
        # try if celery works smoothly
        try:
            machine_id = OpenvStorageHealthCheck.LOCAL_SR.machine_id
            obj = StorageRouterController.get_support_info.s().apply_async(routing_key='sr.{0}'.format(machine_id)).get()
        except TimeoutError as ex:
            raise TimeoutError('{0}: Process is taking too long!'.format(ex.value))
        if obj:
            return True
        else:
            return False

    @staticmethod
    @expose_to_cli(MODULE, 'workers-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the ovs-workers are working',
                   short_help='Test if the ovs-workers are working')
    def check_ovs_workers(result_handler):
        """
        Extended check of the Open vStorage workers; When the simple check fails, it will execute a full/deep check.
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking if OVS-WORKERS are running smoothly.', add_to_result=False)
        # Checking celery
        try:
            # Basic celery check
            OpenvStorageHealthCheck._check_celery()
            result_handler.success('The OVS-WORKERS are working smoothly!', code=ErrorCodes.process_celery_timeout)
        except TimeoutError:
            # Apparently the basic check failed, so we are going crazy
            result_handler.failure('The test timed out after {0}s! Are RabbitMQ and the ovs-workers running?'.format(OpenvStorageHealthCheck.CELERY_CHECK_TIME), code=ErrorCodes.process_celery_timeout)
        except Exception as ex:
            result_handler.failure('The celery check has failed with {0}'.format(str(ex)), code=ErrorCodes.process_celery_timeout)

    @staticmethod
    @expose_to_cli(MODULE, 'directories-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that all OVS related directories have correct ownership and rights',
                   short_help='Test if directories their ownership and rights are correct')
    def check_required_dirs(result_handler):
        """
        Checks the rights and owners of the directories for mistakes
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking if owner rights are set correctly on certain directories.', add_to_result=False)
        for dirname, owner_settings in Helper.owners_files.iteritems():
            # check if directory/file exists
            if os.path.exists(dirname):
                if owner_settings.get('user') == FilesystemHelper.get_owner_of_file(dirname) \
                        and owner_settings.get('group') == FilesystemHelper.get_group_of_file(dirname):
                    result_handler.success('Directory {0} has correct owners!'.format(dirname), code=ErrorCodes.directory_ownership_incorrect)
                else:
                    result_handler.warning('Directory {0} has INCORRECT owners! It must be OWNED by USER={1} and GROUP={2}'.format(dirname, owner_settings.get('user'), owner_settings.get('group')),
                                           code=ErrorCodes.directory_ownership_incorrect)
            else:
                result_handler.skip('Directory {0} does not exist!'.format(dirname))

        result_handler.info('Checking if rights are set correctly on certain directories.', add_to_result=False)
        for dirname, rights in Helper.rights_dirs.iteritems():
            # check if directory/file exists
            if os.path.exists(dirname):
                if FilesystemHelper.check_rights_of_file(dirname, rights):
                    result_handler.success('Directory {0} has correct rights!'.format(dirname), code=ErrorCodes.directory_rights_incorrect)
                else:
                    result_handler.warning('Directory {0} has INCORRECT rights! It must be CHMOD={1} '.format(dirname, rights),
                                           code=ErrorCodes.directory_rights_incorrect)
            else:
                result_handler.skip('Directory {0} does not exist!'.format(dirname))

    @staticmethod
    @expose_to_cli(MODULE, 'dns-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the node can resolve DNS names',
                   short_help='Test if the node can resolve DNS names')
    def check_if_dns_resolves(result_handler, fqdn='google.com'):
        """
        Checks if DNS resolving works on a local machine
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param fqdn: fully qualified domain name to resolve
        :type fqdn: str
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking DNS resolving.', add_to_result=False)
        result = NetworkHelper.check_if_dns_resolves(fqdn)
        if result is True:
            result_handler.success('DNS resolving works!', code=ErrorCodes.dns_resolve_fail)
        else:
            result_handler.warning('DNS resolving does not work. Please check /etc/resolv.conf or add a correct DNS server and make it immutable: "sudo chattr +i /etc/resolv.conf"!',
                                   code=ErrorCodes.dns_resolve_fail)

    @staticmethod
    @expose_to_cli(MODULE, 'zombie-processes-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that no zombie processes are running on the machine',
                   short_help='Test if there are zombie processes running')
    def check_zombied_and_dead_processes(result_handler):
        """
        Finds zombie or dead processes on a local machine
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        zombie_processes = []
        dead_processes = []

        result_handler.info('Checking for zombie/dead processes.', add_to_result=False)

        # Check for zombie'd and dead processes
        for proc in psutil.process_iter():
            try:
                pinfo = proc.as_dict(attrs=['pid', 'name', 'status'])
            except psutil.NoSuchProcess:
                pass
            else:
                if pinfo.get('status') == psutil.STATUS_ZOMBIE:
                    zombie_processes.append('{0}({1})'.format(pinfo.get('name'), pinfo.get('pid')))

                if pinfo.get('status') == psutil.STATUS_DEAD:
                    dead_processes.append('{0}({1})'.format(pinfo.get('name'), pinfo.get('pid')))

        # Check if there are zombie processes
        if len(zombie_processes) == 0:
            result_handler.success('There are no zombie processes on this node!', code=ErrorCodes.process_zombie_found)
        else:
            result_handler.warning('We DETECTED zombie processes on this node: {0}'.format(', '.join(zombie_processes)),
                                   code=ErrorCodes.process_zombie_found)

        # Check if there are dead processes
        if len(dead_processes) == 0:
            result_handler.success('There are no dead processes on this node!', code=ErrorCodes.process_dead_found)
        else:
            result_handler.warning('We DETECTED dead processes on this node: {0}'.format(', '.join(dead_processes)),
                                   code=ErrorCodes.process_dead_found)

    @staticmethod
    @expose_to_cli(MODULE, 'model-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the Framework model is in sync with the one from the Volumedriver',
                   short_help='Test if Framework and Volumedriver model match')
    def check_model_consistency(result_handler):
        """
        Checks the model consistency of OVSDB vs. VOLUMEDRIVER and does a preliminary check on RABBITMQ
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking model consistency: ')

        # Checking consistency of volumedriver vs. ovsdb and backwards
        for vp in VPoolList.get_vpools():
            if vp.guid not in OpenvStorageHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
                continue
            result_handler.info('Checking consistency of volumedriver vs. ovsdb for {0}: '.format(vp.name), add_to_result=False)
            missing_in_volumedriver = []
            missing_in_model = []
            try:
                # noinspection PyArgumentList
                voldrv_volume_list = vp.storagedriver_client.list_volumes()
            except (ClusterNotReachableException, RuntimeError) as ex:
                result_handler.warning('Seems like the volumedriver {0} is not running. Got {1}'.format(vp.name, str(ex)),
                                       code=ErrorCodes.voldrv_connection_problem)
                continue

            vdisk_volume_ids = []
            # Cross-reference model vs. volumedriver
            for vdisk in vp.vdisks:
                vdisk_volume_ids.append(vdisk.volume_id)
                if vdisk.volume_id not in voldrv_volume_list:
                    missing_in_volumedriver.append(vdisk.guid)
                else:
                    voldrv_volume_list.remove(vdisk.volume_id)
            # Cross-reference volumedriver vs. model
            for voldrv_id in voldrv_volume_list:
                if voldrv_id not in vdisk_volume_ids:
                    missing_in_model.append(voldrv_id)

            # Display discrepancies for vPool
            if len(missing_in_volumedriver) != 0:
                result_handler.warning('Detected volumes that are MISSING in volumedriver but are in ovsdb in vpool: {0} - vdisk guid(s):{1}.'
                                       .format(vp.name, ' '.join(missing_in_volumedriver)),
                                       code=ErrorCodes.missing_volumedriver)
            else:
                result_handler.success('No discrepancies found for ovsdb in vPool {0}'.format(vp.name), code=ErrorCodes.missing_volumedriver)

            if len(missing_in_model) != 0:
                result_handler.warning('Detected volumes that are AVAILABLE in volumedriver but are not in ovsdb in vpool: {0} - vdisk volume id(s):{1}'
                                       .format(vp.name, ', '.join(missing_in_model)),
                                       code=ErrorCodes.missing_ovsdb)
            else:
                result_handler.success('No discrepancies found for voldrv in vpool {0}'.format(vp.name), code=ErrorCodes.missing_ovsdb)

    @staticmethod
    @expose_to_cli(MODULE, 'verify-rabbitmq-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that RabbitMQ is properly running',
                   short_help='Test if RabbitMQ is properly running')
    def verify_rabbitmq(result_handler):
        """
        Verify rabbitmq
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        # RabbitMQ check: cluster verification
        result_handler.info('Pre-check: verification of RabbitMQ cluster.', add_to_result=False)
        if OpenvStorageHealthCheck.LOCAL_SR.node_type == 'MASTER':
            r = RabbitMQ(ip=OpenvStorageHealthCheck.LOCAL_SR.ip)
            partitions = r.partition_status()
            if len(partitions) == 0:
                result_handler.success('RabbitMQ has no partition issues!', code=ErrorCodes.process_rabbit_mq)
            else:
                result_handler.failure('RabbitMQ has partition issues: {0}'.format(', '.join(partitions)), code=ErrorCodes.process_rabbit_mq)
        else:
            result_handler.skip('RabbitMQ is not running/active on this server!')

    @staticmethod
    @expose_to_cli(MODULE, 'recovery-domain-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that all recovery domains are setup as regular domain',
                   short_help='Test if all recovery domains are setup as regular domain')
    def check_recovery_domains(result_handler):
        """
        Checks whether every recovery domain is also configured as a regular domain somewhere
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking recovery domains:')
        prim_domains = [domain.name for domain in DomainList.get_domains() if len(domain.storage_router_layout['regular']) >= 1]
        for domain in DomainList.get_domains():
            layout = domain.storage_router_layout
            recovery = layout['recovery']
            regular = layout['regular']
            # Check recovery usage
            if len(recovery) >= 1 and domain.name not in prim_domains:
                sr_ips = ', '.join([StorageRouter(guid).ip for guid in recovery])
                result_handler.warning('Domain {0} set as recovery domain on storagerouter(s) {1}, but nowhere as regular domain'.format(domain.name, sr_ips))
            else:
                result_handler.info('Domain {0} passed test, set {1} time(s) as regular domain'.format(domain.name, len(regular)))

            # Check for double usage
            intersection = set(recovery).intersection(regular)
            if intersection:
                sr_ips = ', '.join([StorageRouter(guid).ip for guid in intersection])
                result_handler.warning('Recovery domain {0} is also found to be a regular domain in {1}.'.format(domain.name, sr_ips))
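# Illustrative standalone sketches (assumptions, not the OVS NetworkHelper implementation): the
# port and DNS checks above boil down to a TCP connect attempt and a name resolution attempt.
import socket

def can_connect(ip, port, timeout=2):
    """Return True when a TCP connection to ip:port succeeds within the timeout."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    try:
        return sock.connect_ex((ip, port)) == 0
    finally:
        sock.close()

def dns_resolves(fqdn='google.com'):
    """Return True when the name resolves without error."""
    try:
        socket.gethostbyname(fqdn)
        return True
    except socket.error:
        return False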
Example #46
    def remove_node(node_ip, silent=None):
        """
        Remove the node with specified IP from the cluster
        :param node_ip: IP of the node to remove
        :type node_ip: str
        :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
        :type silent: str
        :return: None
        """
        from ovs.lib.storagedriver import StorageDriverController
        from ovs.lib.storagerouter import StorageRouterController
        from ovs.dal.lists.storagerouterlist import StorageRouterList

        Toolbox.log(logger=NodeRemovalController._logger, messages="Remove node", boxed=True)
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages="WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n",
        )

        ###############
        # VALIDATIONS #
        ###############
        try:
            if not isinstance(node_ip, str):
                raise ValueError("Node IP must be a string")
            node_ip = node_ip.strip()
            if not re.match(SSHClient.IP_REGEX, node_ip):
                raise ValueError("Invalid IP {0} specified".format(node_ip))

            storage_router_all = StorageRouterList.get_storagerouters()
            storage_router_masters = StorageRouterList.get_masters()
            storage_router_all_ips = set([storage_router.ip for storage_router in storage_router_all])
            storage_router_master_ips = set([storage_router.ip for storage_router in storage_router_masters])
            storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)

            if node_ip not in storage_router_all_ips:
                raise ValueError(
                    "Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}".format(
                        "\n - ".join(storage_router_all_ips), node_ip
                    )
                )

            if len(storage_router_all_ips) == 1:
                raise RuntimeError("Removing the only node is not possible")

            if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
                raise RuntimeError("Removing the only master node is not possible")

            if System.get_my_storagerouter() == storage_router_to_remove:
                raise RuntimeError(
                    "The node to be removed cannot be identical to the node on which the removal is initiated"
                )

            Toolbox.log(
                logger=NodeRemovalController._logger, messages="Creating SSH connections to remaining master nodes"
            )
            master_ip = None
            ip_client_map = {}
            storage_routers_offline = []
            storage_router_to_remove_online = True
            for storage_router in storage_router_all:
                try:
                    client = SSHClient(storage_router, username="******")
                    if client.run(["pwd"]):
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages="  Node with IP {0:<15} successfully connected to".format(storage_router.ip),
                        )
                        ip_client_map[storage_router.ip] = client
                        if storage_router != storage_router_to_remove and storage_router.node_type == "MASTER":
                            master_ip = storage_router.ip
                except UnableToConnectException:
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages="  Node with IP {0:<15} is unreachable".format(storage_router.ip),
                    )
                    storage_routers_offline.append(storage_router)
                    if storage_router == storage_router_to_remove:
                        storage_router_to_remove_online = False

            if len(ip_client_map) == 0 or master_ip is None:
                raise RuntimeError("Could not connect to any master node in the cluster")

            storage_router_to_remove.invalidate_dynamics("vdisks_guids")
            if (
                len(storage_router_to_remove.vdisks_guids) > 0
            ):  # vDisks are supposed to be moved away manually before removing a node
                raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

            internal_memcached = Toolbox.is_service_internally_managed(service="memcached")
            internal_rabbit_mq = Toolbox.is_service_internally_managed(service="rabbitmq")
            memcached_endpoints = Configuration.get(key="/ovs/framework/memcache|endpoints")
            rabbit_mq_endpoints = Configuration.get(key="/ovs/framework/messagequeue|endpoints")
            copy_memcached_endpoints = list(memcached_endpoints)
            copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
            for endpoint in memcached_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_memcached_endpoints.remove(endpoint)
            for endpoint in rabbit_mq_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_rabbit_mq_endpoints.remove(endpoint)
            if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
                raise RuntimeError(
                    "Removal of provided nodes will result in a complete removal of the memcached service"
                )
            if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
                raise RuntimeError(
                    "Removal of provided nodes will result in a complete removal of the messagequeue service"
                )
        except Exception as exception:
            Toolbox.log(
                logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel="exception"
            )
            sys.exit(1)

        #################
        # CONFIRMATIONS #
        #################
        interactive = silent != "--force-yes"
        remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
        if interactive is True:
            proceed = Interactive.ask_yesno(
                message="Are you sure you want to remove node {0}?".format(storage_router_to_remove.name),
                default_value=False,
            )
            if proceed is False:
                Toolbox.log(logger=NodeRemovalController._logger, messages="Abort removal", title=True)
                sys.exit(1)

            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username="******")
                if ServiceManager.has_service(name="asd-manager", client=client):
                    remove_asd_manager = Interactive.ask_yesno(
                        message="Do you also want to remove the ASD manager and related ASDs?", default_value=False
                    )

            if remove_asd_manager is True or storage_router_to_remove_online is False:
                for function in Toolbox.fetch_hooks("setup", "validate_asd_removal"):
                    validation_output = function(storage_router_to_remove.ip)
                    if validation_output["confirm"] is True:
                        if Interactive.ask_yesno(message=validation_output["question"], default_value=False) is False:
                            remove_asd_manager = False
                            break

        ###########
        # REMOVAL #
        ###########
        try:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages="Starting removal of node {0} - {1}".format(
                    storage_router_to_remove.name, storage_router_to_remove.ip
                ),
            )
            if storage_router_to_remove_online is False:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages="  Marking all Storage Drivers served by Storage Router {0} as offline".format(
                        storage_router_to_remove.ip
                    ),
                )
                StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

            # Remove vPools
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages="  Removing vPools from node".format(storage_router_to_remove.ip),
            )
            storage_routers_offline_guids = [
                sr.guid for sr in storage_routers_offline if sr.guid != storage_router_to_remove.guid
            ]
            for storage_driver in storage_router_to_remove.storagedrivers:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages="    Removing vPool {0} from node".format(storage_driver.vpool.name),
                )
                StorageRouterController.remove_storagedriver(
                    storagedriver_guid=storage_driver.guid, offline_storage_router_guids=storage_routers_offline_guids
                )

            # Demote if MASTER
            if storage_router_to_remove.node_type == "MASTER":
                NodeTypeController.demote_node(
                    cluster_ip=storage_router_to_remove.ip,
                    master_ip=master_ip,
                    ip_client_map=ip_client_map,
                    unique_id=storage_router_to_remove.machine_id,
                    unconfigure_memcached=internal_memcached,
                    unconfigure_rabbitmq=internal_rabbit_mq,
                    offline_nodes=storage_routers_offline,
                )

            # Stop / remove services
            Toolbox.log(logger=NodeRemovalController._logger, messages="Stopping and removing services")
            config_store = Configuration.get_store()
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username="******")
                NodeRemovalController.remove_services(
                    client=client,
                    node_type=storage_router_to_remove.node_type.lower(),
                    logger=NodeRemovalController._logger,
                )
                service = "watcher-config"
                if ServiceManager.has_service(service, client=client):
                    Toolbox.log(logger=NodeRemovalController._logger, messages="Removing service {0}".format(service))
                    ServiceManager.stop_service(service, client=client)
                    ServiceManager.remove_service(service, client=client)

                if config_store == "etcd":
                    from ovs.extensions.db.etcd.installer import EtcdInstaller

                    if Configuration.get(key="/ovs/framework/external_config") is None:
                        Toolbox.log(logger=NodeRemovalController._logger, messages="      Removing Etcd cluster")
                        try:
                            EtcdInstaller.stop("config", client)
                            EtcdInstaller.remove("config", client)
                        except Exception as ex:
                            Toolbox.log(
                                logger=NodeRemovalController._logger,
                                messages=["\nFailed to unconfigure Etcd", ex],
                                loglevel="exception",
                            )

                    Toolbox.log(logger=NodeRemovalController._logger, messages="Removing Etcd proxy")
                    EtcdInstaller.remove_proxy("config", client.ip)

            Toolbox.run_hooks(
                component="noderemoval",
                sub_component="remove",
                logger=NodeRemovalController._logger,
                cluster_ip=storage_router_to_remove.ip,
                complete_removal=remove_asd_manager,
            )

            # Clean up model
            Toolbox.log(logger=NodeRemovalController._logger, messages="Removing node from model")
            for service in storage_router_to_remove.services:
                service.delete()
            for disk in storage_router_to_remove.disks:
                for partition in disk.partitions:
                    partition.delete()
                disk.delete()
            for j_domain in storage_router_to_remove.domains:
                j_domain.delete()
            Configuration.delete("/ovs/framework/hosts/{0}".format(storage_router_to_remove.machine_id))

            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map,
                offline_node_ips=[node.ip for node in storage_routers_offline],
                logger=NodeRemovalController._logger,
            )

            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username="******")
                if config_store == "arakoon":
                    client.file_delete(filenames=[ArakoonConfiguration.CACC_LOCATION])
                client.file_delete(filenames=[Configuration.BOOTSTRAP_CONFIG_LOCATION])
            storage_router_to_remove.delete()
            Toolbox.log(logger=NodeRemovalController._logger, messages="Successfully removed node\n")
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=["An unexpected error occurred:", str(exception)],
                boxed=True,
                loglevel="exception",
            )
            sys.exit(1)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages="This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.",
                boxed=True,
                loglevel="error",
            )
            sys.exit(1)

        if remove_asd_manager is True:
            Toolbox.log(logger=NodeRemovalController._logger, messages="\nRemoving ASD Manager")
            with remote(storage_router_to_remove.ip, [os]) as rem:
                rem.os.system("asd-manager remove --force-yes")
        Toolbox.log(logger=NodeRemovalController._logger, messages="Remove nodes finished", title=True)
Example #47
    def remove_node(node_ip, silent=None):
        """
        Remove the node with specified IP from the cluster
        :param node_ip: IP of the node to remove
        :type node_ip: str
        :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
        :type silent: str
        :return: None
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.lib.storagedriver import StorageDriverController
        from ovs.lib.vpool import VPoolController

        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Remove node',
                    boxed=True)
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=
            'WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n'
        )
        service_manager = ServiceFactory.get_manager()

        ###############
        # VALIDATIONS #
        ###############
        try:
            if not isinstance(node_ip, str):
                raise ValueError('Node IP must be a string')
            node_ip = node_ip.strip()
            if not re.match(SSHClient.IP_REGEX, node_ip):
                raise ValueError('Invalid IP {0} specified'.format(node_ip))

            storage_router_all = sorted(StorageRouterList.get_storagerouters(),
                                        key=lambda k: k.name)
            storage_router_masters = StorageRouterList.get_masters()
            storage_router_all_ips = set(
                [storage_router.ip for storage_router in storage_router_all])
            storage_router_master_ips = set([
                storage_router.ip for storage_router in storage_router_masters
            ])
            storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)
            offline_reasons = {}
            if node_ip not in storage_router_all_ips:
                raise ValueError(
                    'Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}'
                    .format('\n - '.join(storage_router_all_ips), node_ip))

            if len(storage_router_all_ips) == 1:
                raise RuntimeError("Removing the only node is not possible")

            if node_ip in storage_router_master_ips and len(
                    storage_router_master_ips) == 1:
                raise RuntimeError(
                    "Removing the only master node is not possible")

            if System.get_my_storagerouter() == storage_router_to_remove:
                raise RuntimeError(
                    'The node to be removed cannot be identical to the node on which the removal is initiated'
                )

            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages='Creating SSH connections to remaining master nodes')
            master_ip = None
            ip_client_map = {}
            storage_routers_offline = []
            storage_router_to_remove_online = True
            for storage_router in storage_router_all:
                try:
                    client = SSHClient(storage_router,
                                       username='******',
                                       timeout=10)
                except (UnableToConnectException, NotAuthenticatedException,
                        TimeOutException) as ex:
                    if isinstance(ex, UnableToConnectException):
                        msg = 'Unable to connect'
                    elif isinstance(ex, NotAuthenticatedException):
                        msg = 'Could not authenticate'
                    elif isinstance(ex, TimeOutException):
                        msg = 'Connection timed out'
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='  * Node with IP {0:<15}- {1}'.format(
                            storage_router.ip, msg))
                    offline_reasons[storage_router.ip] = msg
                    storage_routers_offline.append(storage_router)
                    if storage_router == storage_router_to_remove:
                        storage_router_to_remove_online = False
                    continue

                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages='  * Node with IP {0:<15}- Successfully connected'
                    .format(storage_router.ip))
                ip_client_map[storage_router.ip] = client
                if storage_router != storage_router_to_remove and storage_router.node_type == 'MASTER':
                    master_ip = storage_router.ip

            if len(ip_client_map) == 0 or master_ip is None:
                raise RuntimeError(
                    'Could not connect to any master node in the cluster')

            storage_router_to_remove.invalidate_dynamics('vdisks_guids')
            if len(storage_router_to_remove.vdisks_guids) > 0:  # vDisks are supposed to be moved away manually before removing a node
                raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

            internal_memcached = Toolbox.is_service_internally_managed(
                service='memcached')
            internal_rabbit_mq = Toolbox.is_service_internally_managed(
                service='rabbitmq')
            memcached_endpoints = Configuration.get(
                key='/ovs/framework/memcache|endpoints')
            rabbit_mq_endpoints = Configuration.get(
                key='/ovs/framework/messagequeue|endpoints')
            copy_memcached_endpoints = list(memcached_endpoints)
            copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
            for endpoint in memcached_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_memcached_endpoints.remove(endpoint)
            for endpoint in rabbit_mq_endpoints:
                if endpoint.startswith(storage_router_to_remove.ip):
                    copy_rabbit_mq_endpoints.remove(endpoint)
            if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
                raise RuntimeError(
                    'Removal of provided nodes will result in a complete removal of the memcached service'
                )
            if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
                raise RuntimeError(
                    'Removal of provided nodes will result in a complete removal of the messagequeue service'
                )

            Toolbox.run_hooks(component='noderemoval',
                              sub_component='validate_removal',
                              logger=NodeRemovalController._logger,
                              cluster_ip=storage_router_to_remove.ip)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'Removal has been aborted during the validation step. No changes have been applied.',
                boxed=True,
                loglevel='warning')
            sys.exit(1)
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=[str(exception)],
                        boxed=True,
                        loglevel='exception')
            sys.exit(1)

        #################
        # CONFIRMATIONS #
        #################
        try:
            interactive = silent != '--force-yes'
            remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
            if interactive is True:
                if len(storage_routers_offline) > 0:
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages=
                        'Certain nodes appear to be offline. These will not be fully removed and will cause issues if they are not actually offline.'
                    )
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Offline nodes: {0}'.format(''.join(
                            ('\n  * {0:<15}- {1}.'.format(ip, message)
                             for ip, message in offline_reasons.iteritems()))))
                    valid_node_info = Interactive.ask_yesno(
                        message=
                        'Continue the removal, assuming these nodes are really offline?',
                        default_value=False)
                    if valid_node_info is False:
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages=
                            'Please validate the state of the nodes before removing.',
                            title=True)
                        sys.exit(1)
                proceed = Interactive.ask_yesno(
                    message='Are you sure you want to remove node {0}?'.format(
                        storage_router_to_remove.name),
                    default_value=False)
                if proceed is False:
                    Toolbox.log(logger=NodeRemovalController._logger,
                                messages='Abort removal',
                                title=True)
                    sys.exit(1)

                remove_asd_manager = True
                if storage_router_to_remove_online is True:
                    client = SSHClient(endpoint=storage_router_to_remove,
                                       username='******')
                    if service_manager.has_service(name='asd-manager',
                                                   client=client):
                        remove_asd_manager = Interactive.ask_yesno(
                            message=
                            'Do you also want to remove the ASD manager and related ASDs?',
                            default_value=False)

                if remove_asd_manager is True or storage_router_to_remove_online is False:
                    for fct in Toolbox.fetch_hooks('noderemoval',
                                                   'validate_asd_removal'):
                        validation_output = fct(storage_router_to_remove.ip)
                        if validation_output['confirm'] is True:
                            if Interactive.ask_yesno(
                                    message=validation_output['question'],
                                    default_value=False) is False:
                                remove_asd_manager = False
                                break
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'Removal has been aborted during the confirmation step. No changes have been applied.',
                boxed=True,
                loglevel='warning')
            sys.exit(1)
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=[str(exception)],
                        boxed=True,
                        loglevel='exception')
            sys.exit(1)
        ###########
        # REMOVAL #
        ###########
        try:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Starting removal of node {0} - {1}'.format(
                            storage_router_to_remove.name,
                            storage_router_to_remove.ip))
            if storage_router_to_remove_online is False:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages=
                    '  Marking all Storage Drivers served by Storage Router {0} as offline'
                    .format(storage_router_to_remove.ip))
                StorageDriverController.mark_offline(
                    storagerouter_guid=storage_router_to_remove.guid)

            # Remove vPools
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='  Removing vPools from node {0}'.format(
                            storage_router_to_remove.ip))
            storage_routers_offline_guids = [
                sr.guid for sr in storage_routers_offline
                if sr.guid != storage_router_to_remove.guid
            ]
            for storage_driver in storage_router_to_remove.storagedrivers:
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='    Removing vPool {0} from node'.format(
                                storage_driver.vpool.name))
                VPoolController.shrink_vpool(
                    storagedriver_guid=storage_driver.guid,
                    offline_storage_router_guids=storage_routers_offline_guids)

            # Demote if MASTER
            if storage_router_to_remove.node_type == 'MASTER':
                NodeTypeController.demote_node(
                    cluster_ip=storage_router_to_remove.ip,
                    master_ip=master_ip,
                    ip_client_map=ip_client_map,
                    unique_id=storage_router_to_remove.machine_id,
                    unconfigure_memcached=internal_memcached,
                    unconfigure_rabbitmq=internal_rabbit_mq,
                    offline_nodes=storage_routers_offline)

            # Stop / remove services
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Stopping and removing services')
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove,
                                   username='******')
                NodeRemovalController.remove_services(
                    client=client,
                    node_type=storage_router_to_remove.node_type.lower(),
                    logger=NodeRemovalController._logger)
                service = 'watcher-config'
                if service_manager.has_service(service, client=client):
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Removing service {0}'.format(service))
                    service_manager.stop_service(service, client=client)
                    service_manager.remove_service(service, client=client)

            Toolbox.run_hooks(component='noderemoval',
                              sub_component='remove',
                              logger=NodeRemovalController._logger,
                              cluster_ip=storage_router_to_remove.ip,
                              complete_removal=remove_asd_manager)

            # Clean up model
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Removing node from model')
            for service in storage_router_to_remove.services:
                service.delete()
            for disk in storage_router_to_remove.disks:
                for partition in disk.partitions:
                    partition.delete()
                disk.delete()
            for j_domain in storage_router_to_remove.domains:
                j_domain.delete()
            Configuration.delete('/ovs/framework/hosts/{0}'.format(
                storage_router_to_remove.machine_id))

            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map,
                offline_node_ips=[node.ip for node in storage_routers_offline],
                logger=NodeRemovalController._logger)

            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove,
                                   username='******')
                client.file_delete(filenames=[CACC_LOCATION])
                client.file_delete(filenames=[CONFIG_STORE_LOCATION])
            storage_router_to_remove.delete()
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='Successfully removed node\n')
        except Exception as exception:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=['An unexpected error occurred:',
                          str(exception)],
                boxed=True,
                loglevel='exception')
            sys.exit(1)
        except KeyboardInterrupt:
            Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=
                'This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                boxed=True,
                loglevel='error')
            sys.exit(1)

        if remove_asd_manager is True and storage_router_to_remove_online is True:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages='\nRemoving ASD Manager')
            with remote(storage_router_to_remove.ip, [os]) as rem:
                rem.os.system('asd-manager remove --force-yes')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Remove nodes finished',
                    title=True)
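# A hedged sketch of the connection triage used in the validation step above: try to reach
# every node, remember why unreachable nodes failed and keep a client map for the ones that
# answered. The connect callable stands in for SSHClient and is purely hypothetical.
def triage_nodes(node_ips, connect):
    clients, offline_reasons = {}, {}
    for ip in node_ips:
        try:
            clients[ip] = connect(ip)
        except Exception as ex:  # The real code distinguishes UnableToConnect/NotAuthenticated/TimeOut exceptions
            offline_reasons[ip] = str(ex) or ex.__class__.__name__
    return clients, offline_reasons

def fake_connect(ip):
    if ip.endswith('.3'):
        raise RuntimeError('Connection timed out')
    return 'client-for-{0}'.format(ip)

print(triage_nodes(['10.0.0.1', '10.0.0.3'], fake_connect))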
    def get_update_information_alba_plugin(information):
        """
        Called when the 'Update' button in the GUI is pressed
        This call collects additional information about the packages which can be updated
        E.g.:
            * Downtime for Arakoons
            * Downtime for StorageDrivers
            * Prerequisites that haven't been met
            * Services which will be stopped during update
            * Services which will be restarted after update
        """
        # Verify arakoon info
        arakoon_ovs_info = {'down': False,
                            'name': None,
                            'internal': False}
        arakoon_cacc_info = {'down': False,
                             'name': None,
                             'internal': False}
        for cluster in ['cacc', 'ovsdb']:
            cluster_name = ArakoonClusterConfig.get_cluster_name(cluster)
            if cluster_name is None:
                continue

            if cluster == 'cacc':
                arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name, filesystem=True, ip=System.get_my_storagerouter().ip)
            else:
                arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)

            if arakoon_metadata['internal'] is True:
                config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=(cluster == 'cacc'))
                config.load_config(System.get_my_storagerouter().ip if cluster == 'cacc' else None)
                if cluster == 'ovsdb':
                    arakoon_ovs_info['down'] = len(config.nodes) < 3
                    arakoon_ovs_info['name'] = arakoon_metadata['cluster_name']
                    arakoon_ovs_info['internal'] = True
                else:
                    arakoon_cacc_info['name'] = arakoon_metadata['cluster_name']
                    arakoon_cacc_info['internal'] = True

        # Verify StorageRouter downtime
        fwk_prerequisites = []
        all_storagerouters = StorageRouterList.get_storagerouters()
        for storagerouter in all_storagerouters:
            try:
                SSHClient(endpoint=storagerouter, username='******')
            except UnableToConnectException:
                fwk_prerequisites.append(['node_down', storagerouter.name])

        # Verify ALBA node responsiveness
        alba_prerequisites = []
        for alba_node in AlbaNodeList.get_albanodes():
            try:
                alba_node.client.get_metadata()
            except Exception:
                alba_prerequisites.append(['alba_node_unresponsive', alba_node.ip])

        for key in ['framework', 'alba']:
            if key not in information:
                information[key] = {'packages': {},
                                    'downtime': [],
                                    'prerequisites': fwk_prerequisites if key == 'framework' else alba_prerequisites,
                                    'services_stop_start': set(),
                                    'services_post_update': set()}

            for storagerouter in StorageRouterList.get_storagerouters():
                if key not in storagerouter.package_information:
                    continue

                # Retrieve Arakoon issues
                arakoon_downtime = []
                arakoon_services = []
                for service in storagerouter.services:
                    if service.type.name not in [ServiceType.SERVICE_TYPES.ALBA_MGR, ServiceType.SERVICE_TYPES.NS_MGR]:
                        continue

                    if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
                        cluster_name = AlbaController.get_abm_cluster_name(alba_backend=service.abm_service.alba_backend)
                    else:
                        cluster_name = AlbaController.get_nsm_cluster_name(alba_backend=service.nsm_service.alba_backend, number=service.nsm_service.number)
                    if Configuration.exists('/ovs/arakoon/{0}/config'.format(cluster_name), raw=True) is False:
                        continue
                    arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
                    if arakoon_metadata['internal'] is True:
                        arakoon_services.append('ovs-{0}'.format(service.name))
                        config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=False)
                        config.load_config()
                        if len(config.nodes) < 3:
                            if service.type.name == ServiceType.SERVICE_TYPES.NS_MGR:
                                arakoon_downtime.append(['backend', service.nsm_service.alba_backend.name])
                            else:
                                arakoon_downtime.append(['backend', service.abm_service.alba_backend.name])

                for package_name, package_info in storagerouter.package_information[key].iteritems():
                    if package_name not in AlbaUpdateController.alba_plugin_packages:
                        continue  # Only gather information for the core packages

                    information[key]['services_post_update'].update(package_info.pop('services_to_restart'))
                    if package_name not in information[key]['packages']:
                        information[key]['packages'][package_name] = {}
                    information[key]['packages'][package_name].update(package_info)

                    if package_name == 'openvstorage-backend':
                        if ['gui', None] not in information[key]['downtime']:
                            information[key]['downtime'].append(['gui', None])
                        if ['api', None] not in information[key]['downtime']:
                            information[key]['downtime'].append(['api', None])
                        information[key]['services_stop_start'].update({'watcher-framework', 'memcached'})
                    elif package_name == 'alba':
                        for down in arakoon_downtime:
                            if down not in information[key]['downtime']:
                                information[key]['downtime'].append(down)
                        information[key]['services_post_update'].update(arakoon_services)
                    elif package_name == 'arakoon':
                        if key == 'framework':
                            framework_arakoons = set()
                            if arakoon_ovs_info['internal'] is True:
                                framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_ovs_info['name']))
                            if arakoon_cacc_info['internal'] is True:
                                framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_cacc_info['name']))

                            information[key]['services_post_update'].update(framework_arakoons)
                            if arakoon_ovs_info['down'] is True and ['ovsdb', None] not in information[key]['downtime']:
                                information[key]['downtime'].append(['ovsdb', None])
                        else:
                            for down in arakoon_downtime:
                                if down not in information[key]['downtime']:
                                    information[key]['downtime'].append(down)
                            information[key]['services_post_update'].update(arakoon_services)

            for alba_node in AlbaNodeList.get_albanodes():
                for package_name, package_info in alba_node.package_information.get(key, {}).iteritems():
                    if package_name not in AlbaUpdateController.sdm_packages:
                        continue  # Only gather information for the SDM packages

                    information[key]['services_post_update'].update(package_info.pop('services_to_restart'))
                    if package_name not in information[key]['packages']:
                        information[key]['packages'][package_name] = {}
                    information[key]['packages'][package_name].update(package_info)
        return information
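# A minimal sketch of the accumulation pattern used above: per component, merge package
# details, collect the services that need a restart and avoid duplicate downtime entries.
# The package data below is invented for illustration only.
def merge_package_info(information, component, package_name, package_info, downtime=None):
    component_info = information.setdefault(component, {'packages': {},
                                                         'downtime': [],
                                                         'prerequisites': [],
                                                         'services_stop_start': set(),
                                                         'services_post_update': set()})
    component_info['services_post_update'].update(package_info.pop('services_to_restart', []))
    component_info['packages'].setdefault(package_name, {}).update(package_info)
    for entry in downtime or []:
        if entry not in component_info['downtime']:
            component_info['downtime'].append(entry)
    return information

info = merge_package_info({}, 'alba', 'alba',
                          {'candidate': '1.5.30', 'services_to_restart': ['ovs-albaproxy_myvpool']},
                          downtime=[['backend', 'mybackend']])
print(info['alba']['packages'])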
Example #49
        try:
            response = requests.post(self._url,
                                     data={'data': json.dumps(self.get_heartbeat_data())},
                                     headers={'Accept': 'application/json; version=1'})
            if response.status_code != 200:
                raise RuntimeError('Received invalid status code: {0} - {1}'.format(response.status_code, response.text))
            return_data = response.json()
        except Exception as ex:
            logger.exception('Unexpected error during support call: {0}'.format(ex))
            raise

        try:
            # Try to save the timestamp at which we last successfully sent the heartbeat data
            from ovs.extensions.generic.system import System
            storagerouter = System.get_my_storagerouter()
            storagerouter.last_heartbeat = time.time()
            storagerouter.save()
        except Exception:
            logger.error('Could not save last heartbeat timestamp')
            # Ignore this error, it's not mandatory for the support agent

        if self._enable_support:
            try:
                for task in return_data['tasks']:
                    self._process_task(task['code'], task['metadata'], self.servicemanager)
            except Exception as ex:
                logger.exception('Unexpected error processing tasks: {0}'.format(ex))
                raise
        if 'interval' in return_data:
            interval = return_data['interval']
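# A minimal, hypothetical sketch of the heartbeat call pattern shown above: POST the
# JSON-encoded payload, insist on HTTP 200 and fall back to a default polling interval when
# the server does not return one. The URL and payload shape are assumptions for illustration,
# not the real support endpoint.
import json
import requests

def send_heartbeat(url, payload, default_interval=60):
    response = requests.post(url,
                             data={'data': json.dumps(payload)},
                             headers={'Accept': 'application/json; version=1'})
    if response.status_code != 200:
        raise RuntimeError('Received invalid status code: {0} - {1}'.format(response.status_code, response.text))
    return_data = response.json()
    # The server may suggest a new polling interval; keep the default otherwise.
    return return_data, return_data.get('interval', default_interval)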
Example #50
    def get(self, request, *args, **kwargs):
        """
        Fetches metadata
        """
        _ = args, kwargs
        data = {'authenticated': False,
                'authentication_state': None,
                'authentication_metadata': {},
                'username': None,
                'userguid': None,
                'roles': [],
                'identification': {},
                'storagerouter_ips': [sr.ip for sr in StorageRouterList.get_storagerouters()],
                'versions': list(settings.VERSION),
                'plugins': {}}
        try:
            # Gather plugin metadata
            plugins = {}
            # - Backends. BackendType plugins must set the has_plugin flag on True
            for backend_type in BackendTypeList.get_backend_types():
                if backend_type.has_plugin is True:
                    if backend_type.code not in plugins:
                        plugins[backend_type.code] = []
                    plugins[backend_type.code] += ['backend', 'gui']
            # - Generic plugins, as added to the configuration file(s)
            generic_plugins = Configuration.get('/ovs/framework/plugins/installed|generic')
            for plugin_name in generic_plugins:
                if plugin_name not in plugins:
                    plugins[plugin_name] = []
                plugins[plugin_name] += ['gui']
            data['plugins'] = plugins

            # Fill identification
            data['identification'] = {'cluster_id': Configuration.get('/ovs/framework/cluster_id')}

            # Get authentication metadata
            authentication_metadata = {'ip': System.get_my_storagerouter().ip}
            for key in ['mode', 'authorize_uri', 'client_id', 'scope']:
                if Configuration.exists('/ovs/framework/webapps|oauth2.{0}'.format(key)):
                    authentication_metadata[key] = Configuration.get('/ovs/framework/webapps|oauth2.{0}'.format(key))
            data['authentication_metadata'] = authentication_metadata

            # Gather authorization metadata
            if 'HTTP_AUTHORIZATION' not in request.META:
                return dict(data.items() + {'authentication_state': 'unauthenticated'}.items())
            authorization_type, access_token = request.META['HTTP_AUTHORIZATION'].split(' ')
            if authorization_type != 'Bearer':
                return dict(data.items() + {'authentication_state': 'invalid_authorization_type'}.items())
            tokens = BearerTokenList.get_by_access_token(access_token)
            if len(tokens) != 1:
                return dict(data.items() + {'authentication_state': 'invalid_token'}.items())
            token = tokens[0]
            if token.expiration < time.time():
                for junction in token.roles.itersafe():
                    junction.delete()
                token.delete()
                return dict(data.items() + {'authentication_state': 'token_expired'}.items())

            # Gather user metadata
            user = token.client.user
            if not user.is_active:
                return dict(data.items() + {'authentication_state': 'inactive_user'}.items())
            roles = [j.role.code for j in token.roles]

            return dict(data.items() + {'authenticated': True,
                                        'authentication_state': 'authenticated',
                                        'username': user.username,
                                        'userguid': user.guid,
                                        'roles': roles,
                                        'plugins': plugins}.items())
        except Exception as ex:
            MetadataView._logger.exception('Unexpected exception: {0}'.format(ex))
            return dict(data.items() + {'authentication_state': 'unexpected_exception'}.items())
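# A simplified, hypothetical sketch of the authentication-state decision made above,
# operating on plain values instead of Django request objects and DAL token lists.
import time

def authentication_state(authorization_header, tokens_by_access_token, now=None):
    now = now if now is not None else time.time()
    if authorization_header is None:
        return 'unauthenticated'
    parts = authorization_header.split(' ')
    if len(parts) != 2 or parts[0] != 'Bearer':
        return 'invalid_authorization_type'
    token = tokens_by_access_token.get(parts[1])
    if token is None:
        return 'invalid_token'
    if token['expiration'] < now:
        return 'token_expired'
    if not token['user_active']:
        return 'inactive_user'
    return 'authenticated'

print(authentication_state('Bearer abc',
                           {'abc': {'expiration': time.time() + 3600, 'user_active': True}}))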
class VolumedriverHealthCheck(object):
    """
    A healthcheck for the volumedriver components
    """
    MODULE = 'volumedriver'
    LOCAL_ID = System.get_my_machine_id()
    LOCAL_SR = System.get_my_storagerouter()
    VDISK_CHECK_SIZE = 1024 ** 3  # 1GB in bytes
    VDISK_HALTED_STATES = DataObject.enumerator('Halted_status', ['HALTED', 'FENCED'])
    VDISK_TIMEOUT_BEFORE_DELETE = 0.5
    # Only used to check the status of a fenced volume. This map should not be used to derive the status of a volume that is neither halted nor fenced
    FENCED_HALTED_STATUS_MAP = {'max_redirect': {'status': VDisk.STATUSES.NON_RUNNING,
                                                 'severity': 'failure',
                                                 'halted': ('These volumes are not running: {0}', ErrorCodes.volume_max_redirect),
                                                 'fenced': ('These volumes are fenced but not running on another node: {0}', ErrorCodes.volume_fenced_max_redirect)},
                                'halted': {'status': VDisk.STATUSES.HALTED,
                                           'severity': 'failure',
                                           'halted': ('These volumes are halted: {0}', ErrorCodes.volume_halted),
                                           'fenced': ('These volumes are fenced but halted on another node: {0}', ErrorCodes.volume_fenced_halted)},
                                'connection_fail': {'status': 'UNKNOWN',
                                                    'severity': 'failure',
                                                    'halted': ('These volumes experienced a connectivity/timeout problem: {0}', ErrorCodes.voldrv_connection_problem),
                                                    'fenced': ('These volumes are fenced but experienced a connectivity/timeout problem on another node: {0}', ErrorCodes.voldrv_connection_problem)},
                                'ok': {'status': VDisk.STATUSES.RUNNING,
                                       'severity': 'failure',
                                       'halted': ('These volumes are running: {0}', ErrorCodes.volume_ok),
                                       'fenced': ('These volumes are fenced but running on another node: {0}', ErrorCodes.volume_fenced_ok)},
                                'not_found': {'status': 'NOT_FOUND',
                                              'severity': 'warning',
                                              'halted': ('These volumes could not be queried for information: {0}', ErrorCodes.volume_not_found),
                                              'fenced': ('These volumes are fenced but could not be queried for information on another node: {0}', ErrorCodes.volume_fenced_not_found)}}

    logger = Logger('healthcheck-ovs_volumedriver')

    @staticmethod
    @expose_to_cli(MODULE, 'dtl-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the DTL of all VDisks is properly running',
                   short_help='Test if DTL is properly running')
    def check_dtl(result_handler):
        """
        Checks the dtl for all vdisks on the local node
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        # Fetch vdisks hosted on this machine
        local_sr = System.get_my_storagerouter()
        if len(local_sr.vdisks_guids) == 0:
            return result_handler.skip('No VDisks present in cluster.')
        for vdisk_guid in local_sr.vdisks_guids:
            vdisk = VDisk(vdisk_guid)
            vdisk.invalidate_dynamics(['dtl_status', 'info'])
            if vdisk.dtl_status == 'ok_standalone' or vdisk.dtl_status == 'disabled':
                result_handler.success('VDisk {0}s DTL is disabled'.format(vdisk.name), code=ErrorCodes.volume_dtl_standalone)
            elif vdisk.dtl_status == 'ok_sync':
                result_handler.success('VDisk {0}s DTL is enabled and running.'.format(vdisk.name), code=ErrorCodes.volume_dtl_ok)
            elif vdisk.dtl_status == 'degraded':
                result_handler.warning('VDisk {0}s DTL is degraded.'.format(vdisk.name), code=ErrorCodes.volume_dtl_degraded)
            elif vdisk.dtl_status == 'checkup_required':
                result_handler.warning('VDisk {0}s DTL should be configured.'.format(vdisk.name), code=ErrorCodes.volume_dtl_checkup_required)
            elif vdisk.dtl_status == 'catch_up':
                result_handler.warning('VDisk {0}s DTL is enabled but still syncing.'.format(vdisk.name), code=ErrorCodes.volume_dtl_catch_up)
            else:
                result_handler.warning('VDisk {0}s DTL has an unknown status: {1}.'.format(vdisk.name, vdisk.dtl_status), code=ErrorCodes.volume_dtl_unknown)

    @staticmethod
    @timeout_decorator.timeout(30)
    def _check_volumedriver(vdisk_name, storagedriver_guid, logger, vdisk_size=VDISK_CHECK_SIZE):
        """
        Checks if the volumedriver can create a new vdisk
        :param vdisk_name: name of a vdisk (e.g. test.raw)
        :type vdisk_name: str
        :param storagedriver_guid: guid of a storagedriver
        :type storagedriver_guid: str
        :param vdisk_size: size of the volume in bytes (e.g. 10737418240 is 10GB in bytes)
        :type vdisk_size: int
        :param logger: logger instance
        :type logger: ovs.extensions.healthcheck.result.HCResults
        :return: True if succeeds
        :rtype: bool
        """
        try:
            VDiskController.create_new(vdisk_name, vdisk_size, storagedriver_guid)
        except FileExistsException:
            # can be ignored until fixed in framework
            # https://github.com/openvstorage/framework/issues/1247
            return True
        except Exception as ex:
            logger.failure('Creation of the vdisk failed. Got {0}'.format(str(ex)))
            return False
        return True

    @staticmethod
    @timeout_decorator.timeout(30)
    def _check_volumedriver_remove(vpool_name, vdisk_name, present=True):
        """
        Remove a vdisk from a vpool
        :param vdisk_name: name of a vdisk (e.g. test.raw)
        :type vdisk_name: str
        :param vpool_name: name of a vpool
        :type vpool_name: str
        :param present: should the disk be present?
        :type present: bool
        :return: True if disk is not present anymore
        :rtype: bool
        """
        try:
            vdisk = VDiskHelper.get_vdisk_by_name(vdisk_name=vdisk_name, vpool_name=vpool_name)
            VDiskController.delete(vdisk.guid)
            return True
        except VDiskNotFoundError:
            # not found, if it should be present, re-raise the exception
            if present:
                raise
            else:
                return True

    @staticmethod
    # @expose_to_cli(MODULE, 'volumedrivers-test', HealthCheckCLI.ADDON_TYPE,
    #                help='Verify that the Volumedrivers are responding to events',
    #                short_help='Test if Volumedrivers are responding to events')
    def check_volumedrivers(result_handler):
        """
        Checks if the VOLUMEDRIVERS work on a local machine (compatible with multiple vPools)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        result_handler.info('Checking volumedrivers.', add_to_result=False)
        vpools = VPoolList.get_vpools()
        if len(vpools) == 0:
            result_handler.skip('No vPools found!')
            return
        for vp in vpools:
            name = 'ovs-healthcheck-test-{0}.raw'.format(VolumedriverHealthCheck.LOCAL_ID)
            if vp.guid not in VolumedriverHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
                continue
            try:
                # delete if previous vdisk with this name exists
                storagedriver_guid = next((storagedriver.guid for storagedriver in vp.storagedrivers
                                           if storagedriver.storagedriver_id == vp.name +
                                           VolumedriverHealthCheck.LOCAL_ID))
                # create a new one
                volume = VolumedriverHealthCheck._check_volumedriver(name, storagedriver_guid, result_handler)

                if volume is True:
                    # delete the recently created
                    try:
                        VolumedriverHealthCheck._check_volumedriver_remove(vpool_name=vp.name, vdisk_name=name)
                    except Exception as ex:
                        raise RuntimeError('Could not delete the created volume. Got {0}'.format(str(ex)))
                    # Working at this point
                    result_handler.success('Volumedriver of vPool {0} is working fine!'.format(vp.name))
                else:
                    # not working
                    result_handler.failure('Something went wrong during vdisk creation on vpool {0}.'.format(vp.name))

            except TimeoutError:
                # timeout occurred, action took too long
                result_handler.warning('Volumedriver of vPool {0} seems to timeout.'.format(vp.name))
            except IOError as ex:
                # can be input/output error by volumedriver
                result_handler.failure('Volumedriver of vPool {0} seems to have IO problems. Got `{1}` while executing.'.format(vp.name, ex.message))
            except RuntimeError as ex:
                result_handler.failure('Volumedriver of vPool {0} seems to have problems. Got `{1}` while executing.'.format(vp.name, ex))
            except VDiskNotFoundError:
                result_handler.warning('Volume on vPool {0} was not found, please retry.'.format(vp.name))
            except Exception as ex:
                result_handler.failure('Uncaught exception for Volumedriver of vPool {0}. Got {1} while executing.'.format(vp.name, ex))
            finally:
                # Attempt to delete the created vdisk
                try:
                    VolumedriverHealthCheck._check_volumedriver_remove(vpool_name=vp.name, vdisk_name=name, present=False)
                except:
                    pass

    @classmethod
    def _is_volumedriver_timeout(cls, exception):
        """
        Validates whether a certain exception is a timeout exception (RuntimeError, prior to NodeNotReachable in voldriver 6.17)
        :param exception: Exception object to check
        :return: True if it is a timeout or False if it's not
        :rtype: bool
        """
        return isinstance(exception, ClusterNotReachableException) or isinstance(exception, RuntimeError) and 'failed to send XMLRPC request' in str(exception)

    @classmethod
    @expose_to_cli(MODULE, 'halted-volumes-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that there are no halted/fenced volumes within the cluster',
                   short_help='Test if there are no halted/fenced volumes')
    def check_for_halted_volumes(cls, result_handler):
        """
        Checks for halted volumes on a single or multiple vPools
        This will only check the volume states on the current node. If any other volumedriver were down,
        only the HA'd volumes would pop up, as they could appear halted here (should be verified by the volumedriver team)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :return: None
        :rtype: NoneType
        """
        vpools = VPoolList.get_vpools()
        local_sr = System.get_my_storagerouter()

        if len(vpools) == 0:
            result_handler.skip('No vPools found!', code=ErrorCodes.vpools_none)
            return
        for vpool in vpools:
            log_start = 'Halted volumes test vPool {0}'.format(vpool.name)
            if vpool.guid not in local_sr.vpools_guids:
                result_handler.skip('{0} - Skipping vPool {1} because it is not living here.'.format(log_start, vpool.name),
                                    code=ErrorCodes.vpool_not_local, add_to_result=False)
                continue

            result_handler.info('{0} - Retrieving all information'.format(log_start), add_to_result=False)
            storagedriver = None
            for std in vpool.storagedrivers:
                if std.storagerouter_guid == local_sr.guid:
                    storagedriver = std
                    break

            if storagedriver is None:
                result_handler.failure('{0} - Could not associate a StorageDriver with this StorageRouter'.format(log_start),
                                       code=ErrorCodes.std_no_str)
                continue

            volume_fenced_states = dict((key, []) for key in cls.FENCED_HALTED_STATUS_MAP.keys())
            volume_lists = {cls.VDISK_HALTED_STATES.HALTED: [], cls.VDISK_HALTED_STATES.FENCED: []}
            volume_states = {cls.VDISK_HALTED_STATES.HALTED: {cls.VDISK_HALTED_STATES.HALTED: volume_lists[cls.VDISK_HALTED_STATES.HALTED]},
                             cls.VDISK_HALTED_STATES.FENCED: volume_fenced_states}  # Fewer loops needed when outputting
            result_handler.info('{0} - Scanning for halted volumes'.format(log_start), add_to_result=False)
            try:
                voldrv_client = vpool.storagedriver_client
                objectregistry_client = vpool.objectregistry_client
            except Exception:
                cls.logger.exception('{0} - Unable to instantiate the required clients'.format(log_start))
                result_handler.exception('{0} - Unable to load the Volumedriver clients'.format(log_start),
                                         code=ErrorCodes.voldr_unknown_problem)
                continue
            try:
                # Listing all halted volumes with the volumedriver client as it detects stolen volumes too (fenced instances)
                volumes = voldrv_client.list_halted_volumes(str(storagedriver.storagedriver_id))
            except Exception as ex:
                cls.logger.exception('{0} - Exception occurred when listing volumes'.format(log_start))
                if cls._is_volumedriver_timeout(ex) is False:
                    # Unhandled exception at this point
                    result_handler.exception('{0} - Unable to list the Volumes due to an unidentified problem. Please check the logging'.format(log_start),
                                             code=ErrorCodes.voldr_unknown_problem)
                else:
                    result_handler.failure('{0} - Could not list the volumes due to a connection problem.'.format(log_start),
                                           code=ErrorCodes.voldrv_connection_problem)
                continue
            # Retrieve the owner of each volume. If its node id is not identical to the storagedriver we queried, the volume is fenced
            # Object registry lookups go to Arakoon
            # Capture any exception that could occur to provide a clearer view of what went wrong
            for volume in volumes:
                try:
                    registry_entry = objectregistry_client.find(volume)
                    if registry_entry.node_id() == storagedriver.storagedriver_id:
                        volume_lists[cls.VDISK_HALTED_STATES.HALTED].append(volume)
                    else:
                        # Fenced
                        volume_lists[cls.VDISK_HALTED_STATES.FENCED].append(volume)
                except Exception:
                    msg = '{0} - Unable to consult the object registry client for volume \'{1}\''.format(log_start, volume)
                    cls.logger.exception(msg)
                    result_handler.exception(msg, code=ErrorCodes.voldr_unknown_problem)
            # For fenced volumes, also determine their current state (fenced - OTHER state combination)
            for volume in volume_lists[cls.VDISK_HALTED_STATES.FENCED]:
                try:
                    _, state = cls._get_volume_issue(voldrv_client, volume, log_start)
                    volume_fenced_states[state].append(volume)
                except Exception:
                    # Only unhandled at this point
                    result_handler.exception('{0} - Unable to retrieve the volume info for volume {1} due to an unidentified problem. Please check the logging'.format(log_start, volume),
                                             code=ErrorCodes.voldr_unknown_problem)
            for halted_state, volume_state_info in volume_states.iteritems():
                for state, volumes in volume_state_info.iteritems():
                    if len(volumes) == 0:
                        continue  # Skip OK/empty lists
                    map_value = cls.FENCED_HALTED_STATUS_MAP[state.lower()]
                    log_func = getattr(result_handler, map_value['severity'])
                    message, code = map_value[halted_state.lower()]
                    log_func('{0} - {1}'.format(log_start, message.format(', '.join(volumes))), code=code)
            # Call success in case nothing is wrong
            if all(len(l) == 0 for l in volume_lists.values()):
                result_handler.success('{0} - No volumes found in halted/fenced state'.format(log_start))
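    # The reporting loop above only relies on each FENCED_HALTED_STATUS_MAP entry exposing a 'severity' key
    # plus a (message, code) tuple per halted state. Illustrative shape (names and codes are assumptions for
    # this sketch, not the values defined on the class):
    #   'max_redirect': {'severity': 'failure',
    #                    'halted': ('Volumes {0} are unreachable', ErrorCodes.some_code),
    #                    'fenced': ('Fenced volumes {0} are unreachable on their new owner', ErrorCodes.some_other_code)}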

    @classmethod
    def _get_volume_issue(cls, voldrv_client, volume_id, log_start):
        """
        Maps all possible exceptions to a state. These states can be mapped to a status using the FENCED_HALTED_STATUS_MAP
        because the volumedriver does not return a state itself
        :param voldrv_client: Storagedriver client
        :param volume_id: Id of the volume
        :param log_start: Prefix for the log messages
        :raises: Any exception that could not be mapped to a state (we try to identify all known problems, but one could slip past us)
        :return: The volume_id and state
        :rtype: tuple(str, str)
        """
        state = 'ok'
        try:
            # Check if the information can be retrieved about the volume
            vol_info = voldrv_client.info_volume(volume_id, req_timeout_secs=5)
            if vol_info.halted is True:
                state = 'halted'
        except Exception as ex:
            cls.logger.exception('{0} - Exception occurred when fetching the info for volume \'{1}\''.format(log_start, volume_id))
            if isinstance(ex, ObjectNotFoundException):
                # Ignore ovsdb invalid entries as model consistency will handle it.
                state = 'not_found'
            elif isinstance(ex, MaxRedirectsExceededException):
                # This means the volume is not halted but detached or unreachable for the Volumedriver
                state = 'max_redirect'
            # @todo replace RuntimeError with NodeNotReachableException
            elif any(isinstance(ex, exception) for exception in [ClusterNotReachableException, RuntimeError]):
                if cls._is_volumedriver_timeout(ex) is False:
                    # Unhandled exception at this point
                    raise
                # Timeout / connection problems
                state = 'connection_fail'
            else:
                # Something to be looked at
                raise
        return volume_id, state
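    # Summary of the exception-to-state mapping implemented above (reference only):
    #   ObjectNotFoundException                                      -> 'not_found' (stale registry entry, cleaned up by model consistency)
    #   MaxRedirectsExceededException                                -> 'max_redirect' (volume detached/unreachable for the Volumedriver)
    #   ClusterNotReachableException / XMLRPC RuntimeError timeout   -> 'connection_fail'
    #   anything else                                                -> re-raised to the caller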

    @staticmethod
    @timeout_decorator.timeout(5)
    def _check_filedriver(vp_name, test_name):
        """
        Async method to check whether a FILEDRIVER `touch` works on a vPool
        Always verify that the file exists after calling this method
        :param vp_name: name of the vpool
        :type vp_name: str
        :param test_name: name of the test file (e.g. `ovs-healthcheck-LOCAL_ID`)
        :type test_name: str
        :return: output of the `touch` command (raises CalledProcessError on failure)
        :rtype: str
        """
        return subprocess.check_output('touch /mnt/{0}/{1}.xml'.format(vp_name, test_name), stderr=subprocess.STDOUT, shell=True)

    @staticmethod
    @timeout_decorator.timeout(5)
    def _check_filedriver_remove(vp_name):
        """
        Async method to check whether a FILEDRIVER `remove` works on a vPool
        Always verify that the files are gone after calling this method
        :param vp_name: name of the vpool
        :type vp_name: str
        :return: True if the test files were removed, False otherwise
        :rtype: bool
        """
        subprocess.check_output('rm -f /mnt/{0}/ovs-healthcheck-test-*.xml'.format(vp_name), stderr=subprocess.STDOUT, shell=True)
        import glob  # local import; os.path.exists does not expand wildcards
        return len(glob.glob('/mnt/{0}/ovs-healthcheck-test-*.xml'.format(vp_name))) == 0

    @staticmethod
    # @expose_to_cli(MODULE, 'filedrivers-test', HealthCheckCLI.ADDON_TYPE,
    #                help='Verify that all Volumedrivers are accessible through FUSE',
    #                short_help='Test if the FUSE layer is responding')
    # @todo replace fuse test with edge test
    def check_filedrivers(result_handler):
        """
        Checks if the file drivers work on a local machine (compatible with multiple vPools)
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        """
        result_handler.info('Checking file drivers.', add_to_result=False)
        vpools = VPoolList.get_vpools()
        # perform tests
        if len(vpools) == 0:
            result_handler.skip('No vPools found!')
            return
        for vp in vpools:
            name = 'ovs-healthcheck-test-{0}'.format(VolumedriverHealthCheck.LOCAL_ID)
            if vp.guid not in VolumedriverHealthCheck.LOCAL_SR.vpools_guids:
                result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
                continue
            try:
                VolumedriverHealthCheck._check_filedriver(vp.name, name)
                if os.path.exists('/mnt/{0}/{1}.xml'.format(vp.name, name)):
                    # working
                    VolumedriverHealthCheck._check_filedriver_remove(vp.name)
                    result_handler.success('Filedriver for vPool {0} is working fine!'.format(vp.name))
                else:
                    # not working
                    result_handler.failure('Filedriver for vPool {0} seems to have problems!'.format(vp.name))
            except TimeoutError:
                # timeout occurred, action took too long
                result_handler.warning('Filedriver of vPool {0} seems to have `timeout` problems'.format(vp.name))
            except subprocess.CalledProcessError:
                # can be input/output error by filedriver
                result_handler.failure('Filedriver of vPool {0} seems to have `input/output` problems'.format(vp.name))

    @staticmethod
    @expose_to_cli(MODULE, 'volume-potential-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that the Volumedrivers have enough VDisk potential left',
                   short_help='Test if the Volumedrivers can create enough VDisks')
    @expose_to_cli.option('--critical-vol-number', '-c', type=int, default=25, help='Minimum number of volumes left to create')
    def check_volume_potential(result_handler, critical_vol_number=25):
        """
        Checks the volume potential of all local storagedrivers. Results in a success if enough volumes can still be created, a warning if that number
        is lower than the threshold value (critical_vol_number) and a failure if it is 0.
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        :param critical_vol_number: Minimal number of volumes that can still be created before throwing a warning
        :type critical_vol_number: int
        """
        result_handler.info('Checking volume potential of storagedrivers')

        if not isinstance(critical_vol_number, int) or critical_vol_number < 0:
            raise ValueError('Critical volume number should be a positive integer')

        for std in VolumedriverHealthCheck.LOCAL_SR.storagedrivers:
            try:
                std_config = StorageDriverConfiguration(std.vpool_guid, std.storagedriver_id)
                client = LocalStorageRouterClient(std_config.remote_path)
                vol_potential = client.volume_potential(str(std.storagedriver_id))
                if vol_potential >= critical_vol_number:
                    log_level = 'success'
                elif critical_vol_number > vol_potential > 0:
                    log_level = 'warning'
                else:
                    log_level = 'failure'
                getattr(result_handler, log_level)('Volume potential of local storage driver: {0}: {1} (potential at: {2})'.format(std.storagedriver_id, log_level.upper(), vol_potential))
            except RuntimeError:
                result_handler.exception('Unable to retrieve configuration for storagedriver {0}'.format(std.storagedriver_id))
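    # Threshold logic above, restated on example values with the default critical_vol_number of 25 (illustrative):
    #   vol_potential = 100 -> success, vol_potential = 10 -> warning, vol_potential = 0 -> failure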

    @staticmethod
    @expose_to_cli(MODULE, 'sco-cache-mountpoint-test', HealthCheckCLI.ADDON_TYPE,
                   help='Verify that sco-cache mountpoints are up and running',
                   short_help='Test if sco-cache mountpoints are up and running')
    def check_sco_cache_mountpoints(result_handler):
        """
        Iterates over the StorageDrivers of the local StorageRouter and checks all their SCO cache mount points.
        Logs a warning if a mount point is in offline state
        :param result_handler: logging object
        :type result_handler: ovs.extensions.healthcheck.result.HCResults
        """
        result_handler.info('Checking sco cache mount points on all local storagedrivers')
        for std in VolumedriverHealthCheck.LOCAL_SR.storagedrivers:
            try:
                std_config = StorageDriverConfiguration(std.vpool_guid, std.storagedriver_id)
                client = LocalStorageRouterClient(std_config.remote_path)
                for std_info in client.sco_cache_mount_point_info(str(std.storagedriver_id)):
                    if std_info.offlined is True:
                        result_handler.warning('Mountpoint at location {0} of storagedriver {1} is in offline state'.format(std_info.path, std.storagedriver_id))
                    else:
                        result_handler.success('Mountpoint at location {0} of storagedriver {1} is in online state'.format(std_info.path, std.storagedriver_id))
            except RuntimeError:
                result_handler.exception('Unable to check sco cache mountpoint of storagedriver {0}'.format(std.storagedriver_id))
Example #52
    def get(self, request, *args, **kwargs):
        """
        Fetches metadata
        """
        _ = args, kwargs
        data = {'authenticated': False,
                'authentication_state': None,
                'authentication_metadata': {},
                'username': None,
                'userguid': None,
                'roles': [],
                'identification': {},
                'storagerouter_ips': [sr.ip for sr in StorageRouterList.get_storagerouters()],
                'versions': list(settings.VERSION),
                'plugins': {},
                'registration': {'registered': False,
                                 'remaining': None}}
        try:
            # Gather plugin metadata
            plugins = {}
            # - Backends. BackendType plugins must set the has_plugin flag on True
            for backend_type in BackendTypeList.get_backend_types():
                if backend_type.has_plugin is True:
                    if backend_type.code not in plugins:
                        plugins[backend_type.code] = []
                    plugins[backend_type.code] += ['backend', 'gui']
            # - Generic plugins, as added to the configuration file(s)
            generic_plugins = Configuration.get('ovs.plugins.generic')
            for plugin_name in generic_plugins:
                if plugin_name not in plugins:
                    plugins[plugin_name] = []
                plugins[plugin_name] += ['gui']
            data['plugins'] = plugins

            # Fill identification
            data['identification'] = {'cluster_id': Configuration.get('ovs.support.cid')}

            # Registration data
            registered = Configuration.get('ovs.core.registered')
            data['registration']['registered'] = registered
            if registered is False:
                cluster_install_time = None
                for storagerouter in StorageRouterList.get_storagerouters():
                    client = SSHClient(storagerouter)
                    install_time = client.config_read('ovs.core.install_time')
                    if cluster_install_time is None or (install_time is not None and install_time < cluster_install_time):
                        cluster_install_time = install_time
                if cluster_install_time is not None:
                    registration_window = 30 * 24 * 60 * 60  # 30 days, expressed in seconds
                    data['registration']['remaining'] = (registration_window - time.time() + cluster_install_time) / 24 / 60 / 60
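                    # Example: a cluster installed 10 days ago gives (30 * 86400 - 10 * 86400) / 86400 = 20 days remaining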

            # Get authentication metadata
            authentication_metadata = {'ip': System.get_my_storagerouter().ip}
            for key in ['mode', 'authorize_uri', 'client_id', 'scope']:
                if Configuration.exists('ovs.webapps.oauth2.{0}'.format(key)):
                    authentication_metadata[key] = Configuration.get('ovs.webapps.oauth2.{0}'.format(key))
            data['authentication_metadata'] = authentication_metadata

            # Gather authorization metadata
            if 'HTTP_AUTHORIZATION' not in request.META:
                return HttpResponse, dict(data.items() + {'authentication_state': 'unauthenticated'}.items())
            authorization_type, access_token = request.META['HTTP_AUTHORIZATION'].split(' ')
            if authorization_type != 'Bearer':
                return HttpResponse, dict(data.items() + {'authentication_state': 'invalid_authorization_type'}.items())
            tokens = BearerTokenList.get_by_access_token(access_token)
            if len(tokens) != 1:
                return HttpResponse, dict(data.items() + {'authentication_state': 'invalid_token'}.items())
            token = tokens[0]
            if token.expiration < time.time():
                for junction in token.roles.itersafe():
                    junction.delete()
                token.delete()
                return HttpResponse, dict(data.items() + {'authentication_state': 'token_expired'}.items())

            # Gather user metadata
            user = token.client.user
            if not user.is_active:
                return HttpResponse, dict(data.items() + {'authentication_state': 'inactive_user'}.items())
            roles = [j.role.code for j in token.roles]

            return HttpResponse, dict(data.items() + {'authenticated': True,
                                                      'authentication_state': 'authenticated',
                                                      'username': user.username,
                                                      'userguid': user.guid,
                                                      'roles': roles,
                                                      'plugins': plugins}.items())
        except Exception as ex:
            logger.exception('Unexpected exception: {0}'.format(ex))
            return HttpResponse, dict(data.items() + {'authentication_state': 'unexpected_exception'}.items())
Example #53
    def get(self, request, *args, **kwargs):
        """
        Fetches metadata
        """
        _ = args, kwargs
        data = {'authenticated': False,
                'authentication_state': None,
                'authentication_metadata': {},
                'username': None,
                'userguid': None,
                'roles': [],
                'identification': {},
                'storagerouter_ips': [sr.ip for sr in StorageRouterList.get_storagerouters()],
                'versions': list(settings.VERSION),
                'plugins': {}}
        try:
            # Gather plugin metadata
            plugins = {}
            # - Backends. BackendType plugins must set the has_plugin flag on True
            for backend_type in BackendTypeList.get_backend_types():
                if backend_type.has_plugin is True:
                    if backend_type.code not in plugins:
                        plugins[backend_type.code] = []
                    plugins[backend_type.code] += ['backend', 'gui']
            # - Generic plugins, as added to the configuration file(s)
            generic_plugins = EtcdConfiguration.get(
                '/ovs/framework/plugins/installed|generic')
            for plugin_name in generic_plugins:
                if plugin_name not in plugins:
                    plugins[plugin_name] = []
                plugins[plugin_name] += ['gui']
            data['plugins'] = plugins

            # Fill identification
            data['identification'] = {'cluster_id': EtcdConfiguration.get('/ovs/framework/cluster_id')}

            # Get authentication metadata
            authentication_metadata = {'ip': System.get_my_storagerouter().ip}
            for key in ['mode', 'authorize_uri', 'client_id', 'scope']:
                if EtcdConfiguration.exists(
                        '/ovs/framework/webapps|oauth2.{0}'.format(key)):
                    authentication_metadata[key] = EtcdConfiguration.get(
                        '/ovs/framework/webapps|oauth2.{0}'.format(key))
            data['authentication_metadata'] = authentication_metadata

            # Gather authorization metadata
            if 'HTTP_AUTHORIZATION' not in request.META:
                return HttpResponse, dict(
                    data.items() +
                    {'authentication_state': 'unauthenticated'}.items())
            authorization_type, access_token = request.META[
                'HTTP_AUTHORIZATION'].split(' ')
            if authorization_type != 'Bearer':
                return HttpResponse, dict(
                    data.items() +
                    {'authentication_state': 'invalid_authorization_type'
                     }.items())
            tokens = BearerTokenList.get_by_access_token(access_token)
            if len(tokens) != 1:
                return HttpResponse, dict(
                    data.items() +
                    {'authentication_state': 'invalid_token'}.items())
            token = tokens[0]
            if token.expiration < time.time():
                for junction in token.roles.itersafe():
                    junction.delete()
                token.delete()
                return HttpResponse, dict(
                    data.items() +
                    {'authentication_state': 'token_expired'}.items())

            # Gather user metadata
            user = token.client.user
            if not user.is_active:
                return HttpResponse, dict(
                    data.items() +
                    {'authentication_state': 'inactive_user'}.items())
            roles = [j.role.code for j in token.roles]

            return HttpResponse, dict(
                data.items() + {
                    'authenticated': True,
                    'authentication_state': 'authenticated',
                    'username': user.username,
                    'userguid': user.guid,
                    'roles': roles,
                    'plugins': plugins
                }.items())
        except Exception as ex:
            MetadataView._logger.exception(
                'Unexpected exception: {0}'.format(ex))
            return HttpResponse, dict(
                data.items() +
                {'authentication_state': 'unexpected_exception'}.items())
Example #54
                data={'data': json.dumps(self.get_heartbeat_data())},
                headers={'Accept': 'application/json; version=1'})
            if response.status_code != 200:
                raise RuntimeError(
                    'Received invalid status code: {0} - {1}'.format(
                        response.status_code, response.text))
            return_data = response.json()
        except Exception, ex:
            logger.exception(
                'Unexpected error during support call: {0}'.format(ex))
            raise

        try:
            # Try to save the timestamp at which we last successfully sent the heartbeat data
            from ovs.extensions.generic.system import System
            storagerouter = System.get_my_storagerouter()
            storagerouter.last_heartbeat = time.time()
            storagerouter.save()
        except Exception:
            logger.error('Could not save last heartbeat timestamp')
            # Ignore this error, it's not mandatory for the support agent

        if self._enable_support:
            try:
                for task in return_data['tasks']:
                    self._process_task(task['code'], task['metadata'],
                                       self.servicemanager)
            except Exception, ex:
                logger.exception(
                    'Unexpected error processing tasks: {0}'.format(ex))
                raise
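        # Assumed shape of the support backend's reply consumed above (the task code below is an illustrative guess):
        #   return_data = {'tasks': [{'code': 'OPEN_TUNNEL', 'metadata': {'port': 443}}]}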
Example #55
class ServiceHelper(object):
    """
    A service helper class
    """

    LOCAL_SR = System.get_my_storagerouter()

    def __init__(self):
        pass

    @staticmethod
    def get_services():
        """
        Fetch all services
        :return: list of all services in the model
        :rtype: ovs.dal.lists.datalist.DataList
        """
        return ServiceList.get_services()

    @staticmethod
    def get_service(service_guid):
        """
        Fetches a service by guid
        :param service_guid: guid of the service
        :type service_guid: str
        :return: Service object
        :rtype: ovs.dal.hybrids.service.Service
        """
        return Service(service_guid)

    @staticmethod
    def get_local_services():
        """
        Fetches all services that run on this node
        :return: list of all services that run on this node
        :rtype: ovs.dal.lists.datalist.DataList
        """
        return DataList(Service, {'type': DataList.where_operator.AND,
                                  'items': [('storagerouter_guid', DataList.operator.EQUALS, ServiceHelper.LOCAL_SR.guid)]})

    @staticmethod
    def get_local_arakoon_services():
        """
        Fetches all arakoon services that run on this node
        :return: list of all arakoon services that run on this node
        :rtype: ovs.dal.lists.datalist.DataList
        """
        return DataList(Service, {'type': DataList.where_operator.AND,
                                  'items': [('storagerouter_guid', DataList.operator.EQUALS, ServiceHelper.LOCAL_SR.guid),
                                            ('type.name', DataList.operator.IN, [ServiceType.SERVICE_TYPES.ARAKOON,
                                                                                 ServiceType.SERVICE_TYPES.ALBA_MGR,
                                                                                 ServiceType.SERVICE_TYPES.NS_MGR])]})

    @staticmethod
    def get_local_abm_services():
        """
        Fetches all ABM (ALBA manager) services that run on this node
        :return: list of all ABM services that run on this node
        :rtype: ovs.dal.lists.datalist.DataList
        """
        return DataList(Service, {'type': DataList.where_operator.AND,
                                  'items': [('storagerouter_guid', DataList.operator.EQUALS, ServiceHelper.LOCAL_SR.guid),
                                            ('type.name', DataList.operator.EQUALS, ServiceType.SERVICE_TYPES.ALBA_MGR)]})

    @staticmethod
    def get_local_voldr_services():
        """
        Fetches all volumedriver metadata server (MD_SERVER) services that run on this node
        :return: list of all metadata server services that run on this node
        :rtype: ovs.dal.lists.datalist.DataList
        """
        return DataList(Service, {'type': DataList.where_operator.AND,
                                  'items': [('storagerouter_guid', DataList.operator.EQUALS, ServiceHelper.LOCAL_SR.guid),
                                            ('type.name', DataList.operator.EQUALS, ServiceType.SERVICE_TYPES.MD_SERVER)]})

    @staticmethod
    def get_local_proxy_services():
        """
        Fetches all alba proxy services that run on this node
        :return: list of all alba proxy services that run on this node
        :rtype: ovs.dal.lists.datalist.DataList
        """
        return DataList(Service, {'type': DataList.where_operator.AND,
                                  'items': [('storagerouter_guid', DataList.operator.EQUALS, ServiceHelper.LOCAL_SR.guid),
                                            ('type.name', DataList.operator.EQUALS, ServiceType.SERVICE_TYPES.ALBA_PROXY)]})
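
A hypothetical usage sketch of the helpers above; the attribute names (service.name, service.type.name, LOCAL_SR.name) follow the DAL conventions already used in this class, but the snippet itself is illustrative rather than part of the module:

# List the Arakoon-related services hosted on this StorageRouter (illustrative usage only)
for service in ServiceHelper.get_local_arakoon_services():
    print('{0} ({1}) runs on {2}'.format(service.name, service.type.name, ServiceHelper.LOCAL_SR.name))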
Example #56
    def get_update_information_core(information):
        """
        Called when the 'Update' button in the GUI is pressed
        This call collects additional information about the packages which can be updated
        E.g.:
            * Downtime for Arakoons
            * Downtime for StorageDrivers
            * Prerequisites that haven't been met
            * Services which will be stopped during update
            * Services which will be restarted after update
        """
        # Verify arakoon info
        arakoon_ovs_info = {'down': False,
                            'name': None,
                            'internal': False}
        arakoon_cacc_info = {'down': False,
                             'name': None,
                             'internal': False}
        arakoon_voldrv_info = {'down': False,
                               'name': None,
                               'internal': False}
        for cluster in ['cacc', 'ovsdb', 'voldrv']:
            cluster_name = ArakoonClusterConfig.get_cluster_name(cluster)
            if cluster_name is None:
                continue

            if cluster == 'cacc':
                arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name, filesystem=True, ip=System.get_my_storagerouter().ip)
            else:
                arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)

            if arakoon_metadata['internal'] is True:
                config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=(cluster == 'cacc'))
                config.load_config(System.get_my_storagerouter().ip if cluster == 'cacc' else None)
                if cluster == 'ovsdb':
                    arakoon_ovs_info['down'] = len(config.nodes) < 3
                    arakoon_ovs_info['name'] = arakoon_metadata['cluster_name']
                    arakoon_ovs_info['internal'] = True
                elif cluster == 'voldrv':
                    arakoon_voldrv_info['down'] = len(config.nodes) < 3
                    arakoon_voldrv_info['name'] = arakoon_metadata['cluster_name']
                    arakoon_voldrv_info['internal'] = True
                else:
                    arakoon_cacc_info['name'] = arakoon_metadata['cluster_name']
                    arakoon_cacc_info['internal'] = True

        # Verify StorageRouter downtime
        prerequisites = []
        all_storagerouters = StorageRouterList.get_storagerouters()
        for storagerouter in all_storagerouters:
            try:
                SSHClient(endpoint=storagerouter, username='******')
            except UnableToConnectException:
                prerequisites.append(['node_down', storagerouter.name])

        for key in ['framework', 'storagedriver']:
            if key not in information:
                information[key] = {'packages': {},
                                    'downtime': [],
                                    'prerequisites': prerequisites,
                                    'services_stop_start': set(),
                                    'services_post_update': set()}

            for storagerouter in all_storagerouters:
                if key not in storagerouter.package_information:
                    continue

                # Retrieve ALBA proxy issues
                alba_services = []
                alba_downtime = []
                for service in storagerouter.services:
                    if service.type.name != ServiceType.SERVICE_TYPES.ALBA_PROXY or service.alba_proxy is None:
                        continue
                    alba_services.append(service.name)
                    alba_downtime.append(['proxy', service.alba_proxy.storagedriver.vpool.name])

                # Retrieve StorageDriver issues
                storagedriver_downtime = []
                storagedriver_services = []
                for sd in storagerouter.storagedrivers:
                    # Order of services is important, first we want to stop all volume-drivers, then DTLs
                    storagedriver_services.append('ovs-volumedriver_{0}'.format(sd.vpool.name))
                for sd in storagerouter.storagedrivers:
                    storagedriver_services.append('ovs-dtl_{0}'.format(sd.vpool.name))
                    if len(sd.vdisks_guids) > 0:
                        storagedriver_downtime.append(['voldrv', sd.vpool.name])

                # Retrieve the actual update information
                for package_name, package_info in storagerouter.package_information[key].iteritems():
                    if package_name not in UpdateController.all_core_packages:
                        continue  # Only gather information for the core packages

                    information[key]['services_post_update'].update(package_info.pop('services_to_restart'))
                    if package_name not in information[key]['packages']:
                        information[key]['packages'][package_name] = {}
                    information[key]['packages'][package_name].update(package_info)

                    if package_name == 'openvstorage':
                        if ['gui', None] not in information[key]['downtime']:
                            information[key]['downtime'].append(['gui', None])
                        if ['api', None] not in information[key]['downtime']:
                            information[key]['downtime'].append(['api', None])
                        information[key]['services_stop_start'].update({'watcher-framework', 'memcached'})
                    elif package_name == 'alba':
                        for down in alba_downtime:
                            if down not in information[key]['downtime']:
                                information[key]['downtime'].append(down)
                        information[key]['services_post_update'].update(alba_services)
                    elif package_name == 'volumedriver-no-dedup-base':
                        for down in storagedriver_downtime:
                            if down not in information[key]['downtime']:
                                information[key]['downtime'].append(down)
                        information[key]['services_post_update'].update(storagedriver_services)
                    elif package_name == 'volumedriver-no-dedup-server':
                        for down in storagedriver_downtime:
                            if down not in information[key]['downtime']:
                                information[key]['downtime'].append(down)
                        information[key]['services_post_update'].update(storagedriver_services)
                    elif package_name == 'arakoon':
                        if key == 'framework':
                            framework_arakoons = set()
                            if arakoon_ovs_info['internal'] is True:
                                framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_ovs_info['name']))
                            if arakoon_cacc_info['internal'] is True:
                                framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_cacc_info['name']))

                            information[key]['services_post_update'].update(framework_arakoons)
                            if arakoon_ovs_info['down'] is True and ['ovsdb', None] not in information[key]['downtime']:
                                information[key]['downtime'].append(['ovsdb', None])
                        elif arakoon_voldrv_info['internal'] is True:
                            information[key]['services_post_update'].update({'ovs-arakoon-{0}'.format(arakoon_voldrv_info['name'])})
                            if arakoon_voldrv_info['down'] is True and ['voldrv', None] not in information[key]['downtime']:
                                information[key]['downtime'].append(['voldrv', None])
        return information
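
For orientation, a rough sketch of the structure returned above; every value is illustrative (the exact per-package keys depend on the collected package_information), not real output:

example_information = {
    'framework': {'packages': {'openvstorage': {}},                          # per-package update info, merged per StorageRouter
                  'downtime': [['gui', None], ['api', None]],                # (component, vpool_name) pairs
                  'prerequisites': [['node_down', 'node2']],                 # unmet prerequisites, e.g. unreachable nodes
                  'services_stop_start': {'watcher-framework', 'memcached'},
                  'services_post_update': {'ovs-arakoon-ovsdb'}},
    'storagedriver': {'packages': {}, 'downtime': [], 'prerequisites': [], 'services_stop_start': set(), 'services_post_update': set()}}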