示例#1
0
 def mds_checkup():
     """
     Validates the current MDS setup/configuration and takes actions where required
     """
     mds_dict = {}
     for vpool in VPoolList.get_vpools():
         for mds_service in vpool.mds_services:
             storagerouter = mds_service.service.storagerouter
             if vpool not in mds_dict:
                 mds_dict[vpool] = {}
             if storagerouter not in mds_dict[vpool]:
                 mds_dict[vpool][storagerouter] = {'client': SSHClient(storagerouter, username='******'),
                                                   'services': []}
             mds_dict[vpool][storagerouter]['services'].append(mds_service)
     for vpool, storagerouter_info in mds_dict.iteritems():
         # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
         # If not, create an extra MDS for that StorageRouter
         for storagerouter in storagerouter_info:
             client = mds_dict[vpool][storagerouter]['client']
             mds_services = mds_dict[vpool][storagerouter]['services']
             has_room = False
             for mds_service in mds_services[:]:
                 if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                     client = SSHClient(storagerouter)
                     MDSServiceController.remove_mds_service(mds_service, client, storagerouter, vpool, reload_config=True)
                     mds_services.remove(mds_service)
             for mds_service in mds_services:
                 _, load = MDSServiceController.get_mds_load(mds_service)
                 if load < Configuration.get('ovs.storagedriver.mds.maxload'):
                     has_room = True
                     break
             if has_room is False:
                 mds_service = MDSServiceController.prepare_mds_service(client, storagerouter, vpool,
                                                                        fresh_only=False, reload_config=True)
                 if mds_service is None:
                     raise RuntimeError('Could not add MDS node')
                 mds_services.append(mds_service)
         mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool)
         for storagerouter in mds_dict[vpool]:
             client = mds_dict[vpool][storagerouter]['client']
             storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.name)
             storagedriver_config.load(client)
             if storagedriver_config.is_new is False:
                 storagedriver_config.clean()  # Clean out obsolete values
                 storagedriver_config.configure_filesystem(
                     fs_metadata_backend_mds_nodes=mds_config_set[storagerouter.guid]
                 )
                 storagedriver_config.save(client)
         # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
         for vdisk in vpool.vdisks:
             MDSServiceController.ensure_safety(vdisk)
示例#2
0
    def mds_checkup():
        """
        Validates the current MDS setup/configuration and takes actions where required
        """
        MDSServiceController._logger.info('MDS checkup - Started')
        mds_dict = {}
        for vpool in VPoolList.get_vpools():
            MDSServiceController._logger.info('MDS checkup - vPool {0}'.format(
                vpool.name))
            mds_dict[vpool] = {}
            for mds_service in vpool.mds_services:
                storagerouter = mds_service.service.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {
                        'client': None,
                        'services': []
                    }
                    try:
                        mds_dict[vpool][storagerouter]['client'] = SSHClient(
                            storagerouter, username='******')
                        MDSServiceController._logger.info(
                            'MDS checkup - vPool {0} - Storage Router {1} - ONLINE'
                            .format(vpool.name, storagerouter.name))
                    except UnableToConnectException:
                        MDSServiceController._logger.info(
                            'MDS checkup - vPool {0} - Storage Router {1} - OFFLINE'
                            .format(vpool.name, storagerouter.name))
                mds_dict[vpool][storagerouter]['services'].append(mds_service)

        failures = []
        max_load = Configuration.get(
            '/ovs/framework/storagedriver|mds_maxload')
        for vpool, storagerouter_info in mds_dict.iteritems():
            # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
            # If not, create an extra MDS for that StorageRouter
            for storagerouter in storagerouter_info:
                client = mds_dict[vpool][storagerouter]['client']
                mds_services = mds_dict[vpool][storagerouter]['services']
                has_room = False
                for mds_service in mds_services[:]:
                    if mds_service.capacity == 0 and len(
                            mds_service.vdisks_guids) == 0:
                        MDSServiceController._logger.info(
                            'MDS checkup - Removing mds_service {0} for vPool {1}'
                            .format(mds_service.number, vpool.name))
                        MDSServiceController.remove_mds_service(
                            mds_service,
                            vpool,
                            reconfigure=True,
                            allow_offline=client is None)
                        mds_services.remove(mds_service)
                for mds_service in mds_services:
                    _, load = MDSServiceController.get_mds_load(mds_service)
                    if load < max_load:
                        has_room = True
                        break
                MDSServiceController._logger.info(
                    'MDS checkup - vPool {0} - Storage Router {1} - Capacity available: {2}'
                    .format(vpool.name, storagerouter.name, has_room))
                if has_room is False and client is not None:
                    mds_service = MDSServiceController.prepare_mds_service(
                        storagerouter=storagerouter,
                        vpool=vpool,
                        fresh_only=False,
                        reload_config=True)
                    if mds_service is None:
                        raise RuntimeError('Could not add MDS node')
                    mds_services.append(mds_service)
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(
                vpool, True)
            for storagerouter in storagerouter_info:
                client = mds_dict[vpool][storagerouter]['client']
                if client is None:
                    MDSServiceController._logger.info(
                        'MDS checkup - vPool {0} - Storage Router {1} - Marked as offline, not setting default MDS configuration'
                        .format(vpool.name, storagerouter.name))
                    continue
                storagedriver = [
                    sd for sd in storagerouter.storagedrivers
                    if sd.vpool_guid == vpool.guid
                ][0]
                storagedriver_config = StorageDriverConfiguration(
                    'storagedriver', vpool.guid,
                    storagedriver.storagedriver_id)
                storagedriver_config.load()
                if storagedriver_config.is_new is False:
                    MDSServiceController._logger.info(
                        'MDS checkup - vPool {0} - Storage Router {1} - Storing default MDS configuration: {2}'
                        .format(vpool.name, storagerouter.name,
                                mds_config_set[storagerouter.guid]))
                    storagedriver_config.configure_filesystem(
                        fs_metadata_backend_mds_nodes=mds_config_set[
                            storagerouter.guid])
                    storagedriver_config.save(client)
            # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
            MDSServiceController._logger.info(
                'MDS checkup - vPool {0} - Ensuring safety for all virtual disks'
                .format(vpool.name))
            for vdisk in vpool.vdisks:
                try:
                    MDSServiceController.ensure_safety(vdisk)
                except Exception:
                    message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(
                        vdisk.name, vdisk.guid)
                    MDSServiceController._logger.exception(message)
                    failures.append(message)
        if len(failures) > 0:
            raise Exception('\n - ' + '\n - '.join(failures))
        MDSServiceController._logger.info('MDS checkup - Finished')
示例#3
0
    def add_vpool(cls, parameters):
        """
        Add a vPool to the machine this task is running on
        :param parameters: Parameters for vPool creation
        :type parameters: dict
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
        # VALIDATIONS
        if not isinstance(parameters, dict):
            raise ValueError(
                'Parameters passed to create a vPool should be of type dict')

        # Check StorageRouter existence
        storagerouter = StorageRouterList.get_by_ip(
            ip=parameters.get('storagerouter_ip'))
        if storagerouter is None:
            raise RuntimeError('Could not find StorageRouter')

        # Validate requested vPool configurations
        vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
        vp_installer.validate(storagerouter=storagerouter)

        # Validate requested StorageDriver configurations
        cls._logger.info(
            'vPool {0}: Validating StorageDriver configurations'.format(
                vp_installer.name))
        sd_installer = StorageDriverInstaller(
            vp_installer=vp_installer,
            configurations={
                'storage_ip': parameters.get('storage_ip'),
                'caching_info': parameters.get('caching_info'),
                'backend_info': {
                    'main':
                    parameters.get('backend_info'),
                    StorageDriverConfiguration.CACHE_BLOCK:
                    parameters.get('backend_info_bc'),
                    StorageDriverConfiguration.CACHE_FRAGMENT:
                    parameters.get('backend_info_fc')
                },
                'connection_info': {
                    'main':
                    parameters.get('connection_info'),
                    StorageDriverConfiguration.CACHE_BLOCK:
                    parameters.get('connection_info_bc'),
                    StorageDriverConfiguration.CACHE_FRAGMENT:
                    parameters.get('connection_info_fc')
                },
                'sd_configuration': parameters.get('config_params')
            })

        partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(
            storagerouter.guid))
        try:
            # VPOOL CREATION
            # Create the vPool as soon as possible in the process to be displayed in the GUI (INSTALLING/EXTENDING state)
            if vp_installer.is_new is True:
                vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
                vp_installer.configure_mds(
                    config=parameters.get('mds_config_params', {}))
            else:
                vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

            # ADDITIONAL VALIDATIONS
            # Check StorageRouter connectivity
            cls._logger.info(
                'vPool {0}: Validating StorageRouter connectivity'.format(
                    vp_installer.name))
            linked_storagerouters = [storagerouter]
            if vp_installer.is_new is False:
                linked_storagerouters += [
                    sd.storagerouter
                    for sd in vp_installer.vpool.storagedrivers
                ]

            sr_client_map = SSHClient.get_clients(
                endpoints=linked_storagerouters, user_names=['ovs', 'root'])
            offline_nodes = sr_client_map.pop('offline')
            if storagerouter in offline_nodes:
                raise RuntimeError(
                    'Node on which the vPool is being {0} is not reachable'.
                    format('created'
                           if vp_installer.is_new is True else 'extended'))

            sr_installer = StorageRouterInstaller(
                root_client=sr_client_map[storagerouter]['root'],
                sd_installer=sd_installer,
                vp_installer=vp_installer,
                storagerouter=storagerouter)

            # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in locked context
            partitions_mutex.acquire(wait=60)
            sr_installer.partition_info = StorageRouterController.get_partition_info(
                storagerouter_guid=storagerouter.guid)
            sr_installer.validate_vpool_extendable()
            sr_installer.validate_global_write_buffer(
                requested_size=parameters.get('writecache_size', 0))
            sr_installer.validate_local_cache_size(
                requested_proxies=parameters.get('parallelism', {}).get(
                    'proxies', 2))

            # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
            sd_installer.create()
            sd_installer.create_partitions()
            partitions_mutex.release()

            vp_installer.refresh_metadata()
        except Exception:
            cls._logger.exception(
                'Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'
                .format(vp_installer.name, storagerouter.name))
            partitions_mutex.release()
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise

        # Arakoon setup
        counter = 0
        while counter < 300:
            try:
                if StorageDriverController.manual_voldrv_arakoon_checkup(
                ) is True:
                    break
            except Exception:
                cls._logger.exception(
                    'Arakoon checkup for voldrv cluster failed')
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise
            counter += 1
            time.sleep(1)
            if counter == 300:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise RuntimeError(
                    'Arakoon checkup for the StorageDriver cluster could not be started'
                )

        # Cluster registry
        try:
            vp_installer.configure_cluster_registry(allow_raise=True)
        except Exception:
            if vp_installer.is_new is True:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            else:
                vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
            raise

        try:
            sd_installer.setup_proxy_configs()
            sd_installer.configure_storagedriver_service()
            DiskController.sync_with_reality(storagerouter.guid)
            MDSServiceController.prepare_mds_service(
                storagerouter=storagerouter, vpool=vp_installer.vpool)

            # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
            vp_installer.vpool.invalidate_dynamics('configuration')
            if vp_installer.mds_safety is not None and vp_installer.vpool.configuration[
                    'mds_config']['mds_safety'] != vp_installer.mds_safety:
                Configuration.set(
                    key='/ovs/vpools/{0}/mds_config|mds_safety'.format(
                        vp_installer.vpool.guid),
                    value=vp_installer.mds_safety)

            sd_installer.start_services(
            )  # Create and start watcher volumedriver, DTL, proxies and StorageDriver services

            # Post creation/extension checkups
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(
                vpool=vp_installer.vpool, offline_nodes=offline_nodes)
            for sr, clients in sr_client_map.iteritems():
                for current_storagedriver in [
                        sd for sd in sr.storagedrivers
                        if sd.vpool_guid == vp_installer.vpool.guid
                ]:
                    storagedriver_config = StorageDriverConfiguration(
                        vpool_guid=vp_installer.vpool.guid,
                        storagedriver_id=current_storagedriver.storagedriver_id
                    )
                    if storagedriver_config.config_missing is False:
                        # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                        # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                        storagedriver_config.configure_filesystem(
                            fs_metadata_backend_mds_nodes=mds_config_set[
                                sr.guid])
                        storagedriver_config.save(client=clients['ovs'])

            # Everything's reconfigured, refresh new cluster configuration
            for current_storagedriver in vp_installer.vpool.storagedrivers:
                if current_storagedriver.storagerouter not in sr_client_map:
                    continue
                vp_installer.vpool.storagedriver_client.update_cluster_node_configs(
                    str(current_storagedriver.storagedriver_id),
                    req_timeout_secs=10)
        except Exception:
            cls._logger.exception('vPool {0}: Creation failed'.format(
                vp_installer.name))
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise

        # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
        # Scheduled tasks below, so don't really care whether they succeed or not
        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                        ensure_single_timeout=600)
        except:
            pass
        for vdisk in vp_installer.vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
            except:
                pass
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info('Add vPool {0} ended successfully'.format(
            vp_installer.name))
示例#4
0
    def mds_checkup():
        """
        Validates the current MDS setup/configuration and takes actions where required
        Actions:
            * Verify which StorageRouters are available
            * Make mapping between vPools and its StorageRouters
            * For each vPool make sure every StorageRouter has at least 1 MDS service with capacity available
            * For each vPool retrieve the optimal configuration and store it for each StorageDriver
            * For each vPool run an ensure safety for all vDisks
        :raises RuntimeError: When ensure safety fails for any vDisk
        :return: None
        :rtype: NoneType
        """
        MDSServiceController._logger.info('Started')

        # Verify StorageRouter availability
        root_client_cache = {}
        storagerouters = StorageRouterList.get_storagerouters()
        storagerouters.sort(key=lambda _sr: ExtensionsToolbox.advanced_sort(
            element=_sr.ip, separator='.'))
        offline_nodes = []
        for storagerouter in storagerouters:
            try:
                root_client = SSHClient(endpoint=storagerouter,
                                        username='******')
                MDSServiceController._logger.debug(
                    'StorageRouter {0} - ONLINE'.format(storagerouter.name))
            except UnableToConnectException:
                root_client = None
                offline_nodes.append(storagerouter)
                MDSServiceController._logger.error(
                    'StorageRouter {0} - OFFLINE'.format(storagerouter.name))
            root_client_cache[storagerouter] = root_client

        # Create mapping per vPool and its StorageRouters
        mds_dict = collections.OrderedDict()
        for vpool in sorted(VPoolList.get_vpools(), key=lambda k: k.name):
            MDSServiceController._logger.info('vPool {0}'.format(vpool.name))
            mds_dict[vpool] = {}

            # Loop all StorageDrivers and add StorageDriver to mapping
            for storagedriver in vpool.storagedrivers:
                storagerouter = storagedriver.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {
                        'client': root_client_cache.get(storagerouter),
                        'services': [],
                        'storagedriver': storagedriver
                    }

            # Loop all MDS Services and append services to appropriate vPool / StorageRouter combo
            mds_services = vpool.mds_services
            mds_services.sort(
                key=lambda _mds_service: ExtensionsToolbox.advanced_sort(
                    element=_mds_service.service.storagerouter.ip,
                    separator='.'))
            for mds_service in mds_services:
                service = mds_service.service
                storagerouter = service.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {
                        'client': root_client_cache.get(storagerouter),
                        'services': [],
                        'storagedriver': None
                    }
                MDSServiceController._logger.debug(
                    'vPool {0} - StorageRouter {1} - Service on port {2}'.
                    format(vpool.name, storagerouter.name, service.ports[0]))
                mds_dict[vpool][storagerouter]['services'].append(mds_service)

        failures = []
        for vpool, storagerouter_info in mds_dict.iteritems():
            # Make sure there's at least 1 MDS on every StorageRouter that's not overloaded
            # Remove all MDS Services which have been manually marked for removal (by setting its capacity to 0)
            max_load = Configuration.get(
                '/ovs/vpools/{0}/mds_config|mds_maxload'.format(vpool.guid))
            for storagerouter in sorted(storagerouter_info,
                                        key=lambda k: k.ip):
                total_load = 0.0
                root_client = mds_dict[vpool][storagerouter]['client']
                mds_services = mds_dict[vpool][storagerouter]['services']

                for mds_service in list(
                        sorted(mds_services, key=lambda k: k.number)):
                    port = mds_service.service.ports[0]
                    number = mds_service.number
                    # Manual intervention required here in order for the MDS to be cleaned up
                    # @TODO: Remove this and make a dynamic calculation to check which MDSes to remove
                    if mds_service.capacity == 0 and len(
                            mds_service.vdisks_guids) == 0:
                        MDSServiceController._logger.warning(
                            'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Removing'
                            .format(vpool.name, storagerouter.name, number,
                                    port))
                        try:
                            MDSServiceController.remove_mds_service(
                                mds_service=mds_service,
                                reconfigure=True,
                                allow_offline=root_client is None)
                        except Exception:
                            MDSServiceController._logger.exception(
                                'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Failed to remove'
                                .format(vpool.name, storagerouter.name, number,
                                        port))
                        mds_services.remove(mds_service)
                    else:
                        _, next_load = MDSServiceController.get_mds_load(
                            mds_service=mds_service)
                        if next_load == float('inf'):
                            total_load = sys.maxint * -1  # Cast to lowest possible value if any MDS service capacity is set to infinity
                        else:
                            total_load += next_load

                        if next_load < max_load:
                            MDSServiceController._logger.debug(
                                'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Capacity available - Load at {4}%'
                                .format(vpool.name, storagerouter.name, number,
                                        port, next_load))
                        else:
                            MDSServiceController._logger.debug(
                                'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: No capacity available - Load at {4}%'
                                .format(vpool.name, storagerouter.name, number,
                                        port, next_load))

                if total_load >= max_load * len(mds_services):
                    mds_services_to_add = int(
                        math.ceil((total_load - max_load * len(mds_services)) /
                                  max_load))
                    MDSServiceController._logger.info(
                        'vPool {0} - StorageRouter {1} - Average load per service {2:.2f}% - Max load per service {3:.2f}% - {4} MDS service{5} will be added'
                        .format(vpool.name, storagerouter.name,
                                total_load / len(mds_services), max_load,
                                mds_services_to_add,
                                '' if mds_services_to_add == 1 else 's'))

                    for _ in range(mds_services_to_add):
                        MDSServiceController._logger.info(
                            'vPool {0} - StorageRouter {1} - Adding new MDS Service'
                            .format(vpool.name, storagerouter.name))
                        try:
                            mds_services.append(
                                MDSServiceController.prepare_mds_service(
                                    storagerouter=storagerouter, vpool=vpool))
                        except Exception:
                            MDSServiceController._logger.exception(
                                'vPool {0} - StorageRouter {1} - Failed to create new MDS Service'
                                .format(vpool.name, storagerouter.name))

            # After potentially having added new MDSes, retrieve the optimal configuration
            mds_config_set = {}
            try:
                mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(
                    vpool=vpool, offline_nodes=offline_nodes)
                MDSServiceController._logger.debug(
                    'vPool {0} - Optimal configuration {1}'.format(
                        vpool.name, mds_config_set))
            except (NotFoundException, RuntimeError):
                MDSServiceController._logger.exception(
                    'vPool {0} - Failed to retrieve the optimal configuration'.
                    format(vpool.name))

            # Apply the optimal MDS configuration per StorageDriver
            for storagerouter in sorted(storagerouter_info,
                                        key=lambda k: k.ip):
                root_client = mds_dict[vpool][storagerouter]['client']
                storagedriver = mds_dict[vpool][storagerouter]['storagedriver']

                if storagedriver is None:
                    MDSServiceController._logger.critical(
                        'vPool {0} - StorageRouter {1} - No matching StorageDriver found'
                        .format(vpool.name, storagerouter.name))
                    continue
                if storagerouter.guid not in mds_config_set:
                    MDSServiceController._logger.critical(
                        'vPool {0} - StorageRouter {1} - Not marked as offline, but could not retrieve an optimal MDS config'
                        .format(vpool.name, storagerouter.name))
                    continue
                if root_client is None:
                    MDSServiceController._logger.debug(
                        'vPool {0} - StorageRouter {1} - Marked as offline, not setting optimal MDS configuration'
                        .format(vpool.name, storagerouter.name))
                    continue

                storagedriver_config = StorageDriverConfiguration(
                    vpool_guid=vpool.guid,
                    storagedriver_id=storagedriver.storagedriver_id)
                if storagedriver_config.config_missing is False:
                    optimal_mds_config = mds_config_set[storagerouter.guid]
                    MDSServiceController._logger.debug(
                        'vPool {0} - StorageRouter {1} - Storing optimal MDS configuration: {2}'
                        .format(vpool.name, storagerouter.name,
                                optimal_mds_config))
                    # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                    # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                    storagedriver_config.configure_filesystem(
                        fs_metadata_backend_mds_nodes=optimal_mds_config)
                    storagedriver_config.save(root_client)

            # Execute a safety check, making sure the master/slave configuration is optimal.
            MDSServiceController._logger.info(
                'vPool {0} - Ensuring safety for all vDisks'.format(
                    vpool.name))
            for vdisk in vpool.vdisks:
                try:
                    MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
                except Exception:
                    message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(
                        vdisk.name, vdisk.guid)
                    MDSServiceController._logger.exception(message)
                    failures.append(message)
        if len(failures) > 0:
            raise RuntimeError('\n - ' + '\n - '.join(failures))
        MDSServiceController._logger.info('Finished')
示例#5
0
    def mds_checkup_single(vpool_guid, mds_dict=None, offline_nodes=None):
        # type: (str, collections.OrderedDict, List[StorageRouter]) -> None
        """
        Validates the current MDS setup/configuration and takes actions where required
        Actions:
            * Verify which StorageRouters are available
            * Make mapping between vPools and its StorageRouters
            * For each vPool make sure every StorageRouter has at least 1 MDS service with capacity available
            * For each vPool retrieve the optimal configuration and store it for each StorageDriver
            * For each vPool run an ensure safety for all vDisks
        :param vpool_guid: Guid of the VPool to do the checkup for
        :type vpool_guid: str
        :param mds_dict: OrderedDict containing all mds related information
        :type mds_dict: collections.OrderedDict
        :param offline_nodes: Nodes that are marked as unreachable
        :type offline_nodes: List[StorageRouter]
        :raises RuntimeError: When ensure safety fails for any vDisk
        :return: None
        :rtype: NoneType
        :raises: MDSCheckupEnsureSafetyFailures when the ensure safety has failed for any vdisk
        """
        params_to_verify = [mds_dict, offline_nodes]
        vpool = VPool(vpool_guid)

        if any(p is not None for p in params_to_verify) and not all(
                p is not None for p in params_to_verify):
            raise ValueError(
                'Both mds_dict and offline_nodes must be given instead of providing either one'
            )
        if not mds_dict:
            mds_dict, offline_nodes = MDSServiceController._get_mds_information(
                [vpool])

        ensure_safety_failures = []
        storagerouter_info = mds_dict[vpool]
        # Make sure there's at least 1 MDS on every StorageRouter that's not overloaded
        # Remove all MDS Services which have been manually marked for removal (by setting its capacity to 0)
        max_load = Configuration.get(
            '/ovs/vpools/{0}/mds_config|mds_maxload'.format(vpool.guid))
        for storagerouter in sorted(storagerouter_info, key=lambda k: k.ip):
            total_load = 0.0
            root_client = mds_dict[vpool][storagerouter]['client']
            mds_services = mds_dict[vpool][storagerouter]['services']

            for mds_service in list(
                    sorted(mds_services, key=lambda k: k.number)):
                port = mds_service.service.ports[0]
                number = mds_service.number
                # Manual intervention required here in order for the MDS to be cleaned up
                # @TODO: Remove this and make a dynamic calculation to check which MDSes to remove
                if mds_service.capacity == 0 and len(
                        mds_service.vdisks_guids) == 0:
                    MDSServiceController._logger.warning(
                        'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Removing'
                        .format(vpool.name, storagerouter.name, number, port))
                    try:
                        MDSServiceController.remove_mds_service(
                            mds_service=mds_service,
                            reconfigure=True,
                            allow_offline=root_client is None)
                    except Exception:
                        MDSServiceController._logger.exception(
                            'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Failed to remove'
                            .format(vpool.name, storagerouter.name, number,
                                    port))
                    mds_services.remove(mds_service)
                else:
                    _, next_load = MDSServiceController.get_mds_load(
                        mds_service=mds_service)
                    if next_load == float('inf'):
                        total_load = sys.maxint * -1  # Cast to lowest possible value if any MDS service capacity is set to infinity
                    else:
                        total_load += next_load

                    if next_load < max_load:
                        MDSServiceController._logger.debug(
                            'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Capacity available - Load at {4}%'
                            .format(vpool.name, storagerouter.name, number,
                                    port, next_load))
                    else:
                        MDSServiceController._logger.debug(
                            'vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: No capacity available - Load at {4}%'
                            .format(vpool.name, storagerouter.name, number,
                                    port, next_load))

            if total_load >= max_load * len(mds_services):
                mds_services_to_add = int(
                    math.ceil((total_load - max_load * len(mds_services)) /
                              max_load))
                MDSServiceController._logger.info(
                    'vPool {0} - StorageRouter {1} - Average load per service {2:.2f}% - Max load per service {3:.2f}% - {4} MDS service{5} will be added'
                    .format(vpool.name, storagerouter.name,
                            total_load / len(mds_services), max_load,
                            mds_services_to_add,
                            '' if mds_services_to_add == 1 else 's'))

                for _ in range(mds_services_to_add):
                    MDSServiceController._logger.info(
                        'vPool {0} - StorageRouter {1} - Adding new MDS Service'
                        .format(vpool.name, storagerouter.name))
                    try:
                        mds_services.append(
                            MDSServiceController.prepare_mds_service(
                                storagerouter=storagerouter, vpool=vpool))
                    except Exception:
                        MDSServiceController._logger.exception(
                            'vPool {0} - StorageRouter {1} - Failed to create new MDS Service'
                            .format(vpool.name, storagerouter.name))

        # After potentially having added new MDSes, retrieve the optimal configuration
        mds_config_set = {}
        try:
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(
                vpool=vpool, offline_nodes=offline_nodes)
            MDSServiceController._logger.debug(
                'vPool {0} - Optimal configuration {1}'.format(
                    vpool.name, mds_config_set))
        except (NotFoundException, RuntimeError):
            MDSServiceController._logger.exception(
                'vPool {0} - Failed to retrieve the optimal configuration'.
                format(vpool.name))

        # Apply the optimal MDS configuration per StorageDriver
        for storagerouter in sorted(storagerouter_info, key=lambda k: k.ip):
            root_client = mds_dict[vpool][storagerouter]['client']
            storagedriver = mds_dict[vpool][storagerouter]['storagedriver']

            if storagedriver is None:
                MDSServiceController._logger.critical(
                    'vPool {0} - StorageRouter {1} - No matching StorageDriver found'
                    .format(vpool.name, storagerouter.name))
                continue
            if storagerouter.guid not in mds_config_set:
                MDSServiceController._logger.critical(
                    'vPool {0} - StorageRouter {1} - Not marked as offline, but could not retrieve an optimal MDS config'
                    .format(vpool.name, storagerouter.name))
                continue
            if root_client is None:
                MDSServiceController._logger.debug(
                    'vPool {0} - StorageRouter {1} - Marked as offline, not setting optimal MDS configuration'
                    .format(vpool.name, storagerouter.name))
                continue

            storagedriver_config = StorageDriverConfiguration(
                vpool_guid=vpool.guid,
                storagedriver_id=storagedriver.storagedriver_id)
            if storagedriver_config.config_missing is False:
                optimal_mds_config = mds_config_set[storagerouter.guid]
                MDSServiceController._logger.debug(
                    'vPool {0} - StorageRouter {1} - Storing optimal MDS configuration: {2}'
                    .format(vpool.name, storagerouter.name,
                            optimal_mds_config))
                # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                storagedriver_config.configure_filesystem(
                    fs_metadata_backend_mds_nodes=optimal_mds_config)
                storagedriver_config.save(root_client)

        # Execute a safety check, making sure the master/slave configuration is optimal.
        MDSServiceController._logger.info(
            'vPool {0} - Ensuring safety for all vDisks'.format(vpool.name))
        for vdisk in vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
            except Exception:
                message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(
                    vdisk.name, vdisk.guid)
                MDSServiceController._logger.exception(message)
                ensure_safety_failures.append(message)

        if ensure_safety_failures:
            raise MDSCheckupEnsureSafetyFailures(
                '\n - ' + '\n - '.join(ensure_safety_failures))
示例#6
0
    def mds_checkup():
        """
        Validates the current MDS setup/configuration and takes actions where required
        """
        logger.info('MDS checkup - Started')
        mds_dict = {}
        for vpool in VPoolList.get_vpools():
            logger.info('MDS checkup - vPool {0}'.format(vpool.name))
            mds_dict[vpool] = {}
            for mds_service in vpool.mds_services:
                storagerouter = mds_service.service.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {'client': None,
                                                      'services': []}
                    try:
                        client = SSHClient(storagerouter, username = '******')
                        client.run('pwd')
                        mds_dict[vpool][storagerouter]['client'] = client
                        logger.info('MDS checkup - vPool {0} - Storage Router {1} - ONLINE'.format(vpool.name, storagerouter.name))
                    except UnableToConnectException:
                        logger.info('MDS checkup - vPool {0} - Storage Router {1} - OFFLINE'.format(vpool.name, storagerouter.name))
                mds_dict[vpool][storagerouter]['services'].append(mds_service)

        failures = []
        max_load = EtcdConfiguration.get('/ovs/framework/storagedriver|mds_maxload')
        for vpool, storagerouter_info in mds_dict.iteritems():
            # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
            # If not, create an extra MDS for that StorageRouter
            for storagerouter in storagerouter_info:
                client = mds_dict[vpool][storagerouter]['client']
                mds_services = mds_dict[vpool][storagerouter]['services']
                has_room = False
                for mds_service in mds_services[:]:
                    if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                        logger.info('MDS checkup - Removing mds_service {0} for vPool {1}'.format(mds_service.number, vpool.name))
                        MDSServiceController.remove_mds_service(mds_service, vpool, reconfigure=True, allow_offline=client is None)
                        mds_services.remove(mds_service)
                for mds_service in mds_services:
                    _, load = MDSServiceController.get_mds_load(mds_service)
                    if load < max_load:
                        has_room = True
                        break
                logger.info('MDS checkup - vPool {0} - Storage Router {1} - Capacity available: {2}'.format(vpool.name, storagerouter.name, has_room))
                if has_room is False and client is not None:
                    mds_service = MDSServiceController.prepare_mds_service(storagerouter=storagerouter,
                                                                           vpool=vpool,
                                                                           fresh_only=False,
                                                                           reload_config=True)
                    if mds_service is None:
                        raise RuntimeError('Could not add MDS node')
                    mds_services.append(mds_service)
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool, True)
            for storagerouter in storagerouter_info:
                client = mds_dict[vpool][storagerouter]['client']
                if client is None:
                    logger.info('MDS checkup - vPool {0} - Storage Router {1} - Marked as offline, not setting default MDS configuration'.format(vpool.name, storagerouter.name))
                    continue
                storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.name)
                storagedriver_config.load(client)
                if storagedriver_config.is_new is False:
                    logger.info('MDS checkup - vPool {0} - Storage Router {1} - Storing default MDS configuration: {2}'.format(vpool.name, storagerouter.name, mds_config_set[storagerouter.guid]))
                    storagedriver_config.clean()  # Clean out obsolete values
                    storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_config_set[storagerouter.guid])
                    storagedriver_config.save(client)
            # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
            logger.info('MDS checkup - vPool {0} - Ensuring safety for all virtual disks'.format(vpool.name))
            for vdisk in vpool.vdisks:
                try:
                    MDSServiceController.ensure_safety(vdisk)
                except Exception as ex:
                    failures.append('Ensure safety for vDisk {0} with guid {1} failed with error: {2}'.format(vdisk.name, vdisk.guid, ex))
        if len(failures) > 0:
            raise Exception('\n - ' + '\n - '.join(failures))
        logger.info('MDS checkup - Finished')
示例#7
0
    def mds_checkup():
        """
        Validates the current MDS setup/configuration and takes actions where required
        """
        MDSServiceController._logger.info("MDS checkup - Started")
        mds_dict = {}
        for vpool in VPoolList.get_vpools():
            MDSServiceController._logger.info("MDS checkup - vPool {0}".format(vpool.name))
            mds_dict[vpool] = {}
            for mds_service in vpool.mds_services:
                storagerouter = mds_service.service.storagerouter
                if storagerouter not in mds_dict[vpool]:
                    mds_dict[vpool][storagerouter] = {"client": None, "services": []}
                    try:
                        mds_dict[vpool][storagerouter]["client"] = SSHClient(storagerouter, username="******")
                        MDSServiceController._logger.info(
                            "MDS checkup - vPool {0} - Storage Router {1} - ONLINE".format(
                                vpool.name, storagerouter.name
                            )
                        )
                    except UnableToConnectException:
                        MDSServiceController._logger.info(
                            "MDS checkup - vPool {0} - Storage Router {1} - OFFLINE".format(
                                vpool.name, storagerouter.name
                            )
                        )
                mds_dict[vpool][storagerouter]["services"].append(mds_service)

        failures = []
        max_load = Configuration.get("/ovs/framework/storagedriver|mds_maxload")
        for vpool, storagerouter_info in mds_dict.iteritems():
            # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
            # If not, create an extra MDS for that StorageRouter
            for storagerouter in storagerouter_info:
                client = mds_dict[vpool][storagerouter]["client"]
                mds_services = mds_dict[vpool][storagerouter]["services"]
                has_room = False
                for mds_service in mds_services[:]:
                    if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                        MDSServiceController._logger.info(
                            "MDS checkup - Removing mds_service {0} for vPool {1}".format(
                                mds_service.number, vpool.name
                            )
                        )
                        MDSServiceController.remove_mds_service(
                            mds_service, vpool, reconfigure=True, allow_offline=client is None
                        )
                        mds_services.remove(mds_service)
                for mds_service in mds_services:
                    _, load = MDSServiceController.get_mds_load(mds_service)
                    if load < max_load:
                        has_room = True
                        break
                MDSServiceController._logger.info(
                    "MDS checkup - vPool {0} - Storage Router {1} - Capacity available: {2}".format(
                        vpool.name, storagerouter.name, has_room
                    )
                )
                if has_room is False and client is not None:
                    mds_service = MDSServiceController.prepare_mds_service(
                        storagerouter=storagerouter, vpool=vpool, fresh_only=False, reload_config=True
                    )
                    if mds_service is None:
                        raise RuntimeError("Could not add MDS node")
                    mds_services.append(mds_service)
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool, True)
            for storagerouter in storagerouter_info:
                client = mds_dict[vpool][storagerouter]["client"]
                if client is None:
                    MDSServiceController._logger.info(
                        "MDS checkup - vPool {0} - Storage Router {1} - Marked as offline, not setting default MDS configuration".format(
                            vpool.name, storagerouter.name
                        )
                    )
                    continue
                storagedriver = [sd for sd in storagerouter.storagedrivers if sd.vpool_guid == vpool.guid][0]
                storagedriver_config = StorageDriverConfiguration(
                    "storagedriver", vpool.guid, storagedriver.storagedriver_id
                )
                storagedriver_config.load()
                if storagedriver_config.is_new is False:
                    MDSServiceController._logger.info(
                        "MDS checkup - vPool {0} - Storage Router {1} - Storing default MDS configuration: {2}".format(
                            vpool.name, storagerouter.name, mds_config_set[storagerouter.guid]
                        )
                    )
                    storagedriver_config.configure_filesystem(
                        fs_metadata_backend_mds_nodes=mds_config_set[storagerouter.guid]
                    )
                    storagedriver_config.save(client)
            # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
            MDSServiceController._logger.info(
                "MDS checkup - vPool {0} - Ensuring safety for all virtual disks".format(vpool.name)
            )
            for vdisk in vpool.vdisks:
                try:
                    MDSServiceController.ensure_safety(vdisk)
                except Exception:
                    message = "Ensure safety for vDisk {0} with guid {1} failed".format(vdisk.name, vdisk.guid)
                    MDSServiceController._logger.exception(message)
                    failures.append(message)
        if len(failures) > 0:
            raise Exception("\n - " + "\n - ".join(failures))
        MDSServiceController._logger.info("MDS checkup - Finished")
示例#8
0
    def configure_storagedriver_service(self):
        """
        Configure the StorageDriver service
        :return: None
        :rtype: NoneType
        """
        def _generate_queue_urls():
            mq_user = Configuration.get('/ovs/framework/messagequeue|user')
            mq_protocol = Configuration.get('/ovs/framework/messagequeue|protocol')
            mq_password = Configuration.get('/ovs/framework/messagequeue|password')
            return [{'amqp_uri': '{0}://{1}:{2}@{3}:5672'.format(mq_protocol, mq_user, mq_password, sr.ip)} for sr in StorageRouterList.get_masters()]

        def _generate_config_file_system():
            config = {'fs_dtl_host': '',
                      'fs_enable_shm_interface': 0,
                      'fs_enable_network_interface': 1,
                      'fs_metadata_backend_arakoon_cluster_nodes': [],
                      'fs_metadata_backend_mds_nodes': [],
                      'fs_metadata_backend_type': 'MDS',
                      'fs_virtual_disk_format': 'raw',
                      'fs_raw_disk_suffix': '.raw',
                      'fs_file_event_rules': [{'fs_file_event_rule_calls': ['Rename'],
                                               'fs_file_event_rule_path_regex': '.*'}]}
            if self.dtl_mode == StorageDriverClient.FRAMEWORK_DTL_NO_SYNC:
                config['fs_dtl_config_mode'] = StorageDriverClient.VOLDRV_DTL_MANUAL_MODE
            else:
                config['fs_dtl_mode'] = StorageDriverClient.VPOOL_DTL_MODE_MAP[self.dtl_mode]
                config['fs_dtl_config_mode'] = StorageDriverClient.VOLDRV_DTL_AUTOMATIC_MODE
            return config

        def _generate_config_backend_connection_manager():
            config = {'backend_type': 'MULTI',
                      'backend_interface_retries_on_error': 5,
                      'backend_interface_retry_interval_secs': 1,
                      'backend_interface_retry_backoff_multiplier': 2.0}
            for index, proxy in enumerate(sorted(self.storagedriver.alba_proxies, key=lambda k: k.service.ports[0])):
                config[str(index)] = {'alba_connection_host': self.storagedriver.storage_ip,
                                      'alba_connection_port': proxy.service.ports[0],
                                      'alba_connection_preset': vpool.metadata['backend']['backend_info']['preset'],
                                      'alba_connection_timeout': 30,
                                      'alba_connection_use_rora': True,
                                      'alba_connection_transport': 'TCP',
                                      'alba_connection_rora_manifest_cache_capacity': 25000,
                                      'alba_connection_asd_connection_pool_capacity': 10,
                                      'alba_connection_rora_timeout_msecs': 50,
                                      'backend_type': 'ALBA'}
            return config

        if self.sr_installer is None:
            raise RuntimeError('No StorageRouterInstaller instance found')
        if len(self.write_caches) == 0:
            raise RuntimeError('The StorageDriverPartition junctions have not been created yet')

        vpool = self.vp_installer.vpool
        gap_configuration = StorageDriverController.calculate_trigger_and_backoff_gap(cache_size=self.sr_installer.smallest_write_partition_size)
        arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|voldrv'))
        arakoon_nodes = [{'host': node.ip,
                          'port': node.client_port,
                          'node_id': node.name} for node in ArakoonClusterConfig(cluster_id=arakoon_cluster_name).nodes]

        storagedriver_config = StorageDriverConfiguration(vpool.guid, self.storagedriver.storagedriver_id)
        storagedriver_config.configure_scocache(scocache_mount_points=self.write_caches,
                                                trigger_gap=ExtensionsToolbox.convert_byte_size_to_human_readable(size=gap_configuration['trigger']),
                                                backoff_gap=ExtensionsToolbox.convert_byte_size_to_human_readable(size=gap_configuration['backoff']))
        storagedriver_config.configure_file_driver(fd_cache_path=self.storagedriver_partition_file_driver.path,
                                                   fd_extent_cache_capacity='1024',
                                                   fd_namespace='fd-{0}-{1}'.format(vpool.name, vpool.guid))
        storagedriver_config.configure_volume_router(vrouter_id=self.storagedriver.storagedriver_id,
                                                     vrouter_redirect_timeout_ms='120000',
                                                     vrouter_keepalive_time_secs='15',
                                                     vrouter_keepalive_interval_secs='5',
                                                     vrouter_keepalive_retries='2',
                                                     vrouter_routing_retries=10,
                                                     vrouter_volume_read_threshold=0,
                                                     vrouter_volume_write_threshold=0,
                                                     vrouter_file_read_threshold=0,
                                                     vrouter_file_write_threshold=0,
                                                     vrouter_min_workers=4,
                                                     vrouter_max_workers=16,
                                                     vrouter_sco_multiplier=self.sco_size * 1024 / self.cluster_size,
                                                     vrouter_backend_sync_timeout_ms=60000,
                                                     vrouter_migrate_timeout_ms=60000,
                                                     vrouter_use_fencing=True)
        storagedriver_config.configure_volume_manager(tlog_path=self.storagedriver_partition_tlogs.path,
                                                      metadata_path=self.storagedriver_partition_metadata.path,
                                                      clean_interval=1,
                                                      dtl_throttle_usecs=4000,
                                                      default_cluster_size=self.cluster_size * 1024,
                                                      number_of_scos_in_tlog=self.tlog_multiplier,
                                                      non_disposable_scos_factor=float(self.write_buffer) / self.tlog_multiplier / self.sco_size)
        storagedriver_config.configure_event_publisher(events_amqp_routing_key=Configuration.get('/ovs/framework/messagequeue|queues.storagedriver'),
                                                       events_amqp_uris=_generate_queue_urls())
        storagedriver_config.configure_volume_registry(vregistry_arakoon_cluster_id=arakoon_cluster_name,
                                                       vregistry_arakoon_cluster_nodes=arakoon_nodes)
        storagedriver_config.configure_network_interface(network_max_neighbour_distance=StorageDriver.DISTANCES.FAR - 1)
        storagedriver_config.configure_threadpool_component(num_threads=16)
        storagedriver_config.configure_volume_router_cluster(vrouter_cluster_id=vpool.guid)
        storagedriver_config.configure_distributed_lock_store(dls_type='Arakoon',
                                                              dls_arakoon_cluster_id=arakoon_cluster_name,
                                                              dls_arakoon_cluster_nodes=arakoon_nodes)
        storagedriver_config.configure_content_addressed_cache(serialize_read_cache=False,
                                                               read_cache_serialization_path=[])
        storagedriver_config.configure_distributed_transaction_log(dtl_path=self.storagedriver_partition_dtl.path,  # Not used, but required
                                                                   dtl_transport=StorageDriverClient.VPOOL_DTL_TRANSPORT_MAP[self.dtl_transport])

        storagedriver_config.configure_filesystem(**_generate_config_file_system())
        storagedriver_config.configure_backend_connection_manager(**_generate_config_backend_connection_manager())

        storagedriver_config.save(client=self.sr_installer.root_client)