def mds_checkup():
    """
    Validates the current MDS setup/configuration and takes actions where required

    Actions, per vPool that has MDS services:
        * Remove every MDS service which has capacity 0 and no vDisks
        * Prepare an extra MDS service on each StorageRouter on which no remaining service has a load below the configured maximum
        * Store the MDS node configuration in the (already existing) StorageDriver configuration of each StorageRouter
        * Run an ensure safety for all vDisks
    :raises RuntimeError: When an extra MDS service was required but could not be prepared
    :return: None
    """
    # Map every vPool to the StorageRouters hosting its MDS services, with 1 cached SSH client per StorageRouter
    mds_dict = {}
    for vpool in VPoolList.get_vpools():
        for mds_service in vpool.mds_services:
            storagerouter = mds_service.service.storagerouter
            if vpool not in mds_dict:
                mds_dict[vpool] = {}
            if storagerouter not in mds_dict[vpool]:
                mds_dict[vpool][storagerouter] = {'client': SSHClient(storagerouter, username='******'),
                                                  'services': []}
            mds_dict[vpool][storagerouter]['services'].append(mds_service)

    # The maximum load does not change while looping, so it is only retrieved once
    max_load = Configuration.get('ovs.storagedriver.mds.maxload')
    for vpool, storagerouter_info in mds_dict.iteritems():
        # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
        # If not, create an extra MDS for that StorageRouter
        for storagerouter in storagerouter_info:
            client = storagerouter_info[storagerouter]['client']
            mds_services = storagerouter_info[storagerouter]['services']
            # Loop over a copy, because services are removed from the list while iterating
            for mds_service in mds_services[:]:
                if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                    # Use a dedicated name for the removal client, so the cached client is still the one
                    # handed to prepare_mds_service below, regardless of whether a removal took place
                    removal_client = SSHClient(storagerouter)
                    MDSServiceController.remove_mds_service(mds_service, removal_client, storagerouter, vpool, reload_config=True)
                    mds_services.remove(mds_service)

            has_room = False
            for mds_service in mds_services:
                _, load = MDSServiceController.get_mds_load(mds_service)
                if load < max_load:
                    has_room = True
                    break
            if has_room is False:
                mds_service = MDSServiceController.prepare_mds_service(client, storagerouter, vpool, fresh_only=False, reload_config=True)
                if mds_service is None:
                    raise RuntimeError('Could not add MDS node')
                mds_services.append(mds_service)

        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool)
        for storagerouter in storagerouter_info:
            client = storagerouter_info[storagerouter]['client']
            storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.name)
            storagedriver_config.load(client)
            # Only configurations which already exist are updated
            if storagedriver_config.is_new is False:
                storagedriver_config.clean()  # Clean out obsolete values
                storagedriver_config.configure_filesystem(
                    fs_metadata_backend_mds_nodes=mds_config_set[storagerouter.guid]
                )
                storagedriver_config.save(client)

        # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
        for vdisk in vpool.vdisks:
            MDSServiceController.ensure_safety(vdisk)
def mds_checkup():
    """
    Validates the current MDS setup/configuration and takes actions where required

    Actions:
        * Map every vPool to the StorageRouters hosting its MDS services and try to open an SSH connection to each of them
        * Remove every MDS service which has capacity 0 and no vDisks
        * Prepare an extra MDS service on each reachable StorageRouter on which no service has a load below the configured maximum
        * Store the MDS configuration in the (already existing) StorageDriver configuration of each reachable StorageRouter
        * Run an ensure safety for all vDisks
    :raises RuntimeError: When an extra MDS service was required but could not be prepared
    :raises Exception: When ensure safety failed for at least 1 vDisk (raised after all vPools have been processed)
    :return: None
    :rtype: NoneType
    """
    MDSServiceController._logger.info('MDS checkup - Started')
    mds_dict = {}
    for vpool in VPoolList.get_vpools():
        MDSServiceController._logger.info('MDS checkup - vPool {0}'.format(vpool.name))
        mds_dict[vpool] = {}
        for mds_service in vpool.mds_services:
            storagerouter = mds_service.service.storagerouter
            if storagerouter not in mds_dict[vpool]:
                # A client of None marks the StorageRouter as offline for the remainder of the checkup
                mds_dict[vpool][storagerouter] = {'client': None,
                                                  'services': []}
                try:
                    mds_dict[vpool][storagerouter]['client'] = SSHClient(storagerouter, username='******')
                    MDSServiceController._logger.info('MDS checkup - vPool {0} - Storage Router {1} - ONLINE'.format(vpool.name, storagerouter.name))
                except UnableToConnectException:
                    MDSServiceController._logger.info('MDS checkup - vPool {0} - Storage Router {1} - OFFLINE'.format(vpool.name, storagerouter.name))
            mds_dict[vpool][storagerouter]['services'].append(mds_service)

    failures = []
    max_load = Configuration.get('/ovs/framework/storagedriver|mds_maxload')
    for vpool, storagerouter_info in mds_dict.iteritems():
        # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
        # If not, create an extra MDS for that StorageRouter
        for storagerouter in storagerouter_info:
            client = storagerouter_info[storagerouter]['client']
            mds_services = storagerouter_info[storagerouter]['services']
            has_room = False
            # Loop over a copy, because services are removed from the list while iterating
            for mds_service in mds_services[:]:
                if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                    MDSServiceController._logger.info('MDS checkup - Removing mds_service {0} for vPool {1}'.format(mds_service.number, vpool.name))
                    MDSServiceController.remove_mds_service(mds_service, vpool, reconfigure=True, allow_offline=client is None)
                    mds_services.remove(mds_service)
            for mds_service in mds_services:
                _, load = MDSServiceController.get_mds_load(mds_service)
                if load < max_load:
                    has_room = True
                    break
            MDSServiceController._logger.info('MDS checkup - vPool {0} - Storage Router {1} - Capacity available: {2}'.format(vpool.name, storagerouter.name, has_room))
            # New MDS services are only prepared on StorageRouters which are reachable
            if has_room is False and client is not None:
                mds_service = MDSServiceController.prepare_mds_service(storagerouter=storagerouter,
                                                                       vpool=vpool,
                                                                       fresh_only=False,
                                                                       reload_config=True)
                if mds_service is None:
                    raise RuntimeError('Could not add MDS node')
                mds_services.append(mds_service)

        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool, True)
        for storagerouter in storagerouter_info:
            client = storagerouter_info[storagerouter]['client']
            if client is None:
                MDSServiceController._logger.info('MDS checkup - vPool {0} - Storage Router {1} - Marked as offline, not setting default MDS configuration'.format(vpool.name, storagerouter.name))
                continue

            # A StorageRouter can host an MDS service for this vPool without having a StorageDriver for it
            # Skip it in that case instead of aborting the checkup for all remaining vPools with an IndexError
            storagedriver = next((sd for sd in storagerouter.storagedrivers if sd.vpool_guid == vpool.guid), None)
            if storagedriver is None:
                MDSServiceController._logger.warning('MDS checkup - vPool {0} - Storage Router {1} - No matching StorageDriver found, not setting default MDS configuration'.format(vpool.name, storagerouter.name))
                continue

            storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.guid, storagedriver.storagedriver_id)
            storagedriver_config.load()
            # Only configurations which already exist are updated
            if storagedriver_config.is_new is False:
                MDSServiceController._logger.info('MDS checkup - vPool {0} - Storage Router {1} - Storing default MDS configuration: {2}'.format(vpool.name, storagerouter.name, mds_config_set[storagerouter.guid]))
                storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_config_set[storagerouter.guid])
                storagedriver_config.save(client)

        # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
        MDSServiceController._logger.info('MDS checkup - vPool {0} - Ensuring safety for all virtual disks'.format(vpool.name))
        for vdisk in vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk)
            except Exception:
                # Keep going for the other vDisks, all failures are reported together at the end
                message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(vdisk.name, vdisk.guid)
                MDSServiceController._logger.exception(message)
                failures.append(message)
    if len(failures) > 0:
        raise Exception('\n - ' + '\n - '.join(failures))
    MDSServiceController._logger.info('MDS checkup - Finished')
def add_vpool(cls, parameters):
    """
    Add a vPool to the machine this task is running on
    The vPool is either newly created or extended to the StorageRouter identified by parameters['storagerouter_ip']
    :param parameters: Parameters for vPool creation
    :type parameters: dict
    :raises ValueError: When the parameters passed are not of type dict
    :raises RuntimeError: When the StorageRouter could not be found or is not reachable, or when the Arakoon checkup
                          for the StorageDriver cluster could not be started
    :return: None
    :rtype: NoneType
    """
    # TODO: Add logging
    # NOTE(review): parameters presumably contain backend connection details - confirm these are safe to log
    cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
    # VALIDATIONS
    if not isinstance(parameters, dict):
        raise ValueError('Parameters passed to create a vPool should be of type dict')

    # Check StorageRouter existence
    storagerouter = StorageRouterList.get_by_ip(ip=parameters.get('storagerouter_ip'))
    if storagerouter is None:
        raise RuntimeError('Could not find StorageRouter')

    # Validate requested vPool configurations
    vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
    vp_installer.validate(storagerouter=storagerouter)

    # Validate requested StorageDriver configurations
    cls._logger.info('vPool {0}: Validating StorageDriver configurations'.format(vp_installer.name))
    sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                          configurations={'storage_ip': parameters.get('storage_ip'),
                                                          'caching_info': parameters.get('caching_info'),
                                                          'backend_info': {'main': parameters.get('backend_info'),
                                                                           StorageDriverConfiguration.CACHE_BLOCK: parameters.get('backend_info_bc'),
                                                                           StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('backend_info_fc')},
                                                          'connection_info': {'main': parameters.get('connection_info'),
                                                                              StorageDriverConfiguration.CACHE_BLOCK: parameters.get('connection_info_bc'),
                                                                              StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('connection_info_fc')},
                                                          'sd_configuration': parameters.get('config_params')})

    partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(storagerouter.guid))
    try:
        # VPOOL CREATION
        # Create the vPool as soon as possible in the process to be displayed in the GUI (INSTALLING/EXTENDING state)
        if vp_installer.is_new is True:
            vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
            vp_installer.configure_mds(config=parameters.get('mds_config_params', {}))
        else:
            vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

        # ADDITIONAL VALIDATIONS
        # Check StorageRouter connectivity
        cls._logger.info('vPool {0}: Validating StorageRouter connectivity'.format(vp_installer.name))
        linked_storagerouters = [storagerouter]
        if vp_installer.is_new is False:
            linked_storagerouters += [sd.storagerouter for sd in vp_installer.vpool.storagedrivers]

        sr_client_map = SSHClient.get_clients(endpoints=linked_storagerouters, user_names=['ovs', 'root'])
        # The 'offline' entry is popped, so only StorageRouter -> clients entries remain in the map
        offline_nodes = sr_client_map.pop('offline')
        if storagerouter in offline_nodes:
            raise RuntimeError('Node on which the vPool is being {0} is not reachable'.format('created' if vp_installer.is_new is True else 'extended'))

        sr_installer = StorageRouterInstaller(root_client=sr_client_map[storagerouter]['root'],
                                              sd_installer=sd_installer,
                                              vp_installer=vp_installer,
                                              storagerouter=storagerouter)

        # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in locked context
        partitions_mutex.acquire(wait=60)
        sr_installer.partition_info = StorageRouterController.get_partition_info(storagerouter_guid=storagerouter.guid)
        sr_installer.validate_vpool_extendable()
        sr_installer.validate_global_write_buffer(requested_size=parameters.get('writecache_size', 0))
        sr_installer.validate_local_cache_size(requested_proxies=parameters.get('parallelism', {}).get('proxies', 2))

        # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
        sd_installer.create()
        sd_installer.create_partitions()
        partitions_mutex.release()

        vp_installer.refresh_metadata()
    except Exception:
        cls._logger.exception('Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'.format(vp_installer.name, storagerouter.name))
        partitions_mutex.release()
        vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        raise

    # Arakoon setup
    # Poll once per second, for at most 300 attempts, until the checkup reports success
    counter = 0
    while counter < 300:
        try:
            if StorageDriverController.manual_voldrv_arakoon_checkup() is True:
                break
        except Exception:
            cls._logger.exception('Arakoon checkup for voldrv cluster failed')
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise
        counter += 1
        time.sleep(1)
    if counter == 300:
        vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        raise RuntimeError('Arakoon checkup for the StorageDriver cluster could not be started')

    # Cluster registry
    try:
        vp_installer.configure_cluster_registry(allow_raise=True)
    except Exception:
        if vp_installer.is_new is True:
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        else:
            vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
        raise

    try:
        sd_installer.setup_proxy_configs()
        sd_installer.configure_storagedriver_service()
        DiskController.sync_with_reality(storagerouter.guid)
        MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vp_installer.vpool)

        # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
        vp_installer.vpool.invalidate_dynamics('configuration')
        if vp_installer.mds_safety is not None and vp_installer.vpool.configuration['mds_config']['mds_safety'] != vp_installer.mds_safety:
            Configuration.set(key='/ovs/vpools/{0}/mds_config|mds_safety'.format(vp_installer.vpool.guid), value=vp_installer.mds_safety)

        sd_installer.start_services()  # Create and start watcher volumedriver, DTL, proxies and StorageDriver services

        # Post creation/extension checkups
        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vp_installer.vpool, offline_nodes=offline_nodes)
        for sr, clients in sr_client_map.iteritems():
            for current_storagedriver in [sd for sd in sr.storagedrivers if sd.vpool_guid == vp_installer.vpool.guid]:
                storagedriver_config = StorageDriverConfiguration(vpool_guid=vp_installer.vpool.guid,
                                                                  storagedriver_id=current_storagedriver.storagedriver_id)
                if storagedriver_config.config_missing is False:
                    # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                    # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                    storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_config_set[sr.guid])
                    storagedriver_config.save(client=clients['ovs'])

        # Everything's reconfigured, refresh new cluster configuration
        for current_storagedriver in vp_installer.vpool.storagedrivers:
            if current_storagedriver.storagerouter not in sr_client_map:
                continue
            vp_installer.vpool.storagedriver_client.update_cluster_node_configs(str(current_storagedriver.storagedriver_id), req_timeout_secs=10)
    except Exception:
        cls._logger.exception('vPool {0}: Creation failed'.format(vp_installer.name))
        vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise

    # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
    # Scheduled tasks below, so don't really care whether they succeed or not
    # Failures are logged but never re-raised; only Exception is caught, so KeyboardInterrupt/SystemExit still propagate
    try:
        VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
    except Exception:
        cls._logger.exception('vPool {0}: DTL checkup failed'.format(vp_installer.name))
    for vdisk in vp_installer.vpool.vdisks:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
        except Exception:
            cls._logger.exception('vPool {0}: Ensure safety for vDisk with guid {1} failed'.format(vp_installer.name, vdisk.guid))
    vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('Add vPool {0} ended successfully'.format(vp_installer.name))
def mds_checkup():
    """
    Validates the current MDS setup/configuration and takes actions where required
    Actions:
        * Verify which StorageRouters are available
        * Make mapping between vPools and its StorageRouters
        * For each vPool make sure every StorageRouter has at least 1 MDS service with capacity available
        * For each vPool retrieve the optimal configuration and store it for each StorageDriver
        * For each vPool run an ensure safety for all vDisks
    :raises RuntimeError: When ensure safety fails for any vDisk
    :return: None
    :rtype: NoneType
    """
    MDSServiceController._logger.info('Started')

    # Verify StorageRouter availability
    root_client_cache = {}
    storagerouters = StorageRouterList.get_storagerouters()
    storagerouters.sort(key=lambda _sr: ExtensionsToolbox.advanced_sort(element=_sr.ip, separator='.'))
    offline_nodes = []
    for storagerouter in storagerouters:
        try:
            root_client = SSHClient(endpoint=storagerouter, username='******')
            MDSServiceController._logger.debug('StorageRouter {0} - ONLINE'.format(storagerouter.name))
        except UnableToConnectException:
            # A client of None marks the StorageRouter as offline for the remainder of the checkup
            root_client = None
            offline_nodes.append(storagerouter)
            MDSServiceController._logger.error('StorageRouter {0} - OFFLINE'.format(storagerouter.name))
        root_client_cache[storagerouter] = root_client

    # Create mapping per vPool and its StorageRouters
    mds_dict = collections.OrderedDict()
    for vpool in sorted(VPoolList.get_vpools(), key=lambda k: k.name):
        MDSServiceController._logger.info('vPool {0}'.format(vpool.name))
        mds_dict[vpool] = {}

        # Loop all StorageDrivers and add StorageDriver to mapping
        for storagedriver in vpool.storagedrivers:
            storagerouter = storagedriver.storagerouter
            if storagerouter not in mds_dict[vpool]:
                mds_dict[vpool][storagerouter] = {'client': root_client_cache.get(storagerouter),
                                                  'services': [],
                                                  'storagedriver': storagedriver}

        # Loop all MDS Services and append services to appropriate vPool / StorageRouter combo
        mds_services = vpool.mds_services
        mds_services.sort(key=lambda _mds_service: ExtensionsToolbox.advanced_sort(element=_mds_service.service.storagerouter.ip, separator='.'))
        for mds_service in mds_services:
            service = mds_service.service
            storagerouter = service.storagerouter
            if storagerouter not in mds_dict[vpool]:
                # MDS service on a StorageRouter without a StorageDriver for this vPool
                mds_dict[vpool][storagerouter] = {'client': root_client_cache.get(storagerouter),
                                                  'services': [],
                                                  'storagedriver': None}
            MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - Service on port {2}'.format(vpool.name, storagerouter.name, service.ports[0]))
            mds_dict[vpool][storagerouter]['services'].append(mds_service)

    failures = []
    for vpool, storagerouter_info in mds_dict.iteritems():
        # Make sure there's at least 1 MDS on every StorageRouter that's not overloaded
        # Remove all MDS Services which have been manually marked for removal (by setting its capacity to 0)
        max_load = Configuration.get('/ovs/vpools/{0}/mds_config|mds_maxload'.format(vpool.guid))
        for storagerouter in sorted(storagerouter_info, key=lambda k: k.ip):
            total_load = 0.0
            root_client = storagerouter_info[storagerouter]['client']
            mds_services = storagerouter_info[storagerouter]['services']

            # Loop over a sorted copy, because services are removed from the list while iterating
            for mds_service in list(sorted(mds_services, key=lambda k: k.number)):
                port = mds_service.service.ports[0]
                number = mds_service.number
                # Manual intervention required here in order for the MDS to be cleaned up
                # @TODO: Remove this and make a dynamic calculation to check which MDSes to remove
                if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                    MDSServiceController._logger.warning('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Removing'.format(vpool.name, storagerouter.name, number, port))
                    try:
                        MDSServiceController.remove_mds_service(mds_service=mds_service,
                                                                reconfigure=True,
                                                                allow_offline=root_client is None)
                    except Exception:
                        MDSServiceController._logger.exception('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Failed to remove'.format(vpool.name, storagerouter.name, number, port))
                    # The service is dropped from the local list whether or not the removal succeeded
                    mds_services.remove(mds_service)
                else:
                    _, next_load = MDSServiceController.get_mds_load(mds_service=mds_service)
                    if next_load == float('inf'):
                        total_load = sys.maxint * -1  # Cast to lowest possible value if any MDS service capacity is set to infinity
                    else:
                        total_load += next_load

                    if next_load < max_load:
                        MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Capacity available - Load at {4}%'.format(vpool.name, storagerouter.name, number, port, next_load))
                    else:
                        MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: No capacity available - Load at {4}%'.format(vpool.name, storagerouter.name, number, port, next_load))

            service_count = len(mds_services)
            if total_load >= max_load * service_count:
                mds_services_to_add = int(math.ceil((total_load - max_load * service_count) / max_load))
                # All services may have been removed above, in which case this branch is reached with 0 services
                # Report an average of 0 then instead of raising a ZeroDivisionError which would abort the checkup
                average_load = total_load / service_count if service_count > 0 else 0.0
                MDSServiceController._logger.info('vPool {0} - StorageRouter {1} - Average load per service {2:.2f}% - Max load per service {3:.2f}% - {4} MDS service{5} will be added'.format(vpool.name, storagerouter.name, average_load, max_load, mds_services_to_add, '' if mds_services_to_add == 1 else 's'))

                for _ in range(mds_services_to_add):
                    MDSServiceController._logger.info('vPool {0} - StorageRouter {1} - Adding new MDS Service'.format(vpool.name, storagerouter.name))
                    try:
                        mds_services.append(MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vpool))
                    except Exception:
                        MDSServiceController._logger.exception('vPool {0} - StorageRouter {1} - Failed to create new MDS Service'.format(vpool.name, storagerouter.name))

        # After potentially having added new MDSes, retrieve the optimal configuration
        mds_config_set = {}
        try:
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vpool, offline_nodes=offline_nodes)
            MDSServiceController._logger.debug('vPool {0} - Optimal configuration {1}'.format(vpool.name, mds_config_set))
        except (NotFoundException, RuntimeError):
            MDSServiceController._logger.exception('vPool {0} - Failed to retrieve the optimal configuration'.format(vpool.name))

        # Apply the optimal MDS configuration per StorageDriver
        for storagerouter in sorted(storagerouter_info, key=lambda k: k.ip):
            root_client = storagerouter_info[storagerouter]['client']
            storagedriver = storagerouter_info[storagerouter]['storagedriver']

            if storagedriver is None:
                MDSServiceController._logger.critical('vPool {0} - StorageRouter {1} - No matching StorageDriver found'.format(vpool.name, storagerouter.name))
                continue
            if storagerouter.guid not in mds_config_set:
                MDSServiceController._logger.critical('vPool {0} - StorageRouter {1} - Not marked as offline, but could not retrieve an optimal MDS config'.format(vpool.name, storagerouter.name))
                continue
            if root_client is None:
                MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - Marked as offline, not setting optimal MDS configuration'.format(vpool.name, storagerouter.name))
                continue

            storagedriver_config = StorageDriverConfiguration(vpool_guid=vpool.guid, storagedriver_id=storagedriver.storagedriver_id)
            if storagedriver_config.config_missing is False:
                optimal_mds_config = mds_config_set[storagerouter.guid]
                MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - Storing optimal MDS configuration: {2}'.format(vpool.name, storagerouter.name, optimal_mds_config))
                # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=optimal_mds_config)
                storagedriver_config.save(root_client)

        # Execute a safety check, making sure the master/slave configuration is optimal.
        MDSServiceController._logger.info('vPool {0} - Ensuring safety for all vDisks'.format(vpool.name))
        for vdisk in vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
            except Exception:
                # Keep going for the other vDisks, all failures are reported together at the end
                message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(vdisk.name, vdisk.guid)
                MDSServiceController._logger.exception(message)
                failures.append(message)
    if len(failures) > 0:
        raise RuntimeError('\n - ' + '\n - '.join(failures))
    MDSServiceController._logger.info('Finished')
def mds_checkup_single(vpool_guid, mds_dict=None, offline_nodes=None):
    # type: (str, collections.OrderedDict, List[StorageRouter]) -> None
    """
    Validates the current MDS setup/configuration of a single vPool and takes actions where required
    Actions:
        * Retrieve the MDS information (mapping and offline nodes) for the vPool when it was not passed in
        * Make sure every StorageRouter has at least 1 MDS service with capacity available
        * Retrieve the optimal configuration and store it for each StorageDriver
        * Run an ensure safety for all vDisks
    :param vpool_guid: Guid of the VPool to do the checkup for
    :type vpool_guid: str
    :param mds_dict: OrderedDict containing all mds related information
    :type mds_dict: collections.OrderedDict
    :param offline_nodes: Nodes that are marked as unreachable
    :type offline_nodes: List[StorageRouter]
    :raises ValueError: When only one of mds_dict and offline_nodes is provided
    :raises MDSCheckupEnsureSafetyFailures: When the ensure safety has failed for any vDisk
    :return: None
    :rtype: NoneType
    """
    params_to_verify = [mds_dict, offline_nodes]
    vpool = VPool(vpool_guid)

    # mds_dict and offline_nodes belong together: either both are passed or neither of them
    if any(p is not None for p in params_to_verify) and not all(p is not None for p in params_to_verify):
        raise ValueError('Both mds_dict and offline_nodes must be given instead of providing either one')
    if not mds_dict:
        mds_dict, offline_nodes = MDSServiceController._get_mds_information([vpool])

    ensure_safety_failures = []
    storagerouter_info = mds_dict[vpool]
    # Make sure there's at least 1 MDS on every StorageRouter that's not overloaded
    # Remove all MDS Services which have been manually marked for removal (by setting its capacity to 0)
    max_load = Configuration.get('/ovs/vpools/{0}/mds_config|mds_maxload'.format(vpool.guid))
    for storagerouter in sorted(storagerouter_info, key=lambda k: k.ip):
        total_load = 0.0
        root_client = storagerouter_info[storagerouter]['client']
        mds_services = storagerouter_info[storagerouter]['services']

        # Loop over a sorted copy, because services are removed from the list while iterating
        for mds_service in list(sorted(mds_services, key=lambda k: k.number)):
            port = mds_service.service.ports[0]
            number = mds_service.number
            # Manual intervention required here in order for the MDS to be cleaned up
            # @TODO: Remove this and make a dynamic calculation to check which MDSes to remove
            if mds_service.capacity == 0 and len(mds_service.vdisks_guids) == 0:
                MDSServiceController._logger.warning('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Removing'.format(vpool.name, storagerouter.name, number, port))
                try:
                    MDSServiceController.remove_mds_service(mds_service=mds_service,
                                                            reconfigure=True,
                                                            allow_offline=root_client is None)
                except Exception:
                    MDSServiceController._logger.exception('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Failed to remove'.format(vpool.name, storagerouter.name, number, port))
                # The service is dropped from the local list whether or not the removal succeeded
                mds_services.remove(mds_service)
            else:
                _, next_load = MDSServiceController.get_mds_load(mds_service=mds_service)
                if next_load == float('inf'):
                    total_load = sys.maxint * -1  # Cast to lowest possible value if any MDS service capacity is set to infinity
                else:
                    total_load += next_load

                if next_load < max_load:
                    MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: Capacity available - Load at {4}%'.format(vpool.name, storagerouter.name, number, port, next_load))
                else:
                    MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - MDS Service {2} on port {3}: No capacity available - Load at {4}%'.format(vpool.name, storagerouter.name, number, port, next_load))

        service_count = len(mds_services)
        if total_load >= max_load * service_count:
            mds_services_to_add = int(math.ceil((total_load - max_load * service_count) / max_load))
            # All services may have been removed above, in which case this branch is reached with 0 services
            # Report an average of 0 then instead of raising a ZeroDivisionError which would abort the checkup
            average_load = total_load / service_count if service_count > 0 else 0.0
            MDSServiceController._logger.info('vPool {0} - StorageRouter {1} - Average load per service {2:.2f}% - Max load per service {3:.2f}% - {4} MDS service{5} will be added'.format(vpool.name, storagerouter.name, average_load, max_load, mds_services_to_add, '' if mds_services_to_add == 1 else 's'))

            for _ in range(mds_services_to_add):
                MDSServiceController._logger.info('vPool {0} - StorageRouter {1} - Adding new MDS Service'.format(vpool.name, storagerouter.name))
                try:
                    mds_services.append(MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vpool))
                except Exception:
                    MDSServiceController._logger.exception('vPool {0} - StorageRouter {1} - Failed to create new MDS Service'.format(vpool.name, storagerouter.name))

    # After potentially having added new MDSes, retrieve the optimal configuration
    mds_config_set = {}
    try:
        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vpool, offline_nodes=offline_nodes)
        MDSServiceController._logger.debug('vPool {0} - Optimal configuration {1}'.format(vpool.name, mds_config_set))
    except (NotFoundException, RuntimeError):
        MDSServiceController._logger.exception('vPool {0} - Failed to retrieve the optimal configuration'.format(vpool.name))

    # Apply the optimal MDS configuration per StorageDriver
    for storagerouter in sorted(storagerouter_info, key=lambda k: k.ip):
        root_client = storagerouter_info[storagerouter]['client']
        storagedriver = storagerouter_info[storagerouter]['storagedriver']

        if storagedriver is None:
            MDSServiceController._logger.critical('vPool {0} - StorageRouter {1} - No matching StorageDriver found'.format(vpool.name, storagerouter.name))
            continue
        if storagerouter.guid not in mds_config_set:
            MDSServiceController._logger.critical('vPool {0} - StorageRouter {1} - Not marked as offline, but could not retrieve an optimal MDS config'.format(vpool.name, storagerouter.name))
            continue
        if root_client is None:
            MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - Marked as offline, not setting optimal MDS configuration'.format(vpool.name, storagerouter.name))
            continue

        storagedriver_config = StorageDriverConfiguration(vpool_guid=vpool.guid, storagedriver_id=storagedriver.storagedriver_id)
        if storagedriver_config.config_missing is False:
            optimal_mds_config = mds_config_set[storagerouter.guid]
            MDSServiceController._logger.debug('vPool {0} - StorageRouter {1} - Storing optimal MDS configuration: {2}'.format(vpool.name, storagerouter.name, optimal_mds_config))
            # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
            # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
            storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=optimal_mds_config)
            storagedriver_config.save(root_client)

    # Execute a safety check, making sure the master/slave configuration is optimal.
    MDSServiceController._logger.info('vPool {0} - Ensuring safety for all vDisks'.format(vpool.name))
    for vdisk in vpool.vdisks:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
        except Exception:
            # Keep going for the other vDisks, all failures are reported together at the end
            message = 'Ensure safety for vDisk {0} with guid {1} failed'.format(vdisk.name, vdisk.guid)
            MDSServiceController._logger.exception(message)
            ensure_safety_failures.append(message)
    if ensure_safety_failures:
        raise MDSCheckupEnsureSafetyFailures('\n - ' + '\n - '.join(ensure_safety_failures))
def mds_checkup():
    """
    Validates the current MDS setup/configuration and takes actions where required

    Steps:
        * Build a vPool -> StorageRouter mapping holding an SSH client (None when unreachable) and the MDS services
        * Remove MDS services which have capacity 0 and no vDisks
        * Prepare an extra MDS service on reachable StorageRouters on which no service has a load below the maximum
        * Store the MDS configuration in the (already existing) StorageDriver configuration of reachable StorageRouters
        * Run an ensure safety for every vDisk, collecting failures and raising them together at the end
    """
    logger.info('MDS checkup - Started')

    # Phase 1: collect the MDS services per vPool and per StorageRouter
    mds_dict = {}
    for vpool in VPoolList.get_vpools():
        logger.info('MDS checkup - vPool {0}'.format(vpool.name))
        routers = mds_dict[vpool] = {}
        for mds_service in vpool.mds_services:
            storagerouter = mds_service.service.storagerouter
            entry = routers.get(storagerouter)
            if entry is None:
                entry = routers[storagerouter] = {'client': None, 'services': []}
                try:
                    ssh_client = SSHClient(storagerouter, username='******')
                    # Run a trivial command, the client is only stored when it succeeds
                    ssh_client.run('pwd')
                    entry['client'] = ssh_client
                    logger.info('MDS checkup - vPool {0} - Storage Router {1} - ONLINE'.format(vpool.name, storagerouter.name))
                except UnableToConnectException:
                    logger.info('MDS checkup - vPool {0} - Storage Router {1} - OFFLINE'.format(vpool.name, storagerouter.name))
            entry['services'].append(mds_service)

    failures = []
    max_load = EtcdConfiguration.get('/ovs/framework/storagedriver|mds_maxload')
    for vpool, storagerouter_info in mds_dict.iteritems():
        # Phase 2: every StorageRouter needs at least one MDS service which is not overloaded
        for storagerouter, info in storagerouter_info.iteritems():
            client = info['client']
            mds_services = info['services']

            # Drop the services marked for removal; iterate a snapshot since the list shrinks
            for candidate in list(mds_services):
                if candidate.capacity != 0 or len(candidate.vdisks_guids) != 0:
                    continue
                logger.info('MDS checkup - Removing mds_service {0} for vPool {1}'.format(candidate.number, vpool.name))
                MDSServiceController.remove_mds_service(candidate, vpool, reconfigure=True, allow_offline=client is None)
                mds_services.remove(candidate)

            # Stops at the first service with a load below the maximum
            has_room = any(MDSServiceController.get_mds_load(remaining)[1] < max_load for remaining in mds_services)
            logger.info('MDS checkup - vPool {0} - Storage Router {1} - Capacity available: {2}'.format(vpool.name, storagerouter.name, has_room))
            if client is not None and not has_room:
                new_service = MDSServiceController.prepare_mds_service(storagerouter=storagerouter,
                                                                       vpool=vpool,
                                                                       fresh_only=False,
                                                                       reload_config=True)
                if new_service is None:
                    raise RuntimeError('Could not add MDS node')
                mds_services.append(new_service)

        # Phase 3: push the MDS configuration to every reachable StorageRouter
        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool, True)
        for storagerouter, info in storagerouter_info.iteritems():
            client = info['client']
            if client is None:
                logger.info('MDS checkup - vPool {0} - Storage Router {1} - Marked as offline, not setting default MDS configuration'.format(vpool.name, storagerouter.name))
                continue
            storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.name)
            storagedriver_config.load(client)
            if storagedriver_config.is_new is False:
                mds_nodes = mds_config_set[storagerouter.guid]
                logger.info('MDS checkup - vPool {0} - Storage Router {1} - Storing default MDS configuration: {2}'.format(vpool.name, storagerouter.name, mds_nodes))
                storagedriver_config.clean()  # Clean out obsolete values
                storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_nodes)
                storagedriver_config.save(client)

        # Phase 4: safety check per vDisk, making sure the master/slave configuration is optimal
        logger.info('MDS checkup - vPool {0} - Ensuring safety for all virtual disks'.format(vpool.name))
        for vdisk in vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk)
            except Exception as ex:
                failures.append('Ensure safety for vDisk {0} with guid {1} failed with error: {2}'.format(vdisk.name, vdisk.guid, ex))

    if failures:
        raise Exception('\n - ' + '\n - '.join(failures))
    logger.info('MDS checkup - Finished')
def mds_checkup():
    """
    Validate the current MDS setup/configuration of every vPool and take actions where required

    Per vPool:
      * MDS services with capacity 0 and without vDisks are removed
      * An extra MDS service is prepared on every reachable StorageRouter which has no MDS service
        with a load below the configured maximum
      * The default MDS configuration is stored for every reachable StorageRouter whose
        StorageDriver configuration already exists
      * The MDS safety is ensured for every vDisk; failures are collected and raised at the end
    :raises RuntimeError: When an extra MDS service was required but could not be prepared
    :raises Exception: When ensuring the safety failed for at least 1 vDisk
    :return: None
    """
    log = MDSServiceController._logger
    log.info("MDS checkup - Started")

    # Collect per vPool, per StorageRouter: an SSH client (None when unreachable) and its MDS services
    mds_dict = {}
    for vpool in VPoolList.get_vpools():
        log.info("MDS checkup - vPool {0}".format(vpool.name))
        per_router = {}
        mds_dict[vpool] = per_router
        for mds_service in vpool.mds_services:
            storagerouter = mds_service.service.storagerouter
            if storagerouter not in per_router:
                entry = {"client": None, "services": []}
                per_router[storagerouter] = entry
                try:
                    entry["client"] = SSHClient(storagerouter, username="******")
                    log.info("MDS checkup - vPool {0} - Storage Router {1} - ONLINE".format(vpool.name, storagerouter.name))
                except UnableToConnectException:
                    log.info("MDS checkup - vPool {0} - Storage Router {1} - OFFLINE".format(vpool.name, storagerouter.name))
            per_router[storagerouter]["services"].append(mds_service)

    failures = []
    max_load = Configuration.get("/ovs/framework/storagedriver|mds_maxload")
    for vpool, storagerouter_info in mds_dict.iteritems():
        # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
        #    If not, create an extra MDS for that StorageRouter
        for storagerouter in storagerouter_info:
            info = storagerouter_info[storagerouter]
            client = info["client"]
            services = info["services"]

            # Walk over a copy, since obsolete services are dropped from the list while looping
            for candidate in list(services):
                if candidate.capacity == 0 and len(candidate.vdisks_guids) == 0:
                    log.info("MDS checkup - Removing mds_service {0} for vPool {1}".format(candidate.number, vpool.name))
                    MDSServiceController.remove_mds_service(candidate, vpool, reconfigure=True, allow_offline=client is None)
                    services.remove(candidate)

            # The first service below the maximum load is sufficient, no need to probe the others
            for candidate in services:
                _, load = MDSServiceController.get_mds_load(candidate)
                if load < max_load:
                    has_room = True
                    break
            else:
                has_room = False
            log.info("MDS checkup - vPool {0} - Storage Router {1} - Capacity available: {2}".format(vpool.name, storagerouter.name, has_room))

            if client is not None and has_room is False:
                extra_service = MDSServiceController.prepare_mds_service(storagerouter=storagerouter,
                                                                         vpool=vpool,
                                                                         fresh_only=False,
                                                                         reload_config=True)
                if extra_service is None:
                    raise RuntimeError("Could not add MDS node")
                services.append(extra_service)

        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool, True)
        for storagerouter in storagerouter_info:
            client = storagerouter_info[storagerouter]["client"]
            if client is None:
                log.info("MDS checkup - vPool {0} - Storage Router {1} - Marked as offline, not setting default MDS configuration".format(vpool.name, storagerouter.name))
                continue

            # Raises an IndexError when the StorageRouter has no StorageDriver for this vPool
            candidates = [sd for sd in storagerouter.storagedrivers if sd.vpool_guid == vpool.guid]
            storagedriver = candidates[0]
            storagedriver_config = StorageDriverConfiguration("storagedriver", vpool.guid, storagedriver.storagedriver_id)
            storagedriver_config.load()
            if storagedriver_config.is_new is not False:
                continue  # Only an already existing configuration is updated

            mds_nodes = mds_config_set[storagerouter.guid]
            log.info("MDS checkup - vPool {0} - Storage Router {1} - Storing default MDS configuration: {2}".format(vpool.name, storagerouter.name, mds_nodes))
            storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_nodes)
            storagedriver_config.save(client)

        # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
        log.info("MDS checkup - vPool {0} - Ensuring safety for all virtual disks".format(vpool.name))
        for vdisk in vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk)
            except Exception:
                # Keep going for the remaining vDisks, all failures are reported together at the end
                message = "Ensure safety for vDisk {0} with guid {1} failed".format(vdisk.name, vdisk.guid)
                log.exception(message)
                failures.append(message)

    if failures:
        # The leading empty string makes every failure, including the first, start with the separator
        raise Exception("\n - ".join([""] + failures))
    log.info("MDS checkup - Finished")
def configure_storagedriver_service(self):
    """
    Configure the StorageDriver service
    Every section of the StorageDriver configuration is filled in and the result is saved
    using the root client of the StorageRouter installer
    :raises RuntimeError: When no StorageRouterInstaller instance is set or no write caches are known yet
    :return: None
    :rtype: NoneType
    """
    def _generate_queue_urls():
        # 1 AMQP URI for every StorageRouter returned by StorageRouterList.get_masters()
        user = Configuration.get('/ovs/framework/messagequeue|user')
        protocol = Configuration.get('/ovs/framework/messagequeue|protocol')
        password = Configuration.get('/ovs/framework/messagequeue|password')
        uris = []
        for master in StorageRouterList.get_masters():
            uris.append({'amqp_uri': '{0}://{1}:{2}@{3}:5672'.format(protocol, user, password, master.ip)})
        return uris

    def _generate_config_file_system():
        # The DTL related keys depend on the DTL mode, all other keys are fixed
        if self.dtl_mode == StorageDriverClient.FRAMEWORK_DTL_NO_SYNC:
            dtl_settings = {'fs_dtl_config_mode': StorageDriverClient.VOLDRV_DTL_MANUAL_MODE}
        else:
            dtl_settings = {'fs_dtl_mode': StorageDriverClient.VPOOL_DTL_MODE_MAP[self.dtl_mode],
                            'fs_dtl_config_mode': StorageDriverClient.VOLDRV_DTL_AUTOMATIC_MODE}
        fs_config = {'fs_dtl_host': '',
                     'fs_enable_shm_interface': 0,
                     'fs_enable_network_interface': 1,
                     'fs_metadata_backend_arakoon_cluster_nodes': [],
                     'fs_metadata_backend_mds_nodes': [],
                     'fs_metadata_backend_type': 'MDS',
                     'fs_virtual_disk_format': 'raw',
                     'fs_raw_disk_suffix': '.raw',
                     'fs_file_event_rules': [{'fs_file_event_rule_calls': ['Rename'],
                                              'fs_file_event_rule_path_regex': '.*'}]}
        fs_config.update(dtl_settings)
        return fs_config

    def _generate_config_backend_connection_manager():
        # 1 'ALBA' section per ALBA proxy, keyed by its index when ordered on the first port of its service
        def _first_port(alba_proxy):
            return alba_proxy.service.ports[0]

        manager_config = {'backend_type': 'MULTI',
                          'backend_interface_retries_on_error': 5,
                          'backend_interface_retry_interval_secs': 1,
                          'backend_interface_retry_backoff_multiplier': 2.0}
        ordered_proxies = sorted(self.storagedriver.alba_proxies, key=_first_port)
        for position, alba_proxy in enumerate(ordered_proxies):
            manager_config[str(position)] = {'alba_connection_host': self.storagedriver.storage_ip,
                                             'alba_connection_port': alba_proxy.service.ports[0],
                                             'alba_connection_preset': vpool.metadata['backend']['backend_info']['preset'],
                                             'alba_connection_timeout': 30,
                                             'alba_connection_use_rora': True,
                                             'alba_connection_transport': 'TCP',
                                             'alba_connection_rora_manifest_cache_capacity': 25000,
                                             'alba_connection_asd_connection_pool_capacity': 10,
                                             'alba_connection_rora_timeout_msecs': 50,
                                             'backend_type': 'ALBA'}
        return manager_config

    # Prerequisites
    if self.sr_installer is None:
        raise RuntimeError('No StorageRouterInstaller instance found')
    if len(self.write_caches) == 0:
        raise RuntimeError('The StorageDriverPartition junctions have not been created yet')

    # Shared input for the different configuration sections
    vpool = self.vp_installer.vpool  # Also used by _generate_config_backend_connection_manager
    gaps = StorageDriverController.calculate_trigger_and_backoff_gap(cache_size=self.sr_installer.smallest_write_partition_size)
    cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|voldrv'))
    cluster_nodes = []
    for node in ArakoonClusterConfig(cluster_id=cluster_name).nodes:
        cluster_nodes.append({'host': node.ip, 'port': node.client_port, 'node_id': node.name})

    sd_config = StorageDriverConfiguration(vpool.guid, self.storagedriver.storagedriver_id)

    trigger_gap = ExtensionsToolbox.convert_byte_size_to_human_readable(size=gaps['trigger'])
    backoff_gap = ExtensionsToolbox.convert_byte_size_to_human_readable(size=gaps['backoff'])
    sd_config.configure_scocache(scocache_mount_points=self.write_caches,
                                 trigger_gap=trigger_gap,
                                 backoff_gap=backoff_gap)

    sd_config.configure_file_driver(fd_cache_path=self.storagedriver_partition_file_driver.path,
                                    fd_extent_cache_capacity='1024',
                                    fd_namespace='fd-{0}-{1}'.format(vpool.name, vpool.guid))

    # NOTE(review): '/' on 2 integers floors under Python 2 only - confirm the operand types before porting
    sco_multiplier = self.sco_size * 1024 / self.cluster_size
    sd_config.configure_volume_router(vrouter_id=self.storagedriver.storagedriver_id,
                                      vrouter_redirect_timeout_ms='120000',
                                      vrouter_keepalive_time_secs='15',
                                      vrouter_keepalive_interval_secs='5',
                                      vrouter_keepalive_retries='2',
                                      vrouter_routing_retries=10,
                                      vrouter_volume_read_threshold=0,
                                      vrouter_volume_write_threshold=0,
                                      vrouter_file_read_threshold=0,
                                      vrouter_file_write_threshold=0,
                                      vrouter_min_workers=4,
                                      vrouter_max_workers=16,
                                      vrouter_sco_multiplier=sco_multiplier,
                                      vrouter_backend_sync_timeout_ms=60000,
                                      vrouter_migrate_timeout_ms=60000,
                                      vrouter_use_fencing=True)

    non_disposable_factor = float(self.write_buffer) / self.tlog_multiplier / self.sco_size
    sd_config.configure_volume_manager(tlog_path=self.storagedriver_partition_tlogs.path,
                                       metadata_path=self.storagedriver_partition_metadata.path,
                                       clean_interval=1,
                                       dtl_throttle_usecs=4000,
                                       default_cluster_size=self.cluster_size * 1024,
                                       number_of_scos_in_tlog=self.tlog_multiplier,
                                       non_disposable_scos_factor=non_disposable_factor)

    routing_key = Configuration.get('/ovs/framework/messagequeue|queues.storagedriver')
    queue_urls = _generate_queue_urls()
    sd_config.configure_event_publisher(events_amqp_routing_key=routing_key,
                                        events_amqp_uris=queue_urls)

    sd_config.configure_volume_registry(vregistry_arakoon_cluster_id=cluster_name,
                                        vregistry_arakoon_cluster_nodes=cluster_nodes)
    sd_config.configure_network_interface(network_max_neighbour_distance=StorageDriver.DISTANCES.FAR - 1)
    sd_config.configure_threadpool_component(num_threads=16)
    sd_config.configure_volume_router_cluster(vrouter_cluster_id=vpool.guid)
    sd_config.configure_distributed_lock_store(dls_type='Arakoon',
                                               dls_arakoon_cluster_id=cluster_name,
                                               dls_arakoon_cluster_nodes=cluster_nodes)
    sd_config.configure_content_addressed_cache(serialize_read_cache=False,
                                                read_cache_serialization_path=[])
    sd_config.configure_distributed_transaction_log(dtl_path=self.storagedriver_partition_dtl.path,  # Not used, but required
                                                    dtl_transport=StorageDriverClient.VPOOL_DTL_TRANSPORT_MAP[self.dtl_transport])

    filesystem_settings = _generate_config_file_system()
    sd_config.configure_filesystem(**filesystem_settings)
    backend_settings = _generate_config_backend_connection_manager()
    sd_config.configure_backend_connection_manager(**backend_settings)

    sd_config.save(client=self.sr_installer.root_client)