def update_preset(alba_backend_guid, name, policies):
    """
    Updates policies for an existing preset to Alba
    :param alba_backend_guid: Guid of the ALBA backend
    :type alba_backend_guid: str
    :param name: Name of the preset
    :type name: str
    :param policies: New policy list to be sent to alba
    :type policies: list
    :return: None
    """
    # VALIDATIONS
    AlbaPresetController._validate_policies_param(policies=policies)

    alba_backend = AlbaBackend(alba_backend_guid)
    if name not in [preset['name'] for preset in alba_backend.presets]:
        raise RuntimeError('Could not find a preset with name {0} for ALBA Backend {1}'.format(name, alba_backend.name))

    # UPDATE PRESET
    AlbaPresetController._logger.debug('Updating preset {0} with policies {1}'.format(name, policies))
    config = Configuration.get_configuration_path(ArakoonInstaller.CONFIG_KEY.format(AlbaController.get_abm_cluster_name(alba_backend=alba_backend)))
    temp_config_file = tempfile.mktemp()
    with open(temp_config_file, 'wb') as data_file:
        data_file.write(json.dumps({'policies': policies}))
        data_file.flush()
    AlbaCLI.run(command='update-preset', config=config, named_params={'input-url': temp_config_file}, extra_params=[name])
    alba_backend.invalidate_dynamics()
    os.remove(temp_config_file)
def get_load(nsm_cluster):
    """
    Calculates the load of an NSM node, returning a float percentage
    :param nsm_cluster: NSM cluster to retrieve the load for
    :type nsm_cluster: ovs.dal.hybrids.albansmcluster.NSMCluster
    :return: Load of the NSM service
    :rtype: float
    """
    service_capacity = float(nsm_cluster.capacity)
    if service_capacity < 0:
        return 50.0
    if service_capacity == 0:
        return float('inf')

    config = Configuration.get_configuration_path(key=nsm_cluster.alba_backend.abm_cluster.config_location)
    hosts_data = AlbaCLI.run(command='list-nsm-hosts', config=config)
    try:
        host = [host for host in hosts_data if host['id'] == nsm_cluster.name][0]
    except IndexError:
        raise ValueError('No host data could be retrieved from Alba for NSM cluster {0}'.format(nsm_cluster.name))
    usage = host['namespaces_count']
    return round(usage / service_capacity * 100.0, 5)
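# --- Hedged usage sketch (not part of the original module) ---
# Reproduces the load calculation from get_load() above with plain values, so
# the thresholds are easy to verify: a negative capacity is reported as a flat
# 50%, a zero capacity as infinite load, and otherwise the load is the
# namespace count as a percentage of the capacity.
def _load_sketch(namespaces_count, capacity):
    capacity = float(capacity)
    if capacity < 0:
        return 50.0
    if capacity == 0:
        return float('inf')
    return round(namespaces_count / capacity * 100.0, 5)

assert _load_sketch(10, 50) == 20.0         # 10 of 50 namespace slots used
assert _load_sketch(10, -1) == 50.0         # "unknown capacity" convention
assert _load_sketch(10, 0) == float('inf')  # no capacity at all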
def update_preset(alba_backend_guid, name, policies):
    """
    Updates policies for an existing preset to Alba
    :param alba_backend_guid: Guid of the ALBA backend
    :type alba_backend_guid: str
    :param name: Name of preset
    :type name: str
    :param policies: New policy list to be sent to alba
    :type policies: list
    :return: None
    """
    # VALIDATIONS
    AlbaPresetController._validate_policies_param(policies=policies)

    alba_backend = AlbaBackend(alba_backend_guid)
    if name not in [preset['name'] for preset in alba_backend.presets]:
        raise RuntimeError('Could not find a preset with name {0} for ALBA Backend {1}'.format(name, alba_backend.name))

    # UPDATE PRESET
    AlbaPresetController._logger.debug('Updating preset {0} with policies {1}'.format(name, policies))
    config = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
    temp_config_file = tempfile.mktemp()
    with open(temp_config_file, 'wb') as data_file:
        data_file.write(json.dumps({'policies': policies}))
        data_file.flush()
    AlbaCLI.run(command='update-preset', config=config, named_params={'input-url': temp_config_file}, extra_params=[name])
    alba_backend.invalidate_dynamics()
    os.remove(temp_config_file)
def check_for_halted_volumes(result_handler):
    """
    Checks for halted volumes on a single or multiple vPools
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    result_handler.info('Checking for halted volumes.', add_to_result=False)
    vpools = VPoolHelper.get_vpools()
    if len(vpools) == 0:
        result_handler.skip('No vPools found!')
        return
    for vp in vpools:
        if vp.guid not in VolumedriverHealthCheck.LOCAL_SR.vpools_guids:
            result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
            continue
        haltedvolumes = []
        result_handler.info('Checking vPool {0}: '.format(vp.name), add_to_result=False)
        if len(vp.storagedrivers) > 0:
            config_file = Configuration.get_configuration_path('/ovs/vpools/{0}/hosts/{1}/config'.format(vp.guid, vp.storagedrivers[0].name))
        else:
            result_handler.failure('The vPool {0} does not have any storagedrivers associated to it!'.format(vp.name))
            continue
        try:
            voldrv_client = src.LocalStorageRouterClient(config_file)
            # noinspection PyArgumentList
            voldrv_volume_list = voldrv_client.list_volumes()
            for volume in voldrv_volume_list:
                # Check whether the volume is halted, returns: 0 or 1
                try:
                    # noinspection PyTypeChecker
                    if int(VolumedriverHealthCheck._info_volume(voldrv_client, volume).halted):
                        haltedvolumes.append(volume)
                except ObjectNotFoundException:
                    # Ignore invalid ovsdb entries; model consistency will handle it.
                    continue
                except MaxRedirectsExceededException:
                    # This means the volume is not halted but detached or unreachable for the volumedriver
                    haltedvolumes.append(volume)
                except RuntimeError:
                    haltedvolumes.append(volume)
                except TimeoutError:
                    # A timeout occurred
                    haltedvolumes.append(volume)
            result_handler.success('Volumedriver {0} is up and running.'.format(vp.name))
        except (ClusterNotReachableException, RuntimeError) as ex:
            result_handler.failure('Seems like the Volumedriver {0} is not running: {1}'.format(vp.name, ex.message))
            continue
        # Print all results
        if len(haltedvolumes) > 0:
            result_handler.failure('Detected volumes that are HALTED in vPool {0}: {1}'.format(vp.name, ', '.join(haltedvolumes)))
        else:
            result_handler.success('No halted volumes detected in vPool {0}'.format(vp.name))
def _presets(self):
    """
    Returns the policies active on the node
    """
    if len(self.abm_services) == 0:
        return []  # No ABM services yet, so backend not fully installed yet

    asds = {}
    if self.scaling != AlbaBackend.SCALINGS.GLOBAL:
        for node in AlbaNodeList.get_albanodes():
            asds[node.node_id] = 0
            for disk in self.local_stack[node.node_id].values():
                for asd_info in disk['asds'].values():
                    if asd_info['status'] in ['claimed', 'warning']:
                        asds[node.node_id] += 1
    config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
    presets = AlbaCLI.run(command='list-presets', config=config)
    preset_dict = {}
    for preset in presets:
        preset_dict[preset['name']] = preset
        if 'in_use' not in preset:
            preset['in_use'] = True
        if 'is_default' not in preset:
            preset['is_default'] = False
        preset['is_available'] = False
        preset['policies'] = [tuple(policy) for policy in preset['policies']]
        preset['policy_metadata'] = {}
        active_policy = None
        for policy in preset['policies']:
            is_available = False
            available_disks = 0
            if self.scaling != AlbaBackend.SCALINGS.GLOBAL:
                available_disks += sum(min(asds[node], policy[3]) for node in asds)
            if self.scaling != AlbaBackend.SCALINGS.LOCAL:
                available_disks += sum(self.local_summary['devices'].values())
            if available_disks >= policy[2]:
                if active_policy is None:
                    active_policy = policy
                is_available = True
            preset['policy_metadata'][policy] = {'is_active': False, 'in_use': False, 'is_available': is_available}
            preset['is_available'] |= is_available
        if active_policy is not None:
            preset['policy_metadata'][active_policy]['is_active'] = True
    for namespace in self.ns_data:
        if namespace['namespace']['state'] != 'active':
            continue
        policy_usage = namespace['statistics']['bucket_count']
        preset = preset_dict[namespace['namespace']['preset_name']]
        for usage in policy_usage:
            upolicy = tuple(usage[0])  # Policy as reported to be "in use"
            for cpolicy in preset['policies']:  # All configured policies
                if upolicy[0] == cpolicy[0] and upolicy[1] == cpolicy[1] and upolicy[3] <= cpolicy[3]:
                    preset['policy_metadata'][cpolicy]['in_use'] = True
                    break
    for preset in presets:
        preset['policies'] = [str(policy) for policy in preset['policies']]
        for key in preset['policy_metadata'].keys():
            preset['policy_metadata'][str(key)] = preset['policy_metadata'][key]
            del preset['policy_metadata'][key]
    return presets
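# --- Hedged illustration (not part of the original module) ---
# The availability rule in _presets() above treats a policy as a (k, m, c, x)
# tuple: per node at most x (policy[3]) ASDs count towards the total, and the
# policy is available once the summed total reaches c (policy[2]). The ASD
# counts below are made up.
asds = {'node1': 4, 'node2': 2}      # claimed/warning ASDs per ALBA node
policy = (2, 1, 3, 3)                # k=2, m=1, c=3, x=3
available_disks = sum(min(count, policy[3]) for count in asds.values())
assert available_disks == 5          # node1 capped at x=3, node2 contributes 2
assert available_disks >= policy[2]  # c=3 -> this policy is available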
def _osd_statistics(self):
    """
    Loads statistics from all of its ASDs in one call
    """
    from ovs.dal.hybrids.albaosd import AlbaOSD

    statistics = {}
    if self.abm_cluster is None:
        return statistics  # No ABM cluster yet, so backend not fully installed yet
    osd_ids = [osd.osd_id for osd in self.osds if osd.osd_type in [AlbaOSD.OSD_TYPES.ASD, AlbaOSD.OSD_TYPES.AD]]
    if len(osd_ids) == 0:
        return statistics
    try:
        config = Configuration.get_configuration_path(self.abm_cluster.config_location)
        # TODO: This will need to be changed to osd-multistatistics, see openvstorage/alba#749
        raw_statistics = AlbaCLI.run(command='asd-multistatistics', config=config, named_params={'long-id': ','.join(osd_ids)})
    except RuntimeError:
        return statistics
    if raw_statistics:
        for osd_id, stats in raw_statistics.iteritems():
            if stats['success'] is True:
                statistics[osd_id] = stats['result']
    return statistics
def __init__(self, vpool_guid, storagedriver_id):
    """
    Initializes the class
    """
    _log_level = LOG_LEVEL_MAPPING[OVSLogger('extensions').getEffectiveLevel()]
    # noinspection PyCallByClass,PyTypeChecker
    storagerouterclient.Logger.setupLogging(OVSLogger.load_path('storagerouterclient'), _log_level)
    # noinspection PyArgumentList
    storagerouterclient.Logger.enableLogging()

    self._key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool_guid, storagedriver_id)
    self._logger = OVSLogger('extensions')
    self._dirty_entries = []

    self.remote_path = Configuration.get_configuration_path(self._key).strip('/')
    # Load configuration
    if Configuration.exists(self._key):
        self.configuration = Configuration.get(self._key)
        self.config_missing = False
    else:
        self.configuration = {}
        self.config_missing = True
        self._logger.debug('Could not find config {0}, a new one will be created'.format(self._key))
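# --- Hedged usage sketch (not part of the original module) ---
# Shows how the configuration key used by the constructor above is composed;
# the guid and storagedriver id below are made-up example values. The
# resulting key is what Configuration.get_configuration_path() resolves to a
# URL/path that external processes (storagedriver, ALBA tooling) can read.
vpool_guid = 'a1b2c3d4-0000-0000-0000-000000000000'  # hypothetical
storagedriver_id = 'myvpool01ABC123'                 # hypothetical
key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool_guid, storagedriver_id)
assert key.startswith('/ovs/vpools/') and key.endswith('/config')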
def delete_preset(alba_backend_guid, name):
    """
    Deletes a preset from the Alba backend
    :param alba_backend_guid: Guid of the ALBA backend
    :type alba_backend_guid: str
    :param name: Name of the preset
    :type name: str
    :return: None
    """
    # VALIDATIONS
    alba_backend = AlbaBackend(alba_backend_guid)
    preset_default_map = dict((preset['name'], preset['is_default']) for preset in alba_backend.presets)
    if name not in preset_default_map:
        AlbaPresetController._logger.warning('Preset with name {0} for ALBA Backend {1} could not be found, so not deleting'.format(name, alba_backend.name))
        return

    if preset_default_map[name] is True:
        raise RuntimeError('Cannot delete the default preset')

    # DELETE PRESET
    AlbaPresetController._logger.debug('Deleting preset {0}'.format(name))
    config = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
    AlbaCLI.run(command='delete-preset', config=config, extra_params=[name])
    alba_backend.invalidate_dynamics()
def get_abm_config(alba_backend):
    """
    Retrieve the configuration string to pass to the ALBA CLI
    :param alba_backend: ALBA backend
    :return: Configuration string
    """
    service_name = alba_backend.abm_services[0].service.name
    return ['--config',
            Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(service_name.replace('arakoon-', '')))]
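# --- Hedged usage sketch (not part of the original module) ---
# Illustrates how the ['--config', <path>] pair returned by get_abm_config()
# could be spliced into an alba CLI invocation; the command construction here
# is an assumption, only the argument shape mirrors the function above.
import subprocess

def _run_alba_sketch(command, abm_config, extra_params=None):
    # abm_config is e.g. ['--config', '<resolved arakoon config url>']
    cmd = ['alba', command] + list(abm_config) + list(extra_params or [])
    return subprocess.check_output(cmd)

# _run_alba_sketch('list-namespaces', get_abm_config(alba_backend))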
def _ns_data(self):
    """
    Loads namespace data
    """
    if len(self.abm_services) == 0:
        return []  # No ABM services yet, so backend not fully installed yet
    config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
    return AlbaCLI.run(command='show-namespaces', config=config, named_params={'max': -1})[1]
def get_stats_nsms(cls):
    """
    Retrieve the amount of NSMs deployed and their statistics
    """
    if cls._config is None:
        cls.validate_and_retrieve_config()

    stats = []
    errors = False
    environment = cls._config['environment']
    for alba_backend in AlbaBackendList.get_albabackends():
        for nsm in alba_backend.nsm_clusters:
            stats.append({'tags': {'nsm_number': nsm.number,
                                   'environment': environment,
                                   'backend_name': alba_backend.name,
                                   'abm_service_name': alba_backend.abm_cluster.name},
                          'fields': {'load': float(AlbaArakoonController.get_load(nsm))},
                          'measurement': 'nsm'})

        config_path = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
        try:
            nsm_host_ids = [nsm_host['id'] for nsm_host in AlbaCLI.run(command='list-nsm-hosts', config=config_path)]
            nsm_hosts_statistics = AlbaCLI.run(command='nsm-hosts-statistics', config=config_path, named_params={'nsm-hosts': ','.join(nsm_host_ids)})
            for nsm_host_id, statistics in nsm_hosts_statistics.iteritems():
                stats.append({'tags': {'nsm_name': nsm_host_id,
                                       'environment': environment,
                                       'backend_name': alba_backend.name},
                              'fields': cls._convert_to_float_values(statistics['statistics']),
                              'measurement': 'nsm_statistic'})
        except Exception:
            errors = True
            cls._logger.exception('Retrieving NSM statistics for ALBA Backend {0} failed'.format(alba_backend.name))
    return errors, stats
def check_backends(result_handler):
    """
    Checks Alba as a whole
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    result_handler.info('Checking available ALBA backends.', add_to_result=False)
    try:
        alba_backends = AlbaHealthCheck._get_all_responding_backends(result_handler)
        if len(alba_backends) == 0:
            return result_handler.skip('No backends found.')

        result_handler.success('We found {0} backend(s)!'.format(len(alba_backends)))
        result_handler.info('Checking the ALBA ASDs.', add_to_result=False)
        for backend in alba_backends:
            backend_name = backend['name']
            # Check disks of backend, ignore global backends
            if backend['type'] != 'LOCAL':
                result_handler.skip('Alba backend {0} is a global backend.'.format(backend_name), add_to_result=False)
                continue

            config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(backend_name))
            try:
                result_disks = AlbaHealthCheck._check_backend_asds(result_handler, backend['disks'], backend_name, config)
            except Exception:
                result_handler.warning('Could not fetch the asd information for alba backend {0}'.format(backend_name))
                continue
            working_disks = result_disks['working']
            defective_disks = result_disks['broken']
            # Check if backend is available for vPool use
            if backend['is_available_for_vpool']:
                if len(defective_disks) == 0:
                    result_handler.success('Alba backend {0} should be available for VPool use. All asds are working fine!'.format(backend_name))
                else:
                    result_handler.warning('Alba backend {0} should be available for VPool use with {1} asds, but there are {2} defective asds: {3}'
                                           .format(backend_name, len(working_disks), len(defective_disks), ', '.join(defective_disks)),
                                           code=ErrorCodes.osd_defective)
            else:
                if len(working_disks) == 0 and len(defective_disks) == 0:
                    result_handler.skip('Alba backend {0} is not available for vPool use, there are no asds assigned to this backend!'.format(backend_name))
                else:
                    result_handler.failure('Alba backend {0} is not available for vPool use, preset requirements not satisfied! There are {1} working asds AND {2} '
                                           'defective asds!'.format(backend_name, len(working_disks), len(defective_disks)),
                                           code=ErrorCodes.osd_defective_unsatisfiable)
    except NotFoundException as ex:
        result_handler.failure('Failed to fetch the object with exception: {0}'.format(ex), code=ErrorCodes.configuration_not_found)
    except ConnectionFailedException as ex:
        result_handler.failure('Failed to connect to configuration master with exception: {0}'.format(ex), code=ErrorCodes.arakoon_connection_failure)
    except (ArakoonNotFound, ArakoonNoMaster, ArakoonNoMasterResult) as e:
        result_handler.failure('Seems like an Arakoon has some problems: {0}'.format(str(e)), code=ErrorCodes.arakoon_problems)
def _ns_data(self):
    """
    Loads namespace data
    """
    if self.abm_cluster is None:
        return []  # No ABM cluster yet, so backend not fully installed yet
    config = Configuration.get_configuration_path(self.abm_cluster.config_location)
    return AlbaCLI.run(command='show-namespaces', config=config, named_params={'max': -1})[1]
def _deploy(config, filesystem, offline_nodes=None, plugins=None, delay_service_registration=False):
    """
    Deploys a complete cluster: Distributing the configuration files, creating directories and services
    """
    if os.environ.get('RUNNING_UNITTESTS') == 'True':
        if filesystem is True:
            raise NotImplementedError('At this moment, there is no support for unit-testing filesystem backend Arakoon clusters')

    ArakoonInstaller._logger.debug('Deploying cluster {0}'.format(config.cluster_id))
    if offline_nodes is None:
        offline_nodes = []

    service_metadata = {}
    for node in config.nodes:
        if node.ip in offline_nodes:
            continue
        ArakoonInstaller._logger.debug(' Deploying cluster {0} on {1}'.format(config.cluster_id, node.ip))
        root_client = SSHClient(node.ip, username='******')

        # Distributes a configuration file to all its nodes
        config.write_config(node.ip)

        # Create dirs as root because mountpoint /mnt/cache1 is typically owned by root
        abs_paths = {node.tlog_dir, node.home}  # That's a set
        if node.log_sinks.startswith('/'):
            abs_paths.add(os.path.dirname(os.path.abspath(node.log_sinks)))
        if node.crash_log_sinks.startswith('/'):
            abs_paths.add(os.path.dirname(os.path.abspath(node.crash_log_sinks)))
        abs_paths = list(abs_paths)
        root_client.dir_create(abs_paths)
        root_client.dir_chmod(abs_paths, 0755, recursive=True)
        root_client.dir_chown(abs_paths, 'ovs', 'ovs', recursive=True)

        # Creates services for/on all nodes in the config
        if config.filesystem is True:
            config_path = config.config_path
        else:
            config_path = Configuration.get_configuration_path(config.config_path)
        extra_version_cmd = ''
        if plugins is not None:
            extra_version_cmd = ';'.join(plugins)
        metadata = ServiceManager.add_service(name='ovs-arakoon',
                                              client=root_client,
                                              params={'CLUSTER': config.cluster_id,
                                                      'NODE_ID': node.name,
                                                      'CONFIG_PATH': config_path,
                                                      'EXTRA_VERSION_CMD': extra_version_cmd},
                                              target_name='ovs-arakoon-{0}'.format(config.cluster_id),
                                              startup_dependency=('ovs-watcher-config' if filesystem is False else None),
                                              delay_registration=delay_service_registration)
        service_metadata[node.ip] = metadata
        ArakoonInstaller._logger.debug(' Deploying cluster {0} on {1} completed'.format(config.cluster_id, node.ip))
    return service_metadata
def _generate_proxy_config(proxy_type, proxy_service):
    proxy_config = {'log_level': 'info',
                    'port': proxy_service.service.ports[0] if proxy_type == 'main' else 0,
                    'ips': [self.storagedriver.storage_ip] if proxy_type == 'main' else ['127.0.0.1'],
                    'manifest_cache_size': manifest_cache_size,
                    'fragment_cache': fragment_cache_main_proxy if proxy_type == 'main' else fragment_cache_scrub_proxy,
                    'transport': 'tcp',
                    'read_preference': read_preferences,
                    'albamgr_cfg_url': Configuration.get_configuration_path(config_tree.format('abm'))}
    if self.sr_installer.block_cache_supported:
        proxy_config['block_cache'] = block_cache_main_proxy if proxy_type == 'main' else block_cache_scrub_proxy
    return proxy_config
def restart_cluster_add(cluster_name, current_ips, new_ip, filesystem):
    """
    Execute a (re)start sequence after adding a new node to a cluster.
    :param cluster_name: Name of the cluster to restart
    :type cluster_name: str
    :param current_ips: IPs of the previous nodes
    :type current_ips: list
    :param new_ip: IP of the newly added node
    :type new_ip: str
    :param filesystem: Indicates whether the configuration should be on the filesystem or in a configuration cluster
    :type filesystem: bool
    :return: None
    """
    ArakoonInstaller._logger.debug('Restart sequence (add) for {0}'.format(cluster_name))
    ArakoonInstaller._logger.debug('Current ips: {0}'.format(', '.join(current_ips)))
    ArakoonInstaller._logger.debug('New ip: {0}'.format(new_ip))

    client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
    if ArakoonInstaller.is_running(cluster_name, client):
        ArakoonInstaller._logger.info('Arakoon service for {0} is already running'.format(cluster_name))
        return
    config = ArakoonClusterConfig(cluster_name, filesystem)
    config.load_config(new_ip)
    arakoon_client = ArakoonInstaller.build_client(config)

    if len(config.nodes) > 1:
        ArakoonInstaller._logger.debug('Catching up new node {0} for cluster {1}'.format(new_ip, cluster_name))
        node_name = [node.name for node in config.nodes if node.ip == new_ip][0]
        if filesystem is True:
            config_path = config.config_path
        else:
            config_path = Configuration.get_configuration_path(config.config_path)
        client.run(['arakoon', '--node', node_name, '-config', config_path, '-catchup-only'])
        ArakoonInstaller._logger.debug('Catching up new node {0} for cluster {1} completed'.format(new_ip, cluster_name))

    threshold = 2 if new_ip in current_ips else 1
    for ip in current_ips:
        if ip == new_ip:
            continue
        current_client = SSHClient(ip, username='******')
        ArakoonInstaller.stop(cluster_name, client=current_client)
        ArakoonInstaller.start(cluster_name, client=current_client)
        ArakoonInstaller._logger.debug(' Restarted node {0} for cluster {1}'.format(current_client.ip, cluster_name))
        if len(current_ips) > threshold:  # A two node cluster needs all nodes running
            ArakoonInstaller.wait_for_cluster(cluster_name, ip, filesystem)

    client = SSHClient(new_ip, username='******')
    ArakoonInstaller.start(cluster_name, client=client)
    ArakoonInstaller.wait_for_cluster(cluster_name, new_ip, filesystem)
    arakoon_client.set(ArakoonInstaller.INTERNAL_CONFIG_KEY, config.export_ini())
    ArakoonInstaller._logger.debug('Started node {0} for cluster {1}'.format(new_ip, cluster_name))
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs
    :return: None
    """
    ScheduledTaskController._logger.info('Starting arakoon collapse')
    storagerouters = StorageRouterList.get_storagerouters()
    cluster_info = [('cacc', storagerouters[0], True)]
    cluster_names = []
    for service in ServiceList.get_services():
        if service.is_internal is True and service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                                                 ServiceType.SERVICE_TYPES.NS_MGR,
                                                                 ServiceType.SERVICE_TYPES.ALBA_MGR):
            cluster = service.name.replace('arakoon-', '')
            if cluster in cluster_names:
                continue
            cluster_names.append(cluster)
            cluster_info.append((cluster, service.storagerouter, False))
    workload = {}
    for cluster, storagerouter, filesystem in cluster_info:
        ScheduledTaskController._logger.debug(' Collecting info for cluster {0}'.format(cluster))
        config = ArakoonClusterConfig(cluster, filesystem=filesystem)
        config.load_config(storagerouter.ip)
        for node in config.nodes:
            if node.ip not in workload:
                workload[node.ip] = {'node_id': node.name,
                                     'clusters': []}
            workload[node.ip]['clusters'].append((cluster, filesystem))
    for storagerouter in storagerouters:
        try:
            if storagerouter.ip not in workload:
                continue
            node_workload = workload[storagerouter.ip]
            client = SSHClient(storagerouter)
            for cluster, filesystem in node_workload['clusters']:
                try:
                    ScheduledTaskController._logger.debug(' Collapsing cluster {0} on {1}'.format(cluster, storagerouter.ip))
                    if filesystem is True:
                        config_path = ArakoonClusterConfig.CONFIG_FILE.format(cluster)
                    else:
                        config_path = Configuration.get_configuration_path(ArakoonClusterConfig.CONFIG_KEY.format(cluster))
                    client.run(['arakoon', '--collapse-local', node_workload['node_id'], '2', '-config', config_path])
                    ScheduledTaskController._logger.info(' Collapsing cluster {0} on {1} completed'.format(cluster, storagerouter.ip))
                except:
                    ScheduledTaskController._logger.exception(' Collapsing cluster {0} on {1} failed'.format(cluster, storagerouter.ip))
        except UnableToConnectException:
            ScheduledTaskController._logger.error(' Could not collapse any cluster on {0} (not reachable)'.format(storagerouter.name))
    ScheduledTaskController._logger.info('Arakoon collapse finished')
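# --- Hedged illustration (not part of the original module) ---
# Shape of the `workload` mapping built in collapse_arakoon() above, with
# made-up IPs, node ids and cluster names. Every (cluster, filesystem) pair
# is later turned into one `arakoon --collapse-local <node_id> 2 -config
# <config_path>` call over SSH on that node.
workload = {'10.100.1.1': {'node_id': 'abc123node',
                           'clusters': [('cacc', True), ('mybackend-abm', False)]},
            '10.100.1.2': {'node_id': 'def456node',
                           'clusters': [('mybackend-nsm_0', False)]}}
total_collapses = sum(len(info['clusters']) for info in workload.values())
assert total_collapses == 3  # one collapse command per cluster per node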
def get_stats_alba_backends(cls):
    """
    Retrieve statistics about all ALBA Backends and their maintenance work
    """
    if cls._config is None:
        cls.validate_and_retrieve_config()

    stats = []
    errors = False
    environment = cls._config['environment']
    for alba_backend in AlbaBackendList.get_albabackends():
        try:
            local_summary = alba_backend.local_summary
            sizes = local_summary['sizes']
            devices = local_summary['devices']
            stats.append({'tags': {'environment': environment,
                                   'backend_name': alba_backend.name},
                          'fields': {'red': int(devices['red']),
                                     'free': float(sizes['size'] - sizes['used']),
                                     'used': float(sizes['used']),
                                     'green': int(devices['green']),
                                     'orange': int(devices['orange']),
                                     'maintenance_work': int(AlbaCLI.run(command='list-work',
                                                                         config=Configuration.get_configuration_path(alba_backend.abm_cluster.config_location))['count'])},
                          'measurement': 'backend'})
        except Exception:
            errors = True
            cls._logger.exception('Retrieving statistics for ALBA Backend {0} failed'.format(alba_backend.name))
    return errors, stats
def _deploy(config, filesystem, offline_nodes=None):
    """
    Deploys a complete cluster: Distributing the configuration files, creating directories and services
    """
    if os.environ.get('RUNNING_UNITTESTS') == 'True':
        if filesystem is True:
            raise NotImplementedError('At this moment, there is no support for unittesting filesystem backend Arakoon clusters')

    ArakoonInstaller._logger.debug('Deploying cluster {0}'.format(config.cluster_id))
    if offline_nodes is None:
        offline_nodes = []
    for node in config.nodes:
        if node.ip in offline_nodes:
            continue
        ArakoonInstaller._logger.debug(' Deploying cluster {0} on {1}'.format(config.cluster_id, node.ip))
        root_client = SSHClient(node.ip, username='******')

        # Distributes a configuration file to all its nodes
        config.write_config(node.ip)

        # Create dirs as root because mountpoint /mnt/cache1 is typically owned by root
        abs_paths = {node.tlog_dir, node.home}  # That's a set
        if node.log_sinks.startswith('/'):
            abs_paths.add(os.path.dirname(os.path.abspath(node.log_sinks)))
        if node.crash_log_sinks.startswith('/'):
            abs_paths.add(os.path.dirname(os.path.abspath(node.crash_log_sinks)))
        abs_paths = list(abs_paths)
        root_client.dir_create(abs_paths)
        root_client.dir_chmod(abs_paths, 0755, recursive=True)
        root_client.dir_chown(abs_paths, 'ovs', 'ovs', recursive=True)

        # Creates services for/on all nodes in the config
        if config.filesystem is True:
            config_path = config.config_path
        else:
            config_path = Configuration.get_configuration_path(config.config_path)
        base_name = 'ovs-arakoon'
        target_name = 'ovs-arakoon-{0}'.format(config.cluster_id)
        ServiceManager.add_service(base_name, root_client,
                                   params={'CLUSTER': config.cluster_id,
                                           'NODE_ID': node.name,
                                           'CONFIG_PATH': config_path,
                                           'STARTUP_DEPENDENCY': 'started ovs-watcher-config' if filesystem is False else '(local-filesystems and started networking)'},
                                   target_name=target_name)
        ArakoonInstaller._logger.debug(' Deploying cluster {0} on {1} completed'.format(config.cluster_id, node.ip))
def __init__(self, config_type, vpool_guid, storagedriver_id):
    """
    Initializes the class
    """
    if config_type != 'storagedriver':
        raise RuntimeError('Invalid configuration type. Allowed: storagedriver')

    storagerouterclient.Logger.setupLogging(LogHandler.load_path('storagerouterclient'))
    # noinspection PyArgumentList
    storagerouterclient.Logger.enableLogging()

    self._logger = LogHandler.get('extensions', name='storagedriver')
    self.config_type = config_type
    self.configuration = {}
    self.key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool_guid, storagedriver_id)
    self.remote_path = Configuration.get_configuration_path(self.key).strip('/')
    self.is_new = True
    self.dirty_entries = []
def ovs_4509_validate_arakoon_collapse_test():
    """
    Validate arakoon collapse
    """
    node_ips = [sr.ip for sr in GeneralStorageRouter.get_storage_routers()]
    node_ips.sort()
    for node_ip in node_ips:
        root_client = SSHClient(node_ip, username='******')
        arakoon_clusters = []
        for service in ServiceList.get_services():
            if service.is_internal is True and service.storagerouter.ip == node_ip and \
                    service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                          ServiceType.SERVICE_TYPES.NS_MGR,
                                          ServiceType.SERVICE_TYPES.ALBA_MGR):
                arakoon_clusters.append(service.name.replace('arakoon-', ''))

        for arakoon_cluster in arakoon_clusters:
            arakoon_config_path = Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(arakoon_cluster))
            tlog_location = '/opt/OpenvStorage/db/arakoon/{0}/tlogs'.format(arakoon_cluster)

            # read_tlog_dir
            with remote(node_ip, [Configuration]) as rem:
                config_contents = rem.Configuration.get('/ovs/arakoon/{0}/config'.format(arakoon_cluster), raw=True)
            for line in config_contents.splitlines():
                if 'tlog_dir' in line:
                    tlog_location = line.split()[-1]

            nr_of_tlogs = TestArakoon.get_nr_of_tlogs_in_folder(root_client, tlog_location)
            old_headdb_timestamp = 0
            if root_client.file_exists('/'.join([tlog_location, 'head.db'])):
                old_headdb_timestamp = root_client.run(['stat', '--format=%Y', tlog_location + '/head.db'])
            if nr_of_tlogs <= 2:
                benchmark_command = ['arakoon', '--benchmark', '-n_clients', '1', '-max_n', '5_000', '-config', arakoon_config_path]
                root_client.run(benchmark_command)

            GenericController.collapse_arakoon()

            nr_of_tlogs = TestArakoon.get_nr_of_tlogs_in_folder(root_client, tlog_location)
            new_headdb_timestamp = root_client.run(['stat', '--format=%Y', tlog_location + '/head.db'])
            assert nr_of_tlogs <= 2, \
                'Arakoon collapse left {0} tlogs on the environment, expecting 2 at most'.format(nr_of_tlogs)
            assert old_headdb_timestamp != new_headdb_timestamp, \
                'Timestamp of the head.db file was not changed in the process of collapsing tlogs'
def _fill_slot(cls, node, slot_id, extra):
    # type: (AlbaNode, str, any) -> List[dict]
    """
    Fills in the slots with ASDs and checks if the BACKEND role needs to be added
    :param node: The AlbaNode to fill on
    :type node: AlbaNode
    :param slot_id: ID of the slot to fill (which is an alias of the slot)
    :type slot_id: str
    :param extra: Extra information for filling
    :type extra: any
    :return: Information about the created osds
    :rtype: List[dict]
    """
    if node.type == AlbaNode.NODE_TYPES.S3:
        extra = extra.copy()
        try:
            s3_transaction_cluster = S3TransactionClusterList.get_s3_transaction_clusters()[0]
            extra['transaction_arakoon_url'] = Configuration.get_configuration_path(key=s3_transaction_cluster.config_location)
        except IndexError:
            raise RuntimeError('No transaction arakoon was deployed for this cluster!')
    created_osds = node.client.fill_slot(slot_id=slot_id, extra=extra)
    cls._logger.info(created_osds)
    # Sync model
    if node.storagerouter is not None:
        stack = node.client.get_stack()  # type: dict
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
        slot_information = stack.get(slot_id, {})
        slot_aliases = slot_information.get('aliases', [])
        for disk in node.storagerouter.disks:
            if set(disk.aliases).intersection(set(slot_aliases)):
                partition = disk.partitions[0]
                if DiskPartition.ROLES.BACKEND not in partition.roles:
                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                    partition.save()
    return created_osds or []  # Always return a list
def _generate_proxy_cache_config(cache_settings, cache_type, proxy_index):
    if cache_settings['read'] is False and cache_settings['write'] is False:
        return ['none']

    if cache_settings['is_backend'] is True:
        cfg_tree_name = 'abm_bc' if cache_type == StorageDriverConfiguration.CACHE_BLOCK else 'abm_aa'
        return ['alba', {'cache_on_read': cache_settings['read'],
                         'cache_on_write': cache_settings['write'],
                         'albamgr_cfg_url': Configuration.get_configuration_path(config_tree.format(cfg_tree_name)),
                         'bucket_strategy': ['1-to-1', {'prefix': vpool.guid,
                                                        'preset': cache_settings['backend_info']['preset']}],
                         'manifest_cache_size': manifest_cache_size}]

    if cache_type == StorageDriverConfiguration.CACHE_BLOCK:
        path = '{0}/bc'.format(self.storagedriver_partitions_caches[proxy_index].path)
    else:
        path = '{0}/fc'.format(self.storagedriver_partitions_caches[proxy_index].path)
    return ['local', {'path': path,
                      'max_size': self.cache_size_local / self.sr_installer.requested_local_proxies,
                      'cache_on_read': cache_settings['read'],
                      'cache_on_write': cache_settings['write']}]
def _asd_statistics(self):
    """
    Loads statistics from all of its ASDs in one call
    """
    from ovs.dal.hybrids.albaosd import AlbaOSD

    statistics = {}
    if len(self.abm_services) == 0:
        return statistics  # No ABM services yet, so backend not fully installed yet
    asd_ids = [osd.osd_id for osd in self.osds if osd.osd_type == AlbaOSD.OSD_TYPES.ASD]
    if len(asd_ids) == 0:
        return statistics
    try:
        config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
        raw_statistics = AlbaCLI.run(command='asd-multistatistics', config=config, named_params={'long-id': ','.join(asd_ids)})
    except RuntimeError:
        return statistics
    for asd_id, stats in raw_statistics.iteritems():
        if stats['success'] is True:
            statistics[asd_id] = stats['result']
    return statistics
def _usages(self):
    """
    Returns an overview of free space, total space and used space
    """
    # Collect total usage
    usages = {'free': 0.0, 'size': 0.0, 'used': 0.0}

    if self.abm_cluster is None:
        return usages

    config = Configuration.get_configuration_path(self.abm_cluster.config_location)
    try:
        osds_stats = AlbaCLI.run(command='list-osds', config=config)
    except AlbaError:
        self._logger.exception('Unable to fetch OSD information')
        return usages

    for osd_stats in osds_stats:
        usages['size'] += osd_stats['total']
        usages['used'] += osd_stats['used']
    usages['free'] = usages['size'] - usages['used']
    return usages
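# --- Hedged illustration (not part of the original module) ---
# The aggregation from _usages() above, fed with fabricated 'list-osds' style
# records instead of a live AlbaCLI call.
osds_stats = [{'total': 100, 'used': 40},
              {'total': 100, 'used': 10}]
usages = {'free': 0.0, 'size': 0.0, 'used': 0.0}
for osd_stats in osds_stats:
    usages['size'] += osd_stats['total']
    usages['used'] += osd_stats['used']
usages['free'] = usages['size'] - usages['used']
assert usages == {'free': 150.0, 'size': 200.0, 'used': 50.0}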
def delete_preset(alba_backend_guid, name):
    """
    Deletes a preset from the Alba backend
    :param alba_backend_guid: Guid of the ALBA backend
    :type alba_backend_guid: str
    :param name: Name of the preset
    :type name: str
    :return: None
    """
    # VALIDATIONS
    alba_backend = AlbaBackend(alba_backend_guid)
    preset_default_map = dict((preset['name'], preset['is_default']) for preset in alba_backend.presets)
    if name not in preset_default_map:
        AlbaPresetController._logger.warning('Preset with name {0} for ALBA Backend {1} could not be found, so not deleting'.format(name, alba_backend.name))
        return

    if preset_default_map[name] is True:
        raise RuntimeError('Cannot delete the default preset')

    # DELETE PRESET
    AlbaPresetController._logger.debug('Deleting preset {0}'.format(name))
    config = Configuration.get_configuration_path(ArakoonInstaller.CONFIG_KEY.format(AlbaController.get_abm_cluster_name(alba_backend=alba_backend)))
    AlbaCLI.run(command='delete-preset', config=config, extra_params=[name])
    alba_backend.invalidate_dynamics()
def _stack(self):
    """
    Returns an overview of this node's storage stack
    """
    from ovs.dal.hybrids.albabackend import AlbaBackend
    from ovs.dal.lists.albabackendlist import AlbaBackendList

    def _move(info):
        for move in [('state', 'status'), ('state_detail', 'status_detail')]:
            if move[0] in info:
                info[move[1]] = info[move[0]]
                del info[move[0]]

    stack = {}
    node_down = False
    # Fetch stack from asd-manager
    try:
        remote_stack = self.client.get_stack()
        for slot_id, slot_data in remote_stack.iteritems():
            stack[slot_id] = {'status': 'ok'}
            stack[slot_id].update(slot_data)
            # Migrate state > status
            _move(stack[slot_id])
            for osd_data in slot_data.get('osds', {}).itervalues():
                _move(osd_data)
    except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
        self._logger.warning('Error during stack retrieval. Assuming that the node is down')
        node_down = True

    model_osds = {}
    found_osds = {}
    # Apply own model to fetched stack
    for osd in self.osds:
        model_osds[osd.osd_id] = osd  # Initially set the info
        if osd.slot_id not in stack:
            stack[osd.slot_id] = {'status': self.OSD_STATUSES.UNKNOWN if node_down is True else self.OSD_STATUSES.MISSING,
                                  'status_detail': self.OSD_STATUS_DETAILS.NODEDOWN if node_down is True else '',
                                  'osds': {}}
        osd_data = stack[osd.slot_id]['osds'].get(osd.osd_id, {})
        stack[osd.slot_id]['osds'][osd.osd_id] = osd_data  # Initially set the info in the stack
        osd_data.update(osd.stack_info)
        if node_down is True:
            osd_data['status'] = self.OSD_STATUSES.UNKNOWN
            osd_data['status_detail'] = self.OSD_STATUS_DETAILS.NODEDOWN
        elif osd.alba_backend_guid is not None:  # The OSD has been claimed
            # Load information from alba
            if osd.alba_backend_guid not in found_osds:
                found_osds[osd.alba_backend_guid] = {}
                if osd.alba_backend.abm_cluster is not None:
                    config = Configuration.get_configuration_path(osd.alba_backend.abm_cluster.config_location)
                    try:
                        for found_osd in AlbaCLI.run(command='list-all-osds', config=config):
                            found_osds[osd.alba_backend_guid][found_osd['long_id']] = found_osd
                    except (AlbaError, RuntimeError):
                        self._logger.exception('Listing all osds has failed')
                        osd_data['status'] = self.OSD_STATUSES.UNKNOWN
                        osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ALBAERROR
                        continue

            if osd.osd_id not in found_osds[osd.alba_backend_guid]:
                # Not claimed by any backend thus not in use
                continue
            found_osd = found_osds[osd.alba_backend_guid][osd.osd_id]
            if found_osd['decommissioned'] is True:
                osd_data['status'] = self.OSD_STATUSES.UNAVAILABLE
                osd_data['status_detail'] = self.OSD_STATUS_DETAILS.DECOMMISSIONED
                continue

            backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(osd.alba_backend_guid)
            if Configuration.exists(backend_interval_key):
                interval = Configuration.get(backend_interval_key)
            else:
                interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval')
            read = found_osd['read'] or [0]
            write = found_osd['write'] or [0]
            errors = found_osd['errors']
            osd_data['status'] = self.OSD_STATUSES.WARNING
            osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ERROR
            if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                osd_data['status'] = self.OSD_STATUSES.OK
                osd_data['status_detail'] = ''

    statistics = {}
    for slot_info in stack.itervalues():
        for osd_id, osd in slot_info['osds'].iteritems():
            if osd.get('status_detail') == self.OSD_STATUS_DETAILS.ACTIVATING:
                osd['claimed_by'] = 'unknown'  # We won't be able to connect to it just yet
                continue
            if osd_id not in model_osds:
                # The osd is known by the remote node but not in the model
                # In that case, let's connect to the OSD to see whether we get some info from it
                try:
                    ips = osd['hosts'] if 'hosts' in osd and len(osd['hosts']) > 0 else osd.get('ips', [])
                    port = osd['port']
                    claimed_by = 'unknown'
                    for ip in ips:
                        try:
                            # Output will be None if it is not claimed
                            claimed_by = AlbaCLI.run('get-osd-claimed-by', named_params={'host': ip, 'port': port})
                            break
                        except (AlbaError, RuntimeError):
                            self._logger.warning('get-osd-claimed-by failed for IP:port {0}:{1}'.format(ip, port))
                    alba_backend = AlbaBackendList.get_by_alba_id(claimed_by)
                    osd['claimed_by'] = alba_backend.guid if alba_backend is not None else claimed_by
                except KeyError:
                    osd['claimed_by'] = 'unknown'
                except:
                    self._logger.exception('Could not load OSD info: {0}'.format(osd_id))
                    osd['claimed_by'] = 'unknown'
                    if osd.get('status') not in ['error', 'warning']:
                        osd['status'] = self.OSD_STATUSES.ERROR
                        osd['status_detail'] = self.OSD_STATUS_DETAILS.UNREACHABLE

            claimed_by = osd.get('claimed_by', 'unknown')
            if claimed_by == 'unknown':
                continue
            try:
                alba_backend = AlbaBackend(claimed_by)
            except ObjectNotFoundException:
                continue
            # Add usage information
            if alba_backend not in statistics:
                statistics[alba_backend] = alba_backend.osd_statistics
            osd_statistics = statistics[alba_backend]
            if osd_id not in osd_statistics:
                continue
            stats = osd_statistics[osd_id]
            osd['usage'] = {'size': int(stats['capacity']),
                            'used': int(stats['disk_usage']),
                            'available': int(stats['capacity'] - stats['disk_usage'])}
    return stack
def validate_alba_backend_sanity_without_claimed_disks(alba_backend):
    """
    Validate whether the ALBA backend is configured correctly
    :param alba_backend: ALBA backend
    :return: None
    """
    # Attribute validation
    assert alba_backend.available is True, \
        'ALBA backend {0} is not available'.format(alba_backend.backend.name)
    assert len(alba_backend.presets) >= 1, \
        'No preset found for ALBA backend {0}'.format(alba_backend.backend.name)
    assert len([default for default in alba_backend.presets if default['is_default'] is True]) == 1, \
        'Could not find default preset for backend {0}'.format(alba_backend.backend.name)
    assert alba_backend.backend.backend_type.code == 'alba', \
        'Backend type for ALBA backend is {0}'.format(alba_backend.backend.backend_type.code)
    assert alba_backend.backend.status == 'RUNNING', \
        'Status for ALBA backend is {0}'.format(alba_backend.backend.status)

    # Validate ABM and NSM services
    storagerouters = GeneralStorageRouter.get_storage_routers()
    storagerouters_with_db_role = [sr for sr in storagerouters
                                   if GeneralStorageRouter.has_roles(storagerouter=sr, roles='DB') is True
                                   and sr.node_type == 'MASTER']
    assert len(alba_backend.abm_services) == len(storagerouters_with_db_role), \
        'Not enough ABM services found'
    assert len(alba_backend.nsm_services) == len(storagerouters_with_db_role), \
        'Not enough NSM services found'

    # Validate ALBA backend configuration structure
    alba_backend_key = '/ovs/alba/backends'
    assert Configuration.dir_exists(key=alba_backend_key) is True, \
        'Configuration does not contain key {0}'.format(alba_backend_key)

    actual_config_keys = [key for key in Configuration.list(alba_backend_key)]
    expected_config_keys = ['global_gui_error_interval', alba_backend.guid, 'default_nsm_hosts']
    optional_config_keys = ['verification_factor']

    expected_keys_amount = 0
    for optional_key in optional_config_keys:
        if optional_key in actual_config_keys:
            expected_keys_amount += 1

    for expected_key in expected_config_keys:
        if not re.match(Toolbox.regex_guid, expected_key):
            expected_keys_amount += 1
        assert expected_key in actual_config_keys, \
            'Key {0} was not found in tree {1}'.format(expected_key, alba_backend_key)

    for actual_key in list(actual_config_keys):
        if re.match(Toolbox.regex_guid, actual_key):
            actual_config_keys.remove(actual_key)  # Remove all alba backend keys
    assert len(actual_config_keys) == expected_keys_amount, \
        'Another key was added to the {0} tree'.format(alba_backend_key)

    this_alba_backend_key = '{0}/{1}'.format(alba_backend_key, alba_backend.guid)
    actual_keys = [key for key in Configuration.list(this_alba_backend_key)]
    expected_keys = ['maintenance']
    assert actual_keys == expected_keys, \
        'Actual keys: {0} - Expected keys: {1}'.format(actual_keys, expected_keys)

    maintenance_key = '{0}/maintenance'.format(this_alba_backend_key)
    actual_keys = [key for key in Configuration.list(maintenance_key)]
    expected_keys = ['nr_of_agents', 'config']
    assert set(actual_keys) == set(expected_keys), \
        'Actual keys: {0} - Expected keys: {1}'.format(actual_keys, expected_keys)
    # @TODO: Add validation for config values

    # Validate ASD node configuration structure
    alba_nodes = GeneralAlba.get_alba_nodes()
    assert len(alba_nodes) > 0, \
        'Could not find any ALBA nodes in the model'
    alba_node_key = '/ovs/alba/asdnodes'
    actual_keys = [key for key in Configuration.list(alba_node_key)]
    assert len(alba_nodes) == len(actual_keys), \
        'Amount of ALBA nodes in model: {0} >< amount of ALBA nodes in configuration: {1}.'.format(len(alba_nodes), len(actual_keys))
    for alba_node in alba_nodes:
        assert alba_node.node_id in actual_keys, \
            'ALBA node with ID {0} not present in configuration'.format(alba_node.node_id)

        actual_asdnode_keys = [key for key in Configuration.list('{0}/{1}'.format(alba_node_key, alba_node.node_id))]
        expected_asdnode_keys = ['config', 'services']
        assert actual_asdnode_keys == expected_asdnode_keys, \
            'Actual keys: {0} - Expected keys: {1}'.format(actual_asdnode_keys, expected_asdnode_keys)

        actual_config_keys = [key for key in Configuration.list('{0}/{1}/config'.format(alba_node_key, alba_node.node_id))]
        expected_config_keys = ['main', 'network']
        assert set(actual_config_keys) == set(expected_config_keys), \
            'Actual keys: {0} - Expected keys: {1}'.format(actual_config_keys, expected_config_keys)
        # @TODO: Add validation for main and network values

    # Validate Arakoon configuration structure
    arakoon_abm_key = '/ovs/arakoon/{0}/config'.format(alba_backend.abm_services[0].service.name).replace('arakoon-', '')
    arakoon_nsm_key = '/ovs/arakoon/{0}/config'.format(alba_backend.nsm_services[0].service.name).replace('arakoon-', '')
    assert Configuration.exists(key=arakoon_abm_key, raw=True) is True, \
        'Configuration key {0} does not exist'.format(arakoon_abm_key)
    assert Configuration.exists(key=arakoon_nsm_key, raw=True) is True, \
        'Configuration key {0} does not exist'.format(arakoon_nsm_key)
    # @TODO: Add validation for config values

    # Validate maintenance agents
    actual_amount_agents = len([service for node_services in [alba_node.client.list_maintenance_services() for alba_node in alba_nodes] for service in node_services])
    expected_amount_agents = 1
    assert actual_amount_agents == expected_amount_agents, \
        'Amount of maintenance agents is incorrect. Found {0} - Expected {1}'.format(actual_amount_agents, expected_amount_agents)

    # Validate arakoon services
    machine_ids = [sr.machine_id for sr in storagerouters_with_db_role]
    abm_service_name = alba_backend.abm_services[0].service.name
    nsm_service_name = alba_backend.nsm_services[0].service.name
    for storagerouter in storagerouters_with_db_role:
        root_client = SSHClient(endpoint=storagerouter, username='******')
        for service_name in [abm_service_name, nsm_service_name]:
            assert GeneralService.has_service(name=service_name, client=root_client) is True, \
                'Service {0} not deployed on Storage Router {1}'.format(service_name, storagerouter.name)
            exitcode, output = GeneralService.get_service_status(name=service_name, client=root_client)
            assert exitcode is True, \
                'Service {0} not running on Storage Router {1} - {2}'.format(service_name, storagerouter.name, output)

    out, err, _ = General.execute_command('arakoon --who-master -config {0}'.format(Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(abm_service_name.replace('arakoon-', '')))))
    assert out.strip() in machine_ids, \
        'Arakoon master is {0}, but should be 1 of "{1}"'.format(out.strip(), ', '.join(machine_ids))
def migrate(previous_version):
    """
    Migrates from a given version to the current version. It uses 'previous_version' to be smart
    wherever possible, but the code should be able to migrate any version towards the expected version.
    When this is not possible, the code can set a minimum version and raise when it is not met.
    :param previous_version: The previous version from which to start the migration
    :type previous_version: float
    """
    working_version = previous_version

    if working_version == 0:
        from ovs.dal.hybrids.servicetype import ServiceType
        # Initial version:
        # * Add any basic configuration or model entries

        # Add backends
        for backend_type_info in [('ALBA', 'alba')]:
            code = backend_type_info[1]
            backend_type = BackendTypeList.get_backend_type_by_code(code)
            if backend_type is None:
                backend_type = BackendType()
            backend_type.name = backend_type_info[0]
            backend_type.code = code
            backend_type.save()

        # Add service types
        for service_type_info in [ServiceType.SERVICE_TYPES.NS_MGR, ServiceType.SERVICE_TYPES.ALBA_MGR, ServiceType.SERVICE_TYPES.ALBA_S3_TRANSACTION]:
            service_type = ServiceType()
            service_type.name = service_type_info
            service_type.save()

    # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
    elif working_version < DALMigrator.THIS_VERSION:
        import hashlib
        from ovs.dal.exceptions import ObjectNotFoundException
        from ovs.dal.helpers import HybridRunner, Descriptor
        from ovs.dal.hybrids.albaabmcluster import ABMCluster
        from ovs.dal.hybrids.albaosd import AlbaOSD
        from ovs.dal.hybrids.albansmcluster import NSMCluster
        from ovs.dal.hybrids.j_abmservice import ABMService
        from ovs.dal.hybrids.j_nsmservice import NSMService
        from ovs.dal.hybrids.service import Service
        from ovs.dal.hybrids.servicetype import ServiceType
        from ovs.dal.lists.albabackendlist import AlbaBackendList
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.servicetypelist import ServiceTypeList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.extensions.db.arakooninstaller import ArakoonClusterConfig, ArakoonInstaller
        from ovs.extensions.generic.configuration import Configuration, NotFoundException
        from ovs_extensions.generic.toolbox import ExtensionsToolbox
        from ovs.extensions.plugins.albacli import AlbaCLI
        from ovs.extensions.storage.persistentfactory import PersistentFactory

        # Migrate unique constraints & indexes
        client = PersistentFactory.get_client()
        hybrid_structure = HybridRunner.get_hybrids()
        for class_descriptor in hybrid_structure.values():
            cls = Descriptor().load(class_descriptor).get_object()
            classname = cls.__name__.lower()
            unique_key = 'ovs_unique_{0}_{{0}}_'.format(classname)
            index_prefix = 'ovs_index_{0}|{{0}}|'.format(classname)
            index_key = 'ovs_index_{0}|{{0}}|{{1}}'.format(classname)
            uniques = []
            indexes = []
            # noinspection PyProtectedMember
            for prop in cls._properties:
                if prop.unique is True and len([k for k in client.prefix(unique_key.format(prop.name))]) == 0:
                    uniques.append(prop.name)
                if prop.indexed is True and len([k for k in client.prefix(index_prefix.format(prop.name))]) == 0:
                    indexes.append(prop.name)
            if len(uniques) > 0 or len(indexes) > 0:
                prefix = 'ovs_data_{0}_'.format(classname)
                for key, data in client.prefix_entries(prefix):
                    for property_name in uniques:
                        ukey = '{0}{1}'.format(unique_key.format(property_name), hashlib.sha1(str(data[property_name])).hexdigest())
                        client.set(ukey, key)
                    for property_name in indexes:
                        if property_name not in data:
                            continue  # This is the case when there's a new indexed property added.
                        ikey = index_key.format(property_name, hashlib.sha1(str(data[property_name])).hexdigest())
                        index = list(client.get_multi([ikey], must_exist=False))[0]
                        transaction = client.begin_transaction()
                        if index is None:
                            client.assert_value(ikey, None, transaction=transaction)
                            client.set(ikey, [key], transaction=transaction)
                        elif key not in index:
                            client.assert_value(ikey, index[:], transaction=transaction)
                            client.set(ikey, index + [key], transaction=transaction)
                        client.apply_transaction(transaction)

        #############################################
        # Introduction of ABMCluster and NSMCluster #
        #############################################

        # Verify presence of unchanged ALBA Backends
        alba_backends = AlbaBackendList.get_albabackends()
        changes_required = False
        for alba_backend in alba_backends:
            if alba_backend.abm_cluster is None or len(alba_backend.nsm_clusters) == 0:
                changes_required = True
                break

        if changes_required:
            # Retrieve ABM and NSM clusters
            abm_cluster_info = []
            nsm_cluster_info = []
            for cluster_name in Configuration.list('/ovs/arakoon'):
                try:
                    metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
                    if metadata['cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.ABM:
                        abm_cluster_info.append(metadata)
                    elif metadata['cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.NSM:
                        nsm_cluster_info.append(metadata)
                except NotFoundException:
                    continue

            # Retrieve NSM Arakoon cluster information
            cluster_arakoon_map = {}
            for cluster_info in abm_cluster_info + nsm_cluster_info:
                cluster_name = cluster_info['cluster_name']
                arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name)
                cluster_arakoon_map[cluster_name] = arakoon_config.export_dict()

            storagerouter_map = dict((storagerouter.machine_id, storagerouter) for storagerouter in StorageRouterList.get_storagerouters())
            alba_backend_id_map = dict((alba_backend.alba_id, alba_backend) for alba_backend in alba_backends)
            for cluster_info in abm_cluster_info:
                internal = cluster_info['internal']
                cluster_name = cluster_info['cluster_name']
                config_location = Configuration.get_configuration_path(key=ArakoonClusterConfig.CONFIG_KEY.format(cluster_name))
                try:
                    alba_id = AlbaCLI.run(command='get-alba-id', config=config_location, named_params={'attempts': 3})['id']
                    nsm_hosts = AlbaCLI.run(command='list-nsm-hosts', config=config_location, named_params={'attempts': 3})
                except RuntimeError:
                    continue
                alba_backend = alba_backend_id_map.get(alba_id)
                if alba_backend is None:  # ALBA Backend with ID not found in model
                    continue
                if alba_backend.abm_cluster is not None and len(alba_backend.nsm_clusters) > 0:  # Clusters already exist
                    continue

                # Create ABM Cluster
                if alba_backend.abm_cluster is None:
                    abm_cluster = ABMCluster()
                    abm_cluster.name = cluster_name
                    abm_cluster.alba_backend = alba_backend
                    abm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(cluster_name)
                    abm_cluster.save()
                else:
                    abm_cluster = alba_backend.abm_cluster

                # Create ABM Services
                abm_arakoon_config = cluster_arakoon_map[cluster_name]
                abm_arakoon_config.pop('global')
                arakoon_nodes = abm_arakoon_config.keys()
                if internal is False:
                    services_to_create = 1
                else:
                    if set(arakoon_nodes).difference(set(storagerouter_map.keys())):
                        continue
                    services_to_create = len(arakoon_nodes)
                for index in range(services_to_create):
                    service = Service()
                    service.name = 'arakoon-{0}-abm'.format(alba_backend.name)
                    service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ALBA_MGR)
                    if internal is True:
                        arakoon_node_config = abm_arakoon_config[arakoon_nodes[index]]
                        service.ports = [arakoon_node_config['client_port'], arakoon_node_config['messaging_port']]
                        service.storagerouter = storagerouter_map[arakoon_nodes[index]]
                    else:
                        service.ports = []
                        service.storagerouter = None
                    service.save()

                    abm_service = ABMService()
                    abm_service.service = service
                    abm_service.abm_cluster = abm_cluster
                    abm_service.save()

                # Create NSM Clusters
                for cluster_index, nsm_host in enumerate(sorted(nsm_hosts, key=lambda host: ExtensionsToolbox.advanced_sort(host['cluster_id'], '_'))):
                    nsm_cluster_name = nsm_host['cluster_id']
                    nsm_arakoon_config = cluster_arakoon_map.get(nsm_cluster_name)
                    if nsm_arakoon_config is None:
                        continue

                    number = cluster_index if internal is False else int(nsm_cluster_name.split('_')[-1])
                    nsm_cluster = NSMCluster()
                    nsm_cluster.name = nsm_cluster_name
                    nsm_cluster.number = number
                    nsm_cluster.alba_backend = alba_backend
                    nsm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(nsm_cluster_name)
                    nsm_cluster.save()

                    # Create NSM Services
                    nsm_arakoon_config.pop('global')
                    arakoon_nodes = nsm_arakoon_config.keys()
                    if internal is False:
                        services_to_create = 1
                    else:
                        if set(arakoon_nodes).difference(set(storagerouter_map.keys())):
                            continue
                        services_to_create = len(arakoon_nodes)
                    for service_index in range(services_to_create):
                        service = Service()
                        service.name = 'arakoon-{0}-nsm_{1}'.format(alba_backend.name, number)
                        service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.NS_MGR)
                        if internal is True:
                            arakoon_node_config = nsm_arakoon_config[arakoon_nodes[service_index]]
                            service.ports = [arakoon_node_config['client_port'], arakoon_node_config['messaging_port']]
                            service.storagerouter = storagerouter_map[arakoon_nodes[service_index]]
                        else:
                            service.ports = []
                            service.storagerouter = None
                        service.save()

                        nsm_service = NSMService()
                        nsm_service.service = service
                        nsm_service.nsm_cluster = nsm_cluster
                        nsm_service.save()

        # Clean up all junction services no longer linked to an ALBA Backend
        all_nsm_services = [service.nsm_service for service in ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.NS_MGR).services if service.nsm_service.nsm_cluster is None]
        all_abm_services = [service.abm_service for service in ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ALBA_MGR).services if service.abm_service.abm_cluster is None]
        for abm_service in all_abm_services:
            abm_service.delete()
            abm_service.service.delete()
        for nsm_service in all_nsm_services:
            nsm_service.delete()
            nsm_service.service.delete()

        ################################
        # Introduction of Active Drive #
        ################################

        # Update slot_id and Alba Node relation for all OSDs
        client = PersistentFactory.get_client()
        disk_osd_map = {}
        for key, data in client.prefix_entries('ovs_data_albaosd_'):
            alba_disk_guid = data.get('alba_disk', {}).get('guid')
            if alba_disk_guid is not None:
                if alba_disk_guid not in disk_osd_map:
                    disk_osd_map[alba_disk_guid] = []
                disk_osd_map[alba_disk_guid].append(key.replace('ovs_data_albaosd_', ''))
            try:
                value = client.get(key)
                value.pop('alba_disk', None)
                client.set(key=key, value=value)
            except Exception:
                pass  # We don't care if we would have any leftover AlbaDisk information in _data, but it's cleaner not to

        alba_guid_node_map = dict((an.guid, an) for an in AlbaNodeList.get_albanodes())
        for key, data in client.prefix_entries('ovs_data_albadisk_'):
            alba_disk_guid = key.replace('ovs_data_albadisk_', '')
            alba_node_guid = data.get('alba_node', {}).get('guid')
            if alba_disk_guid in disk_osd_map and alba_node_guid in alba_guid_node_map and len(data.get('aliases', [])) > 0:
                slot_id = data['aliases'][0].split('/')[-1]
                for osd_guid in disk_osd_map[alba_disk_guid]:
                    try:
                        osd = AlbaOSD(osd_guid)
                    except ObjectNotFoundException:
                        continue
                    osd.slot_id = slot_id
                    osd.alba_node = alba_guid_node_map[alba_node_guid]
                    osd.save()
            client.delete(key=key, must_exist=False)

        # Remove unique constraints for AlbaNode IP
        for key in client.prefix('ovs_unique_albanode_ip_'):
            client.delete(key=key, must_exist=False)

        # Remove relation for all Alba Disks
        for key in client.prefix('ovs_reverseindex_albadisk_'):
            client.delete(key=key, must_exist=False)

        # Remove the relation between AlbaNode and AlbaDisk
        for key in client.prefix('ovs_reverseindex_albanode_'):
            if '|disks|' in key:
                client.delete(key=key, must_exist=False)

    return DALMigrator.THIS_VERSION
def get_disk_safety(cls, result_handler, backends_to_include=(), backends_to_skip=(), include_errored_as_dead=False):
    """
    Fetch safety of every namespace in every backend
    - amount_in_bucket is in %
    - max_disk_safety is the max. key that should be available in current_disk_safety
    Output example: {'mybackend02': {'1,2': {'max_disk_safety': 2, 'current_disk_safety': {2: {'namespace': u'b4eef27e-ef54-4fe8-8658-cdfbda7ceae4_000000065', 'amount_in_bucket': 100}}}},
                     'mybackend': {'1,2': {'max_disk_safety': 2, 'current_disk_safety': {2: {'namespace': u'b4eef27e-ef54-4fe8-8658-cdfbda7ceae4_000000065', 'amount_in_bucket': 100}}}},
                     'mybackend-global': {'1,2': {'max_disk_safety': 2, 'current_disk_safety': {1: {'namespace': u'e88c88c9-632c-4975-b39f-e9993e352560', 'amount_in_bucket': 100}}}}}
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :param backends_to_include: Backend(s) to check for
    :type backends_to_include: tuple[str]
    :param backends_to_skip: Backend(s) to skip checking for
    :type backends_to_skip: tuple[str]
    :param include_errored_as_dead: OSDs with errors are treated as dead ones during the calculation
    :type include_errored_as_dead: bool
    :return: Safety of every namespace in every backend
    :rtype: dict
    """
    disk_safety_overview = {}
    for alba_backend in BackendHelper.get_albabackends():
        if backends_to_skip and alba_backend.name in backends_to_skip:
            continue
        if backends_to_include and alba_backend.name not in backends_to_include:
            continue
        disk_safety_overview[alba_backend.name] = {}
        config = Configuration.get_configuration_path('ovs/arakoon/{0}-abm/config'.format(alba_backend.name))
        # Fetch alba info
        try:
            extra_params = []
            if include_errored_as_dead:
                # @TODO Revisit once https://github.com/openvstorage/alba/issues/441 has been resolved
                extra_params.append('--include-errored-as-dead')
            namespaces = AlbaCLI.run(command='get-disk-safety', config=config, extra_params=extra_params)
            cache_eviction_prefix_preset_pairs = AlbaCLI.run(command='get-maintenance-config', config=config)['cache_eviction_prefix_preset_pairs']
            presets = AlbaCLI.run(command='list-presets', config=config)
        except AlbaException as ex:
            result_handler.exception('Could not fetch alba information for backend {0}. Message: {1}'.format(alba_backend.name, ex),
                                     code=ErrorCodes.alba_cmd_fail)
            # Do not execute further
            continue
        # Collect in_use presets & their policies
        for preset in presets:
            if not preset['in_use']:
                continue
            for policy in preset['policies']:
                disk_safety_overview[alba_backend.name]['{0},{1}'.format(str(policy[0]), str(policy[1]))] = {'current_disk_safety': {},
                                                                                                             'max_disk_safety': policy[1]}
        # Collect namespaces
        ignorable_namespaces = [cls.BASE_NAMESPACE_KEY] + cache_eviction_prefix_preset_pairs.keys()
        test_worthy_namespaces = (item for item in namespaces if not item['namespace'].startswith(tuple(ignorable_namespaces)))
        for namespace in test_worthy_namespaces:
            # Calculate the total number of objects in the namespace
            total_count = 0
            for bucket_safety in namespace['bucket_safety']:
                total_count += bucket_safety['count']
            for bucket_safety in namespace['bucket_safety']:
                # Calculate the safety bucket
                calculated_disk_safety = bucket_safety['remaining_safety']
                safety = '{0},{1}'.format(str(bucket_safety['bucket'][0]), str(bucket_safety['bucket'][1]))
                current_disk_safety = disk_safety_overview[alba_backend.name][safety]['current_disk_safety']
                to_be_added_namespace = {'namespace': namespace['namespace'],
                                         'amount_in_bucket': "%.5f" % (float(bucket_safety['count']) / float(total_count) * 100)}
                if calculated_disk_safety in current_disk_safety:
                    current_disk_safety[calculated_disk_safety].append(to_be_added_namespace)
                else:
                    current_disk_safety[calculated_disk_safety] = [to_be_added_namespace]
    return disk_safety_overview
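# The structure returned above nests {backend: {policy: {'max_disk_safety': m,
# 'current_disk_safety': {safety: [namespaces]}}}}. A small, self-contained sketch of
# how a caller might flag namespaces running below the maximum safety; the sample data
# is modelled on the docstring example, not on live output:
overview = {'mybackend': {'1,2': {'max_disk_safety': 2,
                                  'current_disk_safety': {1: [{'namespace': 'ns-a', 'amount_in_bucket': '100.00000'}]}}}}
for backend_name, policies in overview.items():
    for policy_key, info in policies.items():
        for safety, degraded_namespaces in info['current_disk_safety'].items():
            if safety < info['max_disk_safety']:
                for ns in degraded_namespaces:
                    print('{0}: namespace {1} at safety {2}/{3}'.format(backend_name, ns['namespace'],
                                                                        safety, info['max_disk_safety']))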
def check_if_proxies_work(cls, result_handler):
    """
    Checks if all Alba Proxies work on the local machine: it creates a namespace and tries to put and get an object
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    namespace_params = {'bucket_count': (list, None),
                        'logical': (int, None),
                        'storage': (int, None),
                        'storage_per_osd': (list, None)}

    result_handler.info('Checking the ALBA proxies.', add_to_result=False)

    amount_of_presets_not_working = []
    # Ignore possible subprocess output
    fnull = open(os.devnull, 'w')
    # Try put/get/verify on all available proxies on the local node
    local_proxies = ServiceHelper.get_local_proxy_services()
    if len(local_proxies) == 0:
        result_handler.info('Found no proxies.', add_to_result=False)
        return amount_of_presets_not_working
    api_cache = {}
    for service in local_proxies:
        try:
            result_handler.info('Checking ALBA proxy {0}.'.format(service.name), add_to_result=False)
            ip = service.alba_proxy.storagedriver.storage_ip
            # Encapsulating try to determine test output
            try:
                # Determine to what backend the proxy is connected
                proxy_client_cfg = AlbaCLI.run(command='proxy-client-cfg', named_params={'host': ip, 'port': service.ports[0]})
            except AlbaException:
                result_handler.failure('Fetching proxy info has failed. Please verify if {0}:{1} is the correct address for proxy {2}.'.format(ip, service.ports[0], service.name),
                                       code=ErrorCodes.alba_cmd_fail)
                continue
            # Fetch arakoon information
            abm_name = proxy_client_cfg.get('cluster_id')
            # Check if the proxy config is correctly set up
            if abm_name is None:
                raise ConfigNotMatchedException('Proxy config for proxy {0} does not have the correct format on node {1} with port {2}.'.format(service.name, ip, service.ports[0]))
            abm_config = Configuration.get_configuration_path('/ovs/vpools/{0}/proxies/{1}/config/abm'
                                                              .format(service.alba_proxy.storagedriver.vpool.guid, service.alba_proxy.guid))
            # Determine presets / backend
            try:
                presets = AlbaCLI.run(command='list-presets', config=abm_config)
            except AlbaException:
                result_handler.failure('Listing the presets has failed. Please check the arakoon config path. We used {0}'.format(abm_config),
                                       code=ErrorCodes.alba_cmd_fail)
                continue

            for preset in presets:
                # If the preset is not in use, the test would fail, so add a skip
                if preset['in_use'] is False:
                    result_handler.skip('Preset {0} is not in use and will not be checked.'.format(preset['name']))
                    continue
                preset_name = preset['name']
                # Encapsulating try for cleanup
                try:
                    # Generate a new namespace name using the preset
                    namespace_key_prefix = 'ovs-healthcheck-ns-{0}-{1}'.format(preset_name, AlbaHealthCheck.LOCAL_ID)
                    namespace_key = '{0}_{1}'.format(namespace_key_prefix, uuid.uuid4())
                    object_key = 'ovs-healthcheck-obj-{0}'.format(str(uuid.uuid4()))
                    # Create the namespace
                    AlbaCLI.run(command='proxy-create-namespace', named_params={'host': ip, 'port': service.ports[0]}, extra_params=[namespace_key, preset_name])
                    # Wait until it is fully created
                    namespace_start_time = time.time()
                    for _ in xrange(2):
                        # Running twice because the first run could give a false positive: the OSDs alert the NSM
                        # and the NSM responds with 'got messages', but these might not be the ones we are after
                        AlbaCLI.run(command='deliver-messages', config=abm_config)
                    while True:
                        if time.time() - namespace_start_time > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                            raise AlbaTimeOutException('Creating namespace has timed out after {0}s'.format(time.time() - namespace_start_time), 'deliver-messages')
                        list_ns_osds_output = AlbaCLI.run(command='list-ns-osds', config=abm_config, extra_params=[namespace_key])
                        # Example output: [[0, [u'Active']], [3, [u'Active']]]
                        namespace_ready = True
                        for osd_info in list_ns_osds_output:
                            if osd_info[1][0] != 'Active':
                                # Found an OSD that is not yet Active; check whether the preset is satisfiable instead
                                namespace_ready = False
                                break
                        if namespace_ready is True:
                            break
                        else:
                            result_handler.info('Not all OSDs have responded to the creation message. Fetching the safety.', add_to_result=False)
                            try:
                                # Fetch the preset information from the Framework
                                # This adds an extra delay for the messages to propagate too
                                vpool = service.alba_proxy.storagedriver.vpool
                                alba_backend_guid = vpool.metadata['backend']['backend_info']['alba_backend_guid']
                                api_url = 'alba/backends/{0}'.format(alba_backend_guid)
                                if api_url not in api_cache:
                                    connection_info = vpool.metadata['backend']['backend_info']['connection_info']
                                    api_client = OVSClient(connection_info['host'], connection_info['port'], (connection_info['client_id'], connection_info['client_secret']))
                                    start = time.time()
                                    _presets = api_client.get(api_url, params={'contents': 'presets'})['presets']
                                    api_cache[api_url] = _presets
                                    result_handler.info('Fetching the safety took {0} seconds.'.format(time.time() - start))
                                _presets = api_cache[api_url]
                                _preset = filter(lambda p: p['name'] == preset_name, _presets)[0]
                                if _preset['is_available'] is True:
                                    # Preset is satisfiable; don't care about OSD availability
                                    result_handler.info('Requested preset is available, no longer waiting on \'deliver_messages\'.', add_to_result=False)
                                    break
                                else:
                                    raise ValueError('Requested preset is marked as unavailable. Please check the disk safety.')
                            except ValueError:
                                raise
                            except Exception:
                                msg = 'Could not query the preset data. Checking the preset might time out.'
                                result_handler.warning(msg)
                                cls.logger.exception(msg)
                    # Sleep for syncing purposes
                    time.sleep(1)
                    result_handler.success('Namespace successfully created on proxy {0} with preset {1}!'.format(service.name, preset_name),
                                           code=ErrorCodes.proxy_namespace_create)
                    namespace_info = AlbaCLI.run(command='show-namespace', config=abm_config, extra_params=[namespace_key])
                    ExtensionsToolbox.verify_required_params(required_params=namespace_params, actual_params=namespace_info)
                    result_handler.success('Namespace successfully fetched on proxy {0} with preset {1}!'.format(service.name, preset_name),
                                           code=ErrorCodes.proxy_namespace_fetch)
                    # Put the test object in the namespace
                    with open(AlbaHealthCheck.TEMP_FILE_LOC, 'wb') as output_file:
                        output_file.write(os.urandom(AlbaHealthCheck.TEMP_FILE_SIZE))
                    AlbaCLI.run(command='proxy-upload-object',
                                named_params={'host': ip, 'port': service.ports[0]},
                                extra_params=[namespace_key, AlbaHealthCheck.TEMP_FILE_LOC, object_key])
                    result_handler.success('Successfully uploaded the object to namespace {0}.'.format(namespace_key),
                                           code=ErrorCodes.proxy_upload_obj)
                    # Download the object
                    AlbaCLI.run(command='proxy-download-object',
                                named_params={'host': ip, 'port': service.ports[0]},
                                extra_params=[namespace_key, object_key, AlbaHealthCheck.TEMP_FILE_FETCHED_LOC])
                    result_handler.success('Successfully downloaded the object from namespace {0}.'.format(namespace_key),
                                           code=ErrorCodes.proxy_download_obj)
                    # Check if both files exist - issue #57
                    if not (os.path.isfile(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC) and os.path.isfile(AlbaHealthCheck.TEMP_FILE_LOC)):
                        # Creation of the object failed
                        raise ObjectNotFoundException(ValueError('Creation of object has failed'))
                    hash_original = hashlib.md5(open(AlbaHealthCheck.TEMP_FILE_LOC, 'rb').read()).hexdigest()
                    hash_fetched = hashlib.md5(open(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC, 'rb').read()).hexdigest()
                    if hash_original == hash_fetched:
                        result_handler.success('Fetched object {0} from namespace {1} on proxy {2} with preset {3} matches the created object!'.format(object_key, namespace_key, service.name, preset_name),
                                               code=ErrorCodes.proxy_verify_obj)
                    else:
                        result_handler.failure('Fetched object {0} from namespace {1} on proxy {2} with preset {3} does not match the created object!'.format(object_key, namespace_key, service.name, preset_name),
                                               code=ErrorCodes.proxy_verify_obj_fail)
                except ValueError:
                    result_handler.failure('The preset is not available for use.')
                except ObjectNotFoundException as ex:
                    amount_of_presets_not_working.append(preset_name)
                    result_handler.failure('Failed to put object in namespace {0} on proxy {1} with preset {2}. Error: {3}'.format(namespace_key, service.name, preset_name, ex))
                except AlbaTimeOutException as ex:
                    result_handler.failure(str(ex))
                except AlbaException as ex:
                    code = ErrorCodes.alba_cmd_fail
                    if ex.alba_command == 'proxy-create-namespace':
                        result_handler.failure('Create namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}.'.format(str(ex), namespace_key, service.name, preset_name), code=code)
                    elif ex.alba_command == 'show-namespace':
                        result_handler.failure('Show namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}.'.format(str(ex), namespace_key, service.name, preset_name), code=code)
                    elif ex.alba_command == 'proxy-upload-object':
                        result_handler.failure('Uploading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}.'.format(str(ex), namespace_key, service.name, preset_name), code=code)
                    elif ex.alba_command == 'proxy-download-object':
                        result_handler.failure('Downloading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}.'.format(str(ex), namespace_key, service.name, preset_name), code=code)
                finally:
                    # Clean up the test files and the created namespace(s)
                    subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_LOC)], stdout=fnull, stderr=subprocess.STDOUT)
                    subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC)], stdout=fnull, stderr=subprocess.STDOUT)
                    try:
                        namespaces = AlbaCLI.run(command='list-namespaces', config=abm_config)
                        namespaces_to_remove = []
                        proxy_named_params = {'host': ip, 'port': service.ports[0]}
                        for namespace in namespaces:
                            if namespace['name'].startswith(namespace_key_prefix):
                                namespaces_to_remove.append(namespace['name'])
                        for namespace_name in namespaces_to_remove:
                            if namespace_name == namespace_key:
                                result_handler.info('Deleting namespace {0}.'.format(namespace_name))
                            else:
                                result_handler.warning('Deleting namespace {0} which was left over from a previous run.'.format(namespace_name))
                            AlbaCLI.run(command='proxy-delete-namespace', named_params=proxy_named_params, extra_params=[namespace_name])
                            namespace_delete_start = time.time()
                            while True:
                                try:
                                    AlbaCLI.run(command='show-namespace', config=abm_config, extra_params=[namespace_name])  # Will fail once the namespace no longer exists
                                except AlbaException:
                                    result_handler.success('Namespace {0} successfully removed.'.format(namespace_name))
                                    break
                                if time.time() - namespace_delete_start > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                    raise AlbaTimeOutException('Delete namespace has timed out after {0}s'.format(time.time() - namespace_delete_start), 'show-namespace')
                            # Be tidy and make the proxy forget the namespace
                            try:
                                AlbaCLI.run(command='proxy-statistics', named_params=proxy_named_params, extra_params=['--forget', namespace_name])
                            except Exception:
                                result_handler.warning('Failed to make proxy forget namespace {0}.'.format(namespace_name))
                    except AlbaException as ex:
                        if ex.alba_command == 'list-namespaces':
                            result_handler.failure('List namespaces has failed with {0} on namespace {1} with proxy {2} with preset {3}.'.format(str(ex), namespace_key, service.name, preset_name))
                        elif ex.alba_command == 'proxy-delete-namespace':
                            result_handler.failure('Delete namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}.'.format(str(ex), namespace_key, service.name, preset_name))
        except subprocess.CalledProcessError as ex:
            # This should stay for the deletion of the remaining files
            amount_of_presets_not_working.append(service.name)
            result_handler.failure('Proxy {0} has some problems. Got {1} as error.'.format(service.name, ex), code=ErrorCodes.proxy_problems)
        except ConfigNotMatchedException as ex:
            amount_of_presets_not_working.append(service.name)
            result_handler.failure('Proxy {0} has some problems. Got {1} as error.'.format(service.name, ex), code=ErrorCodes.proxy_problems)
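# The proxy check ultimately verifies an upload/download round trip by comparing MD5
# digests of the source file and the fetched copy. The core of that verification,
# reduced to a runnable sketch on the local filesystem (the copy stands in for the
# proxy-upload-object / proxy-download-object pair; paths are placeholders):
import hashlib
import os
import shutil
import tempfile

def md5_of(path):
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

src = tempfile.mktemp()
dst = tempfile.mktemp()
with open(src, 'wb') as f:
    f.write(os.urandom(1024 * 1024))  # 1 MiB of random data, like the TEMP_FILE_SIZE payload
shutil.copy(src, dst)  # Stand-in for the upload followed by the download
assert md5_of(src) == md5_of(dst), 'Fetched object does not match the created object'
for path in (src, dst):
    os.remove(path)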
def get_disk_safety(cls, result_handler): """ Fetch safety of every namespace in every backend - amount_in_bucket is in % - max_disk_safety is the max. key that should be available in current_disk_safety Output example: {'mybackend02': {'1,2': {'max_disk_safety': 2, 'current_disk_safety': {2: {'namespace': u'b4eef27e-ef54-4fe8-8658-cdfbda7ceae4_000000065', 'amount_in_bucket': 100}}}}, 'mybackend': {'1,2': {'max_disk_safety': 2, 'current_disk_safety': {2: {'namespace': u'b4eef27e-ef54-4fe8-8658-cdfbda7ceae4_000000065', 'amount_in_bucket': 100}}}}, 'mybackend-global': {'1,2': {'max_disk_safety': 2, 'current_disk_safety': {1: {'namespace': u'e88c88c9-632c-4975-b39f-e9993e352560', 'amount_in_bucket': 100}}}}} :param result_handler: logging object :type result_handler: ovs.extensions.healthcheck.result.HCResults :return: Safety of every namespace in every backend :rtype: dict """ disk_safety_overview = {} for alba_backend in BackendHelper.get_albabackends(): disk_safety_overview[alba_backend.name] = {} config = Configuration.get_configuration_path( 'ovs/arakoon/{0}-abm/config'.format(alba_backend.name)) # Fetch alba info try: # @TODO add this to extra_params to include corrupt asds. Currently there is a bug with it # Ticket: https://github.com/openvstorage/alba/issues/441 # extra_params=['--include-errored-as-dead'] namespaces = AlbaCLI.run(command='get-disk-safety', config=config) cache_eviction_prefix_preset_pairs = AlbaCLI.run( command='get-maintenance-config', config=config)['cache_eviction_prefix_preset_pairs'] presets = AlbaCLI.run(command='list-presets', config=config) except AlbaException as ex: result_handler.exception( 'Could not fetch alba information for backend {0} Message: {1}' .format(alba_backend.name, ex)) # Do not execute further continue # collect in_use presets & their policies for preset in presets: if not preset['in_use']: continue for policy in preset['policies']: disk_safety_overview[alba_backend.name]['{0},{1}'.format( str(policy[0]), str(policy[1]))] = { 'current_disk_safety': {}, 'max_disk_safety': policy[1] } # collect namespaces ignorable_namespaces = [ cls.BASE_NAMESPACE_KEY ] + cache_eviction_prefix_preset_pairs.keys() test_worthy_namespaces = (item for item in namespaces if not item['namespace'].startswith( tuple(ignorable_namespaces))) for namespace in test_worthy_namespaces: # calc total objects in namespace total_count = 0 for bucket_safety in namespace['bucket_safety']: total_count += bucket_safety['count'] for bucket_safety in namespace['bucket_safety']: # calc safety bucket calculated_disk_safety = bucket_safety['remaining_safety'] safety = '{0},{1}'.format(str(bucket_safety['bucket'][0]), str(bucket_safety['bucket'][1])) current_disk_safety = disk_safety_overview[ alba_backend.name][safety]['current_disk_safety'] to_be_added_namespace = { 'namespace': namespace['namespace'], 'amount_in_bucket': "%.5f" % (float(bucket_safety['count']) / float(total_count) * 100) } if calculated_disk_safety in current_disk_safety: current_disk_safety[calculated_disk_safety].append( to_be_added_namespace) else: current_disk_safety[calculated_disk_safety] = [ to_be_added_namespace ] return disk_safety_overview
def check_model_consistency(result_handler):
    """
    Checks the model consistency of OVSDB vs. the VOLUMEDRIVER and does a preliminary check on RABBITMQ
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    result_handler.info('Checking model consistency.')
    # Checking consistency of volumedriver vs. ovsdb and backwards
    for vp in VPoolHelper.get_vpools():
        if vp.guid not in OpenvStorageHealthCheck.LOCAL_SR.vpools_guids:
            result_handler.skip('Skipping vPool {0} because it is not living here.'.format(vp.name))
            continue
        result_handler.info('Checking consistency of volumedriver vs. ovsdb for {0}: '.format(vp.name), add_to_result=False)
        missing_in_volumedriver = []
        missing_in_model = []
        if len(vp.storagedrivers) == 0:
            # Guard against vPools without StorageDrivers (mirrors the guard in check_for_halted_volumes)
            result_handler.failure('The vpool {0} does not have any storagedrivers associated to it!'.format(vp.name))
            continue
        config_file = Configuration.get_configuration_path('/ovs/vpools/{0}/hosts/{1}/config'.format(vp.guid, vp.storagedrivers[0].name))
        try:
            voldrv_client = src.LocalStorageRouterClient(config_file)
            # noinspection PyArgumentList
            voldrv_volume_list = voldrv_client.list_volumes()
        except (ClusterNotReachableException, RuntimeError) as ex:
            result_handler.warning('Seems like the volumedriver {0} is not running. Got {1}'.format(vp.name, ex.message))
            continue

        vdisk_volume_ids = []
        # Cross-reference model vs. volumedriver
        for vdisk in vp.vdisks:
            vdisk_volume_ids.append(vdisk.volume_id)
            if vdisk.volume_id not in voldrv_volume_list:
                missing_in_volumedriver.append(vdisk.guid)
            else:
                voldrv_volume_list.remove(vdisk.volume_id)
        # Cross-reference volumedriver vs. model
        for voldrv_id in voldrv_volume_list:
            if voldrv_id not in vdisk_volume_ids:
                missing_in_model.append(voldrv_id)

        # Display the discrepancies for this vPool
        if len(missing_in_volumedriver) != 0:
            result_handler.warning('Detected volumes that are MISSING in volumedriver but are in ovsdb in vpool: {0} - vdisk guid(s): {1}.'
                                   .format(vp.name, ', '.join(missing_in_volumedriver)))
        else:
            result_handler.success('No discrepancies found for ovsdb in vPool {0}'.format(vp.name))
        if len(missing_in_model) != 0:
            result_handler.warning('Detected volumes that are AVAILABLE in volumedriver but are not in ovsdb in vpool: {0} - vdisk volume id(s): {1}'
                                   .format(vp.name, ', '.join(missing_in_model)))
        else:
            result_handler.success('No discrepancies found for voldrv in vpool {0}'.format(vp.name))
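# The cross-referencing above is just a symmetric difference between two collections of
# volume IDs. The same check expressed with sets (illustrative IDs only):
model_ids = {'vol-1', 'vol-2', 'vol-3'}        # volume_ids known to the model (ovsdb)
voldrv_ids = {'vol-2', 'vol-3', 'vol-4'}       # volumes reported by the volumedriver
missing_in_volumedriver = model_ids - voldrv_ids   # in ovsdb, unknown to the volumedriver
missing_in_model = voldrv_ids - model_ids          # in the volumedriver, unknown to ovsdb
print(missing_in_volumedriver, missing_in_model)   # set(['vol-1']) set(['vol-4'])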
def cluster_registry_checkup(): """ Verify whether changes have occurred in the cluster registry for each vPool :return: Information whether changes occurred :rtype: dict """ changed_vpools = {} for vpool in VPoolList.get_vpools(): changed_vpools[vpool.guid] = {'changes': False, 'success': True} try: StorageDriverController._logger.info('Validating cluster registry settings for Vpool {0}'.format(vpool.guid)) current_configs = vpool.clusterregistry_client.get_node_configs() changes = len(current_configs) == 0 node_configs = [] for sd in vpool.storagedrivers: sd.invalidate_dynamics(['cluster_node_config']) new_config = sd.cluster_node_config node_configs.append(ClusterNodeConfig(**new_config)) if changes is False: current_node_configs = [config for config in current_configs if config.vrouter_id == sd.storagedriver_id] if len(current_node_configs) == 1: current_node_config = current_node_configs[0] for key in new_config: if getattr(current_node_config, key) != new_config[key]: changes = True break changed_vpools[vpool.guid]['changes'] = changes if changes is True: StorageDriverController._logger.info('Cluster registry settings for Vpool {0} needs to be updated'.format(vpool.guid)) available_storagedrivers = [] for sd in vpool.storagedrivers: storagerouter = sd.storagerouter try: SSHClient(storagerouter, username='******') with remote(storagerouter.ip, [LocalStorageRouterClient]) as rem: sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, sd.storagedriver_id) if Configuration.exists(sd_key) is True: path = Configuration.get_configuration_path(sd_key) lsrc = rem.LocalStorageRouterClient(path) lsrc.server_revision() # 'Cheap' call to verify whether volumedriver is responsive available_storagedrivers.append(sd) except UnableToConnectException: StorageDriverController._logger.warning('StorageRouter {0} not available.'.format(storagerouter.name)) except Exception as ex: if 'ClusterNotReachableException' in str(ex): StorageDriverController._logger.warning('StorageDriver {0} on StorageRouter {1} not available.'.format( sd.guid, storagerouter.name )) else: StorageDriverController._logger.exception('Got exception when validating StorageDriver {0} on StorageRouter {1}.'.format( sd.guid, storagerouter.name )) StorageDriverController._logger.info('Updating cluster node configs for VPool {0}'.format(vpool.guid)) vpool.clusterregistry_client.set_node_configs(node_configs) for sd in available_storagedrivers: StorageDriverController._logger.info('Trigger config reload for StorageDriver {0}'.format(sd.guid)) vpool.storagedriver_client.update_cluster_node_configs(str(sd.storagedriver_id), req_timeout_secs=10) StorageDriverController._logger.info('Updating cluster node configs for Vpool {0} completed'.format(vpool.guid)) else: StorageDriverController._logger.info('Cluster registry settings for Vpool {0} is up to date'.format(vpool.guid)) except Exception as ex: StorageDriverController._logger.exception('Got exception when validating cluster registry settings for Vpool {0}.'.format(vpool.name)) changed_vpools[vpool.guid]['success'] = False changed_vpools[vpool.guid]['error'] = ex.message return changed_vpools
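# Change detection above compares every key of the freshly generated node config with
# the attributes of the registry's current config object. The comparison in isolation,
# using a plain object and dict with hypothetical fields:
class CurrentConfig(object):
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

def config_changed(current, new_config):
    """True as soon as one key of the new config differs from the registry's copy."""
    return any(getattr(current, key) != value for key, value in new_config.items())

current = CurrentConfig(vrouter_id='sd-1', host='10.0.0.1', message_port=2000)
print(config_changed(current, {'vrouter_id': 'sd-1', 'host': '10.0.0.2', 'message_port': 2000}))  # True
print(config_changed(current, {'vrouter_id': 'sd-1', 'host': '10.0.0.1', 'message_port': 2000}))  # False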
def _presets(self): """ Returns the policies active on the node """ if self.abm_cluster is None: return [] # No ABM cluster yet, so backend not fully installed yet osds = {} if self.scaling != AlbaBackend.SCALINGS.GLOBAL: for node_id, slots in self.local_stack.iteritems(): osds[node_id] = 0 for slot_id, slot_data in slots.iteritems(): for osd_id, osd_data in slot_data['osds'].iteritems(): if osd_data['status'] in [ AlbaNode.OSD_STATUSES.OK, AlbaNode.OSD_STATUSES.WARNING ] and osd_data.get('claimed_by') == self.guid: osds[node_id] += 1 config = Configuration.get_configuration_path( self.abm_cluster.config_location) presets = AlbaCLI.run(command='list-presets', config=config) preset_dict = {} for preset in presets: preset_dict[preset['name']] = preset if 'in_use' not in preset: preset['in_use'] = True if 'is_default' not in preset: preset['is_default'] = False preset['is_available'] = False preset['policies'] = [ tuple(policy) for policy in preset['policies'] ] preset['policy_metadata'] = {} active_policy = None for policy in preset['policies']: is_available = False available_disks = 0 if self.scaling == AlbaBackend.SCALINGS.GLOBAL: available_disks += sum( self.local_summary['devices'].values()) if self.scaling == AlbaBackend.SCALINGS.LOCAL: available_disks += sum( min(osds[node], policy[3]) for node in osds) if available_disks >= policy[2]: if active_policy is None: active_policy = policy is_available = True preset['policy_metadata'][policy] = { 'is_active': False, 'in_use': False, 'is_available': is_available } preset['is_available'] |= is_available if active_policy is not None: preset['policy_metadata'][active_policy]['is_active'] = True for namespace in self.ns_data: if namespace['namespace']['state'] != 'active': continue policy_usage = namespace['statistics']['bucket_count'] preset = preset_dict[namespace['namespace']['preset_name']] for usage in policy_usage: used_policy = tuple( usage[0]) # Policy as reported to be "in use" for configured_policy in preset[ 'policies']: # All configured policies if used_policy[0] == configured_policy[0] and used_policy[ 1] == configured_policy[ 1] and used_policy[3] <= configured_policy[3]: preset['policy_metadata'][configured_policy][ 'in_use'] = True break for preset in presets: preset['policies'] = [str(policy) for policy in preset['policies']] for key in preset['policy_metadata'].keys(): preset['policy_metadata'][str( key)] = preset['policy_metadata'][key] del preset['policy_metadata'][key] return presets
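# A policy is the tuple (k, m, c, x): k data fragments, m parity fragments, c the
# minimum number of fragments that must be written, x the maximum number of fragments
# per node. For LOCAL scaling the availability check above caps each node's claimed
# OSDs at x and requires the total to reach c. A stand-alone sketch of that rule:
def policy_available(policy, osds_per_node):
    k, m, c, x = policy
    return sum(min(count, x) for count in osds_per_node.values()) >= c

print(policy_available((2, 1, 3, 1), {'node-a': 2, 'node-b': 1, 'node-c': 0}))  # False: only 2 usable OSDs
print(policy_available((2, 1, 3, 1), {'node-a': 2, 'node-b': 1, 'node-c': 1}))  # True: 3 usable OSDs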
def start_services(self): """ Start all services related to the Storagedriver :return: None :rtype: NoneType """ if self.sr_installer is None: raise RuntimeError('No StorageRouterInstaller instance found') vpool = self.vp_installer.vpool root_client = self.sr_installer.root_client storagerouter = self.sr_installer.storagerouter alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA) voldrv_pkg_name, voldrv_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD) # Add/start watcher volumedriver service if not self.service_manager.has_service(name=ServiceFactory.SERVICE_WATCHER_VOLDRV, client=root_client): self.service_manager.add_service(name=ServiceFactory.SERVICE_WATCHER_VOLDRV, client=root_client) self.service_manager.start_service(name=ServiceFactory.SERVICE_WATCHER_VOLDRV, client=root_client) # Add/start DTL service self.service_manager.add_service(name=self.SERVICE_TEMPLATE_DTL, params={'DTL_PATH': self.storagedriver_partition_dtl.path, 'DTL_ADDRESS': self.storagedriver.storage_ip, 'DTL_PORT': str(self.storagedriver.ports['dtl']), 'DTL_TRANSPORT': StorageDriverClient.VPOOL_DTL_TRANSPORT_MAP[self.dtl_transport], 'LOG_SINK': Logger.get_sink_path('storagedriver-dtl_{0}'.format(self.storagedriver.storagedriver_id)), 'VOLDRV_PKG_NAME': voldrv_pkg_name, 'VOLDRV_VERSION_CMD': voldrv_version_cmd}, client=root_client, target_name=self.dtl_service) self.service_manager.start_service(name=self.dtl_service, client=root_client) # Add/start ALBA proxy services for proxy in self.storagedriver.alba_proxies: alba_proxy_service = 'ovs-{0}'.format(proxy.service.name) self.service_manager.add_service(name=self.SERVICE_TEMPLATE_PROXY, params={'VPOOL_NAME': vpool.name, 'LOG_SINK': Logger.get_sink_path(proxy.service.name), 'CONFIG_PATH': Configuration.get_configuration_path('/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, proxy.guid)), 'ALBA_PKG_NAME': alba_pkg_name, 'ALBA_VERSION_CMD': alba_version_cmd}, client=root_client, target_name=alba_proxy_service) self.service_manager.start_service(name=alba_proxy_service, client=root_client) # Add/start StorageDriver service self.service_manager.add_service(name=self.SERVICE_TEMPLATE_SD, params={'KILL_TIMEOUT': '30', 'VPOOL_NAME': vpool.name, 'VPOOL_MOUNTPOINT': self.storagedriver.mountpoint, 'CONFIG_PATH': StorageDriverConfiguration(vpool_guid=vpool.guid, storagedriver_id=self.storagedriver.storagedriver_id).remote_path, 'OVS_UID': root_client.run(['id', '-u', 'ovs']).strip(), 'OVS_GID': root_client.run(['id', '-g', 'ovs']).strip(), 'LOG_SINK': Logger.get_sink_path('storagedriver_{0}'.format(self.storagedriver.storagedriver_id)), 'VOLDRV_PKG_NAME': voldrv_pkg_name, 'VOLDRV_VERSION_CMD': voldrv_version_cmd, 'METADATASTORE_BITS': 5}, client=root_client, target_name=self.sd_service) current_startup_counter = self.storagedriver.startup_counter self.service_manager.start_service(name=self.sd_service, client=root_client) tries = 60 while self.storagedriver.startup_counter == current_startup_counter and tries > 0: self._logger.debug('Waiting for the StorageDriver to start up for vPool {0} on StorageRouter {1} ...'.format(vpool.name, storagerouter.name)) if self.service_manager.get_service_status(name=self.sd_service, client=root_client) != 'active': raise RuntimeError('StorageDriver service failed to start (service not running)') tries -= 1 time.sleep(60 - tries) self.storagedriver.discard() if self.storagedriver.startup_counter == current_startup_counter: 
raise RuntimeError('StorageDriver service failed to start (got no event)') self._logger.debug('StorageDriver running')
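# Start-up detection above polls a counter that the volumedriver bumps once it has
# fully started, and fails hard when the service drops out of 'active' before that
# happens. The waiting pattern in isolation, with fake probe functions standing in for
# the DAL object and the service manager:
import time

def wait_for_change(read_counter, is_active, tries=60):
    initial = read_counter()
    while read_counter() == initial and tries > 0:
        if not is_active():
            raise RuntimeError('service failed to start (service not running)')
        tries -= 1
        time.sleep(0.01)  # The real code sleeps longer on every retry (increasing back-off)
    if read_counter() == initial:
        raise RuntimeError('service failed to start (got no event)')

state = {'counter': 0, 'polls': 0}
def read_counter():
    state['polls'] += 1
    if state['polls'] > 3:  # Pretend the service came up after a few polls
        state['counter'] = 1
    return state['counter']

wait_for_change(read_counter, lambda: True)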
def stop_services(self):
    """
    Stop all services related to the Storagedriver
    :return: A boolean indicating whether something went wrong
    :rtype: bool
    """
    if self.sr_installer is None:
        raise RuntimeError('No StorageRouterInstaller instance found')

    root_client = self.sr_installer.root_client
    errors_found = False

    for service in [self.sd_service, self.dtl_service]:
        try:
            if self.service_manager.has_service(name=service, client=root_client):
                self._logger.debug('StorageDriver {0} - Stopping service {1}'.format(self.storagedriver.guid, service))
                self.service_manager.stop_service(name=service, client=root_client)
                self._logger.debug('StorageDriver {0} - Removing service {1}'.format(self.storagedriver.guid, service))
                self.service_manager.remove_service(name=service, client=root_client)
        except Exception:
            self._logger.exception('StorageDriver {0} - Disabling/stopping service {1} failed'.format(self.storagedriver.guid, service))
            errors_found = True

    sd_config_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(self.vp_installer.vpool.guid, self.storagedriver.storagedriver_id)
    if self.vp_installer.storagedriver_amount <= 1 and Configuration.exists(sd_config_key):
        try:
            for proxy in self.storagedriver.alba_proxies:
                if self.service_manager.has_service(name=proxy.service.name, client=root_client):
                    self._logger.debug('StorageDriver {0} - Starting proxy {1}'.format(self.storagedriver.guid, proxy.service.name))
                    self.service_manager.start_service(name=proxy.service.name, client=root_client)
                    tries = 10
                    running = False
                    port = proxy.service.ports[0]
                    while running is False and tries > 0:
                        self._logger.debug('StorageDriver {0} - Waiting for the proxy {1} to start up'.format(self.storagedriver.guid, proxy.service.name))
                        tries -= 1
                        time.sleep(10 - tries)
                        try:
                            root_client.run(['alba', 'proxy-statistics', '--host', self.storagedriver.storage_ip, '--port', str(port)])
                            running = True
                        except CalledProcessError as ex:
                            self._logger.error('StorageDriver {0} - Fetching alba proxy-statistics failed with error (but ignoring): {1}'.format(self.storagedriver.guid, ex))
                    if running is False:
                        raise RuntimeError('Alba proxy {0} failed to start'.format(proxy.service.name))
                    self._logger.debug('StorageDriver {0} - Alba proxy {1} running'.format(self.storagedriver.guid, proxy.service.name))

            self._logger.debug('StorageDriver {0} - Destroying filesystem and erasing node configs'.format(self.storagedriver.guid))
            with remote(root_client.ip, [LocalStorageRouterClient], username='******') as rem:
                path = Configuration.get_configuration_path(sd_config_key)
                storagedriver_client = rem.LocalStorageRouterClient(path)
                try:
                    storagedriver_client.destroy_filesystem()
                except RuntimeError as rte:
                    # If the backend has already been deleted, we cannot delete the filesystem anymore --> storage leak!!!
                    if 'MasterLookupResult.Error' not in rte.message:
                        raise
            self.vp_installer.vpool.clusterregistry_client.erase_node_configs()
        except RuntimeError:
            self._logger.exception('StorageDriver {0} - Destroying filesystem and erasing node configs failed'.format(self.storagedriver.guid))
            errors_found = True

    for proxy in self.storagedriver.alba_proxies:
        service_name = proxy.service.name
        try:
            if self.service_manager.has_service(name=service_name, client=root_client):
                self._logger.debug('StorageDriver {0} - Stopping service {1}'.format(self.storagedriver.guid, service_name))
                self.service_manager.stop_service(name=service_name, client=root_client)
                self._logger.debug('StorageDriver {0} - Removing service {1}'.format(self.storagedriver.guid, service_name))
                self.service_manager.remove_service(name=service_name, client=root_client)
        except Exception:
            self._logger.exception('StorageDriver {0} - Disabling/stopping service {1} failed'.format(self.storagedriver.guid, service_name))
            errors_found = True

    return errors_found
def get_stats_vdisks(cls): """ Retrieve statistics about all vDisks on the system. Check the safety, storage amount on the Backend, fail-over status and others """ if cls._config is None: cls.validate_and_retrieve_config() stats = [] errors = False environment = cls._config['environment'] alba_backend_info = {} for alba_backend in AlbaBackendList.get_albabackends(): config_path = Configuration.get_configuration_path( alba_backend.abm_cluster.config_location) disk_safety = {} namespace_usage = {} # Retrieve namespace, preset and disk safety information try: preset_info = AlbaCLI.run( command='list-presets', config=config_path ) # Not using alba_backend.presets, because it takes a whole lot longer to retrieve all_namespace_info = AlbaCLI.run(command='show-namespaces', config=config_path, extra_params=['--max=-1'])[1] all_disk_safety_info = AlbaCLI.run(command='get-disk-safety', config=config_path) except Exception: errors = True cls._logger.exception( 'Retrieving information for ALBA Backend {0} failed'. format(alba_backend.name)) continue alba_backend_info[alba_backend.guid] = { 'disk_safety': disk_safety, 'namespace_usage': namespace_usage } # Parse namespace information for namespace_info in all_namespace_info: namespace_usage[namespace_info['name']] = float( namespace_info['statistics']['storage']) # Parse preset information policies = [] preset_name = None for preset in preset_info: if preset['in_use'] is not True: continue preset_name = preset['name'] policies.extend(preset['policies']) if preset_name is None: continue # Parse disk safety information total_objects = 0 max_lost_disks = 0 max_disk_safety = 0 bucket_overview = {} disk_lost_overview = {} disk_safety_overview = {} for disk_safety_info in all_disk_safety_info: safety = disk_safety_info['safety'] volume_id = disk_safety_info['namespace'] disk_safety[volume_id] = float( safety) if safety is not None else safety for bucket_safety in disk_safety_info['bucket_safety']: bucket = bucket_safety['bucket'] objects = bucket_safety['count'] remaining_safety = bucket_safety['remaining_safety'] if bucket[1] > max_lost_disks: max_lost_disks = bucket[1] if remaining_safety > max_disk_safety: max_disk_safety = remaining_safety for policy in policies: k = policy[0] == bucket[0] m = policy[1] == bucket[1] c = policy[2] <= bucket[2] x = policy[3] >= bucket[3] if k and m and c and x: if preset_name not in bucket_overview: bucket_overview[preset_name] = { 'policy': str(policy), 'presets': {} } bucket[2] -= bucket_safety['applicable_dead_osds'] if str(bucket ) not in bucket_overview[preset_name]['presets']: bucket_overview[preset_name]['presets'][str( bucket)] = { 'objects': 0, 'disk_safety': 0 } disk_lost = bucket[0] + bucket[1] - bucket[ 2] # Data fragments + parity fragments - amount of fragments to write + dead osds if disk_lost not in disk_lost_overview: disk_lost_overview[disk_lost] = 0 if remaining_safety not in disk_safety_overview: disk_safety_overview[remaining_safety] = 0 total_objects += objects disk_lost_overview[disk_lost] += objects disk_safety_overview[remaining_safety] += objects bucket_overview[preset_name]['presets'][str( bucket)]['objects'] += objects bucket_overview[preset_name]['presets'][str( bucket)]['disk_safety'] = remaining_safety # Create statistics regarding disk safety for disk_lost_number in xrange(max_lost_disks + 1): stats.append({ 'tags': { 'disk_lost': disk_lost_number, 'environment': environment, 'backend_name': alba_backend.name }, 'fields': { 'objects': disk_lost_overview.get(disk_lost_number, 0), 
'total_objects': total_objects }, 'measurement': 'disk_lost' }) for disk_safety_number in xrange(max_disk_safety + 1): stats.append({ 'tags': { 'disk_safety': disk_safety_number, 'environment': environment, 'backend_name': alba_backend.name }, 'fields': { 'objects': disk_safety_overview.get(disk_safety_number, 0), 'total_objects': total_objects }, 'measurement': 'disk_safety' }) for preset_name, result in bucket_overview.iteritems(): for bucket_count, bucket_result in result['presets'].iteritems( ): stats.append({ 'tags': { 'bucket': bucket_count, 'policy': result['policy'], 'preset_name': preset_name, 'environment': environment, 'disk_safety': bucket_result['disk_safety'], 'backend_name': alba_backend.name }, 'fields': { 'objects': bucket_result['objects'], 'total_objects': total_objects }, 'measurement': 'bucket' }) # Integrate namespace and disk safety information in vPool stats for vpool in VPoolList.get_vpools(): alba_backend_guid = vpool.metadata['backend']['backend_info'][ 'alba_backend_guid'] for vdisk in vpool.vdisks: try: metrics = cls._convert_to_float_values( cls._pop_realtime_info(vdisk.statistics)) metrics['failover_mode'] = vdisk.dtl_status metrics['frontend_size'] = float(vdisk.size) metrics['failover_mode_status'] = cls._FAILOVER_MAP.get( vdisk.dtl_status, 3) if alba_backend_guid in alba_backend_info: metrics['disk_safety'] = alba_backend_info[ alba_backend_guid]['disk_safety'].get( vdisk.volume_id) metrics['backend_stored'] = alba_backend_info[ alba_backend_guid]['namespace_usage'].get( vdisk.volume_id) stats.append({ 'tags': { 'disk_name': vdisk.name, 'volume_id': vdisk.volume_id, 'vpool_name': vdisk.vpool.name, 'environment': environment, 'storagerouter_name': StorageRouter(vdisk.storagerouter_guid).name }, 'fields': metrics, 'measurement': 'vdisk' }) except Exception: errors = True cls._logger.exception( 'Retrieving statistics for vDisk {0} with guid {1} failed' .format(vdisk.name, vdisk.guid)) return errors, stats
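# A bucket reported by get-disk-safety matches a configured policy (k, m, c, x) when
# k and m are equal, the bucket's written-fragment count is at least c and its node
# spread is at most x; lost disks are then data + parity fragments minus the written
# count after subtracting the applicable dead OSDs, as in the comment above. The
# arithmetic in isolation, with made-up numbers (my reading of the bucket layout):
def bucket_matches(policy, bucket):
    return (policy[0] == bucket[0] and policy[1] == bucket[1]
            and policy[2] <= bucket[2] and policy[3] >= bucket[3])

policy = (4, 2, 5, 3)
bucket = [4, 2, 6, 3]                      # k, m, fragments written, nodes used
dead_osds = 1                              # applicable_dead_osds for this bucket
print(bucket_matches(policy, bucket))      # True
effective = bucket[2] - dead_osds          # dead OSDs reduce the effective fragments
print(bucket[0] + bucket[1] - effective)   # disk_lost == 1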
def shrink_vpool(cls, storagedriver_guid, offline_storage_router_guids=list()):
    """
    Removes a StorageDriver (if it's the last StorageDriver for a vPool, the vPool is removed as well)
    :param storagedriver_guid: Guid of the StorageDriver to remove
    :type storagedriver_guid: str
    :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                         WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS
    :type offline_storage_router_guids: list
    :return: None
    :rtype: NoneType
    """
    # TODO: Add logging
    # TODO: Unit test individual pieces of code
    # Validations
    storagedriver = StorageDriver(storagedriver_guid)
    storagerouter = storagedriver.storagerouter
    cls._logger.info('StorageDriver {0} - Deleting StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
    vp_installer.validate(storagedriver=storagedriver)

    sd_installer = StorageDriverInstaller(vp_installer=vp_installer, storagedriver=storagedriver)

    cls._logger.info('StorageDriver {0} - Checking availability of related StorageRouters'.format(storagedriver.guid))
    sr_client_map = SSHClient.get_clients(endpoints=[sd.storagerouter for sd in vp_installer.vpool.storagedrivers],
                                          user_names=['root'])
    sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(storagerouter, {}).get('root'),
                                          storagerouter=storagerouter,
                                          vp_installer=vp_installer,
                                          sd_installer=sd_installer)

    offline_srs = sr_client_map.pop('offline')
    if sorted([sr.guid for sr in offline_srs]) != sorted(offline_storage_router_guids):
        raise RuntimeError('Not all StorageRouters are reachable')

    if storagerouter not in offline_srs:
        mtpt_pids = sr_installer.root_client.run("lsof -t +D '/mnt/{0}' || true".format(vp_installer.name.replace(r"'", r"'\''")),
                                                 allow_insecure=True).splitlines()
        if len(mtpt_pids) > 0:
            raise RuntimeError('vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'.format(', '.join(mtpt_pids)))

    # Retrieve reachable StorageDrivers
    reachable_storagedrivers = []
    for sd in vp_installer.vpool.storagedrivers:
        if sd.storagerouter not in sr_client_map:
            # StorageRouter is offline
            continue
        sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vp_installer.vpool.guid, sd.storagedriver_id)
        if Configuration.exists(sd_key) is True:
            path = Configuration.get_configuration_path(sd_key)
            with remote(sd.storagerouter.ip, [LocalStorageRouterClient]) as rem:
                try:
                    lsrc = rem.LocalStorageRouterClient(path)
                    lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                    cls._logger.info('StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'.format(storagedriver.guid, sd.name, sd.storagerouter.ip))
                    reachable_storagedrivers.append(sd)
                except Exception as exception:
                    if not is_connection_failure(exception):
                        raise

    if len(reachable_storagedrivers) == 0:
        raise RuntimeError('Could not find any responsive node in the cluster')

    # Start removal
    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
    else:
        vp_installer.update_status(status=VPool.STATUSES.DELETING)

    # Clean up stale vDisks
    cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(storagedriver.guid))
    VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

    # Reconfigure the MDSes
    cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(storagedriver.guid))
    for vdisk_guid in storagerouter.vdisks_guids:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk_guid,
                                               excluded_storagerouter_guids=[storagerouter.guid] + offline_storage_router_guids)
        except Exception:
            cls._logger.exception('StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'.format(storagedriver.guid, vdisk_guid))

    # Validate that all MDSes on the current StorageRouter have been moved away
    # Ensure safety does not always throw an error, which is why we perform this check here instead of in the except clause of the code above
    vdisks = []
    for mds in vp_installer.mds_services:
        for junction in mds.vdisks:
            vdisk = junction.vdisk
            if vdisk in vdisks:
                continue
            vdisks.append(vdisk)
            cls._logger.critical('StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'.format(storagedriver.guid, vdisk.guid, vdisk.name))
    if len(vdisks) > 0:
        # Put the vPool back in RUNNING, so it can be used again; errors keep on displaying in the GUI now anyway
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        raise RuntimeError('Not all MDS Services have been successfully migrated away')

    # Start with the actual removal
    # NOTE: every cleanup step returns True when something went wrong, so the results must be
    # aggregated with |= (an &= starting from False would stay False and silently discard every error)
    errors_found = False
    if storagerouter not in offline_srs:
        errors_found |= sd_installer.stop_services()

    errors_found |= vp_installer.configure_cluster_registry(exclude=[storagedriver], apply_on=reachable_storagedrivers)
    errors_found |= vp_installer.update_node_distance_map()
    errors_found |= vp_installer.remove_mds_services()
    errors_found |= sd_installer.clean_config_management()
    errors_found |= sd_installer.clean_model()

    if storagerouter not in offline_srs:
        errors_found |= sd_installer.clean_directories(mountpoints=StorageRouterController.get_mountpoints(client=sr_installer.root_client))
        try:
            DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)
        except Exception:
            cls._logger.exception('StorageDriver {0} - Synchronizing disks with reality failed'.format(storagedriver.guid))
            errors_found = True

    if vp_installer.storagedriver_amount > 1:
        # Update the vPool metadata and run DTL checkup
        vp_installer.vpool.metadata['caching_info'].pop(sr_installer.storagerouter.guid, None)
        vp_installer.vpool.save()
        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
        except Exception:
            cls._logger.exception('StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'.format(storagedriver.guid, vp_installer.name, vp_installer.vpool.guid))
    else:
        cls._logger.info('StorageDriver {0} - Removing vPool from model'.format(storagedriver.guid))
        # Clean up the model
        try:
            vp_installer.vpool.delete()
        except Exception:
            errors_found = True
            cls._logger.exception('StorageDriver {0} - Cleaning up vPool from the model failed'.format(storagedriver.guid))
        Configuration.delete('/ovs/vpools/{0}'.format(vp_installer.vpool.guid))

    cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(storagedriver.guid))
    try:
        MDSServiceController.mds_checkup()
    except Exception:
        cls._logger.exception('StorageDriver {0} - MDS checkup failed'.format(storagedriver.guid))

    # Update the vPool status
    if errors_found is True:
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise RuntimeError('1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information')

    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('StorageDriver {0} - Deleted StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    if len(VPoolList.get_vpools()) == 0:
        cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
        if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)['internal'] is True:
            cls._logger.debug('StorageDriver {0} - Removing Arakoon cluster {1}'.format(storagedriver.guid, cluster_name))
            try:
                installer = ArakoonInstaller(cluster_name=cluster_name)
                installer.load()
                installer.delete_cluster()
            except Exception:
                cls._logger.exception('StorageDriver {0} - Delete voldrv Arakoon cluster failed'.format(storagedriver.guid))
            service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
            for service in list(service_type.services):
                if service.name == service_name:
                    service.delete()

    # Remove the watcher volumedriver service if this was the last StorageDriver on the current StorageRouter
    if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:  # Ensure the client is initialized for the StorageRouter
        try:
            if cls._service_manager.has_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client):
                cls._service_manager.stop_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
                cls._service_manager.remove_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
        except Exception:
            cls._logger.exception('StorageDriver {0} - {1} service deletion failed'.format(storagedriver.guid, ServiceFactory.SERVICE_WATCHER_VOLDRV))
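# The teardown phase above runs every cleanup step regardless of earlier failures and
# only aggregates the outcome at the end. With booleans where True means 'something
# went wrong', the accumulator must use |= (bitwise or): starting from False, &= would
# stay False forever and mask every error. Minimal illustration:
errors_found = False
for step_failed in (False, True, False):  # Pretend the second cleanup step failed
    errors_found |= step_failed
print(errors_found)  # True: the one failure is not lost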
def test_collapse():
    """
    Test the arakoon collapsing
    :return: None
    """
    ArakoonCollapse.LOGGER.info("Starting validating arakoon collapse")
    node_ips = StoragerouterHelper.get_storagerouter_ips()
    node_ips.sort()
    for node_ip in node_ips:
        ArakoonCollapse.LOGGER.info("Fetching arakoons on node `{0}`".format(node_ip))
        arakoon_clusters = []
        root_client = SSHClient(node_ip, username='******')

        # Fetch the arakoon clusters on this node
        for service in ServiceList.get_services():
            if service.is_internal is True and service.storagerouter.ip == node_ip and \
                    service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                          ServiceType.SERVICE_TYPES.NS_MGR,
                                          ServiceType.SERVICE_TYPES.ALBA_MGR):
                arakoon_clusters.append(service.name.replace('arakoon-', ''))

        # Perform the collapse
        ArakoonCollapse.LOGGER.info("Starting arakoon collapse on node `{0}`".format(node_ip))
        for arakoon_cluster in arakoon_clusters:
            ArakoonCollapse.LOGGER.info("Fetching `{0}` arakoon on node `{1}`".format(arakoon_cluster, node_ip))
            arakoon_config_path = Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(arakoon_cluster))
            tlog_location = '/opt/OpenvStorage/db/arakoon/{0}/tlogs'.format(arakoon_cluster)

            # Read tlog_dir from the cluster config
            with remote(node_ip, [Configuration]) as rem:
                config_contents = rem.Configuration.get('/ovs/arakoon/{0}/config'.format(arakoon_cluster), raw=True)
                for line in config_contents.splitlines():
                    if 'tlog_dir' in line:
                        tlog_location = line.split()[-1]

            nr_of_tlogs = ArakoonCollapse.get_nr_of_tlogs_in_folder(root_client, tlog_location)
            old_headdb_timestamp = 0
            if root_client.file_exists('/'.join([tlog_location, 'head.db'])):
                old_headdb_timestamp = root_client.run(['stat', '--format=%Y', '{0}/{1}'.format(tlog_location, 'head.db')])
            if nr_of_tlogs <= 2:
                benchmark_command = ['arakoon', '--benchmark', '-n_clients', '1', '-max_n', '5_000', '-config', arakoon_config_path]
                root_client.run(benchmark_command)

            ArakoonCollapse.LOGGER.info("Collapsing arakoon `{0}` on node `{1}` ...".format(arakoon_cluster, node_ip))
            GenericController.collapse_arakoon()

            nr_of_tlogs = ArakoonCollapse.get_nr_of_tlogs_in_folder(root_client, tlog_location)
            new_headdb_timestamp = root_client.run(['stat', '--format=%Y', '{0}/{1}'.format(tlog_location, 'head.db')])

            # Perform the assertions
            assert nr_of_tlogs <= 2, \
                'Arakoon collapse left {0} tlogs on the environment, expecting at most 2 for `{1}` on node `{2}`' \
                .format(nr_of_tlogs, arakoon_cluster, node_ip)
            assert old_headdb_timestamp != new_headdb_timestamp, \
                'Timestamp of the head.db file was not changed in the process of collapsing tlogs ' \
                'of arakoon `{0}` on node `{1}`'.format(arakoon_cluster, node_ip)
            ArakoonCollapse.LOGGER.info("Successfully collapsed arakoon `{0}` on node `{1}`".format(arakoon_cluster, node_ip))

    ArakoonCollapse.LOGGER.info("Finished validating arakoon collapsing")
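# The collapse test measures two things per cluster: how many tlog files remain in the
# tlog directory and whether head.db was rewritten. Both reduce to plain filesystem
# calls; a local sketch, assuming tlog files can be recognised by their extension (the
# directory name below is a placeholder):
import os

def tlog_stats(tlog_dir):
    names = os.listdir(tlog_dir)
    nr_of_tlogs = len([name for name in names if name.endswith(('.tlx', '.tlog'))])
    head_db = os.path.join(tlog_dir, 'head.db')
    head_db_timestamp = os.stat(head_db).st_mtime if os.path.exists(head_db) else 0
    return nr_of_tlogs, head_db_timestamp

# before = tlog_stats('/opt/OpenvStorage/db/arakoon/mycluster/tlogs')
# ... collapse ...
# after = tlog_stats('/opt/OpenvStorage/db/arakoon/mycluster/tlogs')
# assert after[0] <= 2 and after[1] != before[1]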
def check_backends(result_handler):
    """
    Checks Alba as a whole
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    result_handler.info('Checking available ALBA backends.', add_to_result=False)
    try:
        alba_backends = AlbaHealthCheck._get_all_responding_backends(result_handler)
        if len(alba_backends) == 0:
            return result_handler.skip('No backends found.')

        result_handler.success('We found {0} backend(s)!'.format(len(alba_backends)))
        result_handler.info('Checking the ALBA ASDs.', add_to_result=False)
        for backend in alba_backends:
            backend_name = backend['name']
            # Check the disks of the backend; global backends have no ASDs of their own, so skip them
            if backend['type'] != 'LOCAL':
                result_handler.skip('Alba backend {0} is a global backend.'.format(backend_name), add_to_result=False)
                continue

            config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(backend_name))
            try:
                result_disks = AlbaHealthCheck._check_backend_asds(result_handler, backend['disks'], backend_name, config)
            except Exception:
                result_handler.warning('Could not fetch the ASD information for alba backend {0}'.format(backend_name))
                continue
            working_disks = result_disks['working']
            defective_disks = result_disks['broken']

            # Check if the backend is available for vPool attachment / use
            if backend['is_available_for_vpool']:
                if len(defective_disks) == 0:
                    result_handler.success('Alba backend {0} should be available for vPool use. All ASDs are working fine!'.format(backend_name))
                else:
                    result_handler.warning('Alba backend {0} should be available for vPool use with {1} ASDs, but there are {2} defective ASDs: {3}'
                                           .format(backend_name, len(working_disks), len(defective_disks), ', '.join(defective_disks)))
            else:
                if len(working_disks) == 0 and len(defective_disks) == 0:
                    result_handler.skip('Alba backend {0} is not available for vPool use, there are no ASDs assigned to this backend!'.format(backend_name))
                else:
                    result_handler.failure('Alba backend {0} is not available for vPool use, preset requirements not satisfied! There are {1} working ASDs AND {2} '
                                           'defective ASDs!'.format(backend_name, len(working_disks), len(defective_disks)))
    except NotFoundException as ex:
        result_handler.failure('Failed to fetch the object with exception: {0}'.format(ex))
    except ConnectionFailedException as ex:
        result_handler.failure('Failed to connect to the configuration master with exception: {0}'.format(ex))
    except (ArakoonNotFound, ArakoonNoMaster, ArakoonNoMasterResult) as ex:
        result_handler.failure('Seems like an Arakoon has some problems: {0}'.format(ex))
def cluster_registry_checkup(): """ Verify whether changes have occurred in the cluster registry for each vPool :return: Information whether changes occurred :rtype: dict """ changed_vpools = {} for vpool in VPoolList.get_vpools(): changed_vpools[vpool.guid] = {'changes': False, 'success': True} try: StorageDriverController._logger.info( 'Validating cluster registry settings for Vpool {0}'. format(vpool.guid)) current_configs = vpool.clusterregistry_client.get_node_configs( ) changes = len(current_configs) == 0 node_configs = [] for sd in vpool.storagedrivers: sd.invalidate_dynamics(['cluster_node_config']) new_config = sd.cluster_node_config node_configs.append(ClusterNodeConfig(**new_config)) if changes is False: current_node_configs = [ config for config in current_configs if config.vrouter_id == sd.storagedriver_id ] if len(current_node_configs) == 1: current_node_config = current_node_configs[0] for key in new_config: if getattr(current_node_config, key) != new_config[key]: changes = True break changed_vpools[vpool.guid]['changes'] = changes if changes is True: StorageDriverController._logger.info( 'Cluster registry settings for Vpool {0} needs to be updated' .format(vpool.guid)) available_storagedrivers = [] for sd in vpool.storagedrivers: storagerouter = sd.storagerouter try: SSHClient(storagerouter, username='******') except UnableToConnectException: StorageDriverController._logger.warning( 'StorageRouter {0} not available.'.format( storagerouter.name)) continue with remote(storagerouter.ip, [LocalStorageRouterClient]) as rem: sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format( vpool.guid, sd.storagedriver_id) if Configuration.exists(sd_key) is True: path = Configuration.get_configuration_path( sd_key) try: lsrc = rem.LocalStorageRouterClient(path) lsrc.server_revision( ) # 'Cheap' call to verify whether volumedriver is responsive available_storagedrivers.append(sd) except Exception as ex: if 'ClusterNotReachableException' in str( ex): StorageDriverController._logger.warning( 'StorageDriver {0} on StorageRouter {1} not available.' .format(sd.guid, storagerouter.name)) else: StorageDriverController._logger.exception( 'Got exception when validating StorageDriver {0} on StorageRouter {1}.' .format(sd.guid, storagerouter.name)) StorageDriverController._logger.info( 'Updating cluster node configs for VPool {0}'.format( vpool.guid)) vpool.clusterregistry_client.set_node_configs(node_configs) for sd in available_storagedrivers: StorageDriverController._logger.info( 'Trigger config reload for StorageDriver {0}'. format(sd.guid)) vpool.storagedriver_client.update_cluster_node_configs( str(sd.storagedriver_id), req_timeout_secs=10) StorageDriverController._logger.info( 'Updating cluster node configs for Vpool {0} completed' .format(vpool.guid)) else: StorageDriverController._logger.info( 'Cluster registry settings for Vpool {0} is up to date' .format(vpool.guid)) except Exception as ex: StorageDriverController._logger.exception( 'Got exception when validating cluster registry settings for Vpool {0}.' .format(vpool.name)) changed_vpools[vpool.guid]['success'] = False changed_vpools[vpool.guid]['error'] = ex.message return changed_vpools
def _local_stack(self): """ Returns a live list of all disks known to this AlbaBackend """ from ovs.dal.lists.albanodelist import AlbaNodeList from ovs.dal.lists.albabackendlist import AlbaBackendList if len(self.abm_services) == 0: return {} # No ABM services yet, so backend not fully installed yet alba_backend_map = {} for alba_backend in AlbaBackendList.get_albabackends(): alba_backend_map[alba_backend.alba_id] = alba_backend # Load information based on the model asd_map = {} storage_map = {} alba_nodes = AlbaNodeList.get_albanodes() for node in alba_nodes: node_id = node.node_id storage_map[node_id] = {} for disk in node.disks: disk_id = disk.aliases[0].split('/')[-1] storage_map[node_id][disk_id] = {'asds': {}, 'name': disk_id, 'guid': disk.guid, 'status': 'error', 'aliases': disk.aliases, 'status_detail': 'unknown'} for osd in disk.osds: osd_id = osd.osd_id data = {'asd_id': osd_id, 'guid': osd.guid, 'status': 'error', 'status_detail': 'unknown', 'alba_backend_guid': osd.alba_backend_guid} asd_map[osd_id] = data storage_map[node_id][disk_id]['asds'][osd_id] = data # Load information from node def _load_live_info(_node, _node_data): _data = _node.storage_stack if _data['status'] != 'ok': for disk_entry in _node_data.values(): disk_entry['status_detail'] = _data['status'] for entry in disk_entry.get('asds', {}).values(): entry['status_detail'] = _data['status'] else: for _disk_id, disk_asd_info in _data['stack'].iteritems(): if _disk_id not in _node_data: _node_data[_disk_id] = {'asds': {}} entry = _node_data[_disk_id] disk_info = copy.deepcopy(disk_asd_info) del disk_info['asds'] entry.update(disk_info) asds_info = disk_asd_info['asds'] for _asd_id, asd_info in asds_info.iteritems(): if _asd_id not in _node_data[_disk_id]['asds']: _node_data[_disk_id]['asds'][_asd_id] = asd_info else: _node_data[_disk_id]['asds'][_asd_id].update(asd_info) threads = [] for node in alba_nodes: thread = Thread(target=_load_live_info, args=(node, storage_map[node.node_id])) thread.start() threads.append(thread) for thread in threads: thread.join() # Mix in usage information for asd_id, stats in self.asd_statistics.iteritems(): if asd_id in asd_map: asd_map[asd_id]['usage'] = {'size': int(stats['capacity']), 'used': int(stats['disk_usage']), 'available': int(stats['capacity'] - stats['disk_usage'])} # Load information from alba backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid) if Configuration.exists(backend_interval_key): interval = Configuration.get(backend_interval_key) else: interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval') config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name)) asds = {} for found_osd in AlbaCLI.run(command='list-all-osds', config=config): asds[found_osd['long_id']] = found_osd for node_data in storage_map.values(): for _disk in node_data.values(): for asd_id, asd_data in _disk['asds'].iteritems(): if asd_id not in asds: continue found_osd = asds[asd_id] if 'state' not in asd_data: continue if found_osd.get('decommissioned') is True: asd_data['status'] = 'unavailable' asd_data['status_detail'] = 'decommissioned' continue state = asd_data['state'] if state == 'ok': if found_osd['id'] is None: alba_id = found_osd['alba_id'] if alba_id is None: asd_data['status'] = 'available' else: asd_data['status'] = 'unavailable' alba_backend = alba_backend_map.get(alba_id) if alba_backend is not None: asd_data['alba_backend_guid'] = alba_backend.guid else: asd_data['alba_backend_guid'] = self.guid 
asd_data['status'] = 'warning' asd_data['status_detail'] = 'recenterrors' read = found_osd['read'] or [0] write = found_osd['write'] or [0] errors = found_osd['errors'] if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval): asd_data['status'] = 'claimed' asd_data['status_detail'] = '' else: asd_data['status'] = 'error' asd_data['status_detail'] = asd_data.get('state_detail', '') alba_backend = alba_backend_map.get(found_osd.get('alba_id')) if alba_backend is not None: asd_data['alba_backend_guid'] = alba_backend.guid return storage_map
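The mapping returned above nests node id, then disk id, then ASD id, and the usage mix-in is plain byte arithmetic on the ASD statistics. A small self-contained sketch with hypothetical identifiers:

def build_usage(capacity, disk_usage):
    """Mirror the 'usage' dict mixed into each ASD entry above (sizes in bytes)."""
    return {'size': int(capacity),
            'used': int(disk_usage),
            'available': int(capacity - disk_usage)}

example_map = {'node-1': {'disk-a': {'name': 'disk-a',
                                     'guid': 'disk-guid-1',  # hypothetical guid
                                     'status': 'ok',
                                     'asds': {'asd-1': {'asd_id': 'asd-1',
                                                        'status': 'claimed',
                                                        'usage': build_usage(capacity=4 * 1024 ** 4, disk_usage=1024 ** 4)}}}}}
assert example_map['node-1']['disk-a']['asds']['asd-1']['usage']['available'] == 3 * 1024 ** 4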
def add_preset(alba_backend_guid, name, compression, policies, encryption, fragment_size=None): """ Adds a preset to Alba :param alba_backend_guid: Guid of the ALBA backend :type alba_backend_guid: str :param name: Name of the preset :type name: str :param compression: Compression type for the preset (none | snappy | bz2) :type compression: str :param policies: Policies for the preset :type policies: list :param encryption: Encryption for the preset (none | aes-cbc-256 | aes-ctr-256) :type encryption: str :param fragment_size: Size of a fragment in bytes (e.g. 1048576) :type fragment_size: int :return: None """ # VALIDATIONS if not re.match(Toolbox.regex_preset, name): raise ValueError('Invalid preset name specified') compression_options = ['snappy', 'bz2', 'none'] if compression not in compression_options: raise ValueError( 'Invalid compression format specified, please choose from: "{0}"' .format('", "'.join(compression_options))) encryption_options = ['aes-cbc-256', 'aes-ctr-256', 'none'] if encryption not in encryption_options: raise ValueError( 'Invalid encryption format specified, please choose from: "{0}"' .format('", "'.join(encryption_options))) if fragment_size is not None and (not isinstance(fragment_size, int) or not 16 <= fragment_size <= 1024**3): raise ValueError( 'Fragment size should be a positive integer smaller than 1 GiB' ) AlbaPresetController._validate_policies_param(policies=policies) alba_backend = AlbaBackend(alba_backend_guid) if name in [preset['name'] for preset in alba_backend.presets]: raise RuntimeError( 'Preset with name {0} already exists'.format(name)) # ADD PRESET preset = { 'compression': compression, 'object_checksum': { 'default': ['crc-32c'], 'verify_upload': True, 'allowed': [['none'], ['sha-1'], ['crc-32c']] }, 'osds': ['all'], 'fragment_size': 16 * 1024**2 if fragment_size is None else int(fragment_size), 'policies': policies, 'fragment_checksum': ['crc-32c'], 'fragment_encryption': ['none'], 'in_use': False, 'name': name } # Generate encryption key temp_key_file = None if encryption != 'none': encryption_key = ''.join( random.choice(chr(random.randint(32, 126))) for _ in range(32)) temp_key_file = tempfile.mktemp() with open(temp_key_file, 'wb') as temp_file: temp_file.write(encryption_key) temp_file.flush() preset['fragment_encryption'] = [ '{0}'.format(encryption), '{0}'.format(temp_key_file) ] # Dump preset content on filesystem config = Configuration.get_configuration_path( alba_backend.abm_cluster.config_location) temp_config_file = tempfile.mktemp() with open(temp_config_file, 'wb') as data_file: data_file.write(json.dumps(preset)) data_file.flush() # Create preset AlbaPresetController._logger.debug( 'Adding preset {0} with compression {1} and policies {2}'.format( name, compression, policies)) AlbaCLI.run(command='create-preset', config=config, named_params={'input-url': temp_config_file}, extra_params=[name]) # Cleanup alba_backend.invalidate_dynamics() for filename in [temp_key_file, temp_config_file]: if filename and os.path.exists(filename) and os.path.isfile( filename): os.remove(filename)
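The preset body is not passed on the command line; it is dumped to a temporary JSON file whose path is handed to the alba binary via input-url. A stdlib-only sketch of that pattern with an illustrative preset body follows; tempfile.mkstemp is used here because tempfile.mktemp (as used above) is documented as race-prone.

import json
import os
import tempfile

preset = {'name': 'demo-preset', 'compression': 'snappy', 'policies': [[2, 2, 3, 4]]}  # illustrative values only
fd, temp_config_file = tempfile.mkstemp()
with os.fdopen(fd, 'w') as data_file:
    json.dump(preset, data_file)  # the CLI later reads this path
# The path would then be handed to the CLI, e.g.:
# AlbaCLI.run(command='create-preset', config=config, named_params={'input-url': temp_config_file}, extra_params=['demo-preset'])
os.remove(temp_config_file)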
def check_if_proxies_work(result_handler):
    """
    Checks if all ALBA proxies work on a local machine; it creates a namespace and tries to put and get an object
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    namespace_params = {'bucket_count': (list, None),
                        'logical': (int, None),
                        'storage': (int, None),
                        'storage_per_osd': (list, None)}
    result_handler.info('Checking the ALBA proxies.', add_to_result=False)
    amount_of_presets_not_working = []
    # Ignore possible subprocess output
    fnull = open(os.devnull, 'w')
    # Try put/get/verify on all available proxies on the local node
    local_proxies = ServiceHelper.get_local_proxy_services()
    if len(local_proxies) == 0:
        result_handler.info('Found no proxies.', add_to_result=False)
        return amount_of_presets_not_working
    for service in local_proxies:
        try:
            result_handler.info('Checking ALBA proxy {0}.'.format(service.name), add_to_result=False)
            ip = service.alba_proxy.storagedriver.storage_ip
            # Encapsulating try to determine test output
            try:
                # Determine to what backend the proxy is connected
                proxy_client_cfg = AlbaCLI.run(command='proxy-client-cfg', named_params={'host': ip, 'port': service.ports[0]})
            except AlbaException:
                result_handler.failure('Fetching proxy info has failed. Please verify if {0}:{1} is the correct address for proxy {2}.'.format(ip, service.ports[0], service.name))
                continue
            # Fetch Arakoon information
            abm_name = proxy_client_cfg.get('cluster_id')
            # Check if the proxy config is correctly set up
            if abm_name is None:
                raise ConfigNotMatchedException('Proxy config for proxy {0} does not have the correct format on node {1} with port {2}.'.format(service.name, ip, service.ports[0]))
            abm_config = Configuration.get_configuration_path('/ovs/vpools/{0}/proxies/{1}/config/abm'.format(service.alba_proxy.storagedriver.vpool.guid, service.alba_proxy.guid))
            # Determine presets / backend
            try:
                presets = AlbaCLI.run(command='list-presets', config=abm_config)
            except AlbaException:
                result_handler.failure('Listing the presets has failed. Please check the Arakoon config path. We used {0}'.format(abm_config))
                continue
            for preset in presets:
                # If the preset is not in use, the test would fail, so add a skip
                if preset['in_use'] is False:
                    result_handler.skip('Preset {0} is not in use and will not be checked'.format(preset['name']))
                    continue
                preset_name = preset['name']
                # Encapsulating try for cleanup
                try:
                    # Generate a new namespace name using the preset
                    namespace_key_prefix = 'ovs-healthcheck-ns-{0}-{1}'.format(preset_name, AlbaHealthCheck.LOCAL_ID)
                    namespace_key = '{0}_{1}'.format(namespace_key_prefix, uuid.uuid4())
                    object_key = 'ovs-healthcheck-obj-{0}'.format(str(uuid.uuid4()))
                    # Create the namespace
                    AlbaCLI.run(command='proxy-create-namespace', named_params={'host': ip, 'port': service.ports[0]}, extra_params=[namespace_key, preset_name])
                    # Wait until it is fully created
                    namespace_start_time = time.time()
                    for index in xrange(2):
                        # Running twice because the first run could give a false positive: the OSDs alert the NSM,
                        # and the NSM would respond with got messages, but these were not the ones we are after
                        AlbaCLI.run(command='deliver-messages', config=abm_config)
                    while True:
                        if time.time() - namespace_start_time > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                            raise RuntimeError('Namespace creation has timed out after {0}s'.format(time.time() - namespace_start_time))
                        list_ns_osds_output = AlbaCLI.run(command='list-ns-osds', config=abm_config, extra_params=[namespace_key])
                        # Example output: [[0, [u'Active']], [3, [u'Active']]]
                        namespace_ready = True
                        for osd_info in list_ns_osds_output:
                            # If there are no osd_info records, uploading will fail, so that case is covered by the healthcheck
                            osd_state = osd_info[1][0]
                            if osd_state != 'Active':
                                namespace_ready = False
                        if namespace_ready is True:
                            break
                    result_handler.success('Namespace successfully created on proxy {0} with preset {1}!'.format(service.name, preset_name))
                    namespace_info = AlbaCLI.run(command='show-namespace', config=abm_config, extra_params=[namespace_key])
                    Toolbox.verify_required_params(required_params=namespace_params, actual_params=namespace_info)
                    result_handler.success('Namespace successfully fetched on proxy {0} with preset {1}!'.format(service.name, preset_name))
                    # Put a test object in the namespace
                    with open(AlbaHealthCheck.TEMP_FILE_LOC, 'wb') as output_file:
                        output_file.write(os.urandom(AlbaHealthCheck.TEMP_FILE_SIZE))
                    AlbaCLI.run(command='proxy-upload-object', named_params={'host': ip, 'port': service.ports[0]}, extra_params=[namespace_key, AlbaHealthCheck.TEMP_FILE_LOC, object_key])
                    result_handler.success('Successfully uploaded the object to namespace {0}'.format(namespace_key))
                    # Download the object
                    AlbaCLI.run(command='proxy-download-object', named_params={'host': ip, 'port': service.ports[0]}, extra_params=[namespace_key, object_key, AlbaHealthCheck.TEMP_FILE_FETCHED_LOC])
                    result_handler.success('Successfully downloaded the object from namespace {0}'.format(namespace_key))
                    # Check if the files exist - issue #57
                    if not (os.path.isfile(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC) and os.path.isfile(AlbaHealthCheck.TEMP_FILE_LOC)):
                        # Creation of the object failed
                        raise ObjectNotFoundException(ValueError('Creation of object has failed'))
                    hash_original = hashlib.md5(open(AlbaHealthCheck.TEMP_FILE_LOC, 'rb').read()).hexdigest()
                    hash_fetched = hashlib.md5(open(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC, 'rb').read()).hexdigest()
                    if hash_original == hash_fetched:
                        result_handler.success('Fetched object {0} from namespace {1} on proxy {2} with preset {3} matches the created object!'.format(object_key, namespace_key, service.name, preset_name))
                    else:
                        result_handler.failure('Fetched object {0} from namespace {1} on proxy {2} with preset {3} does not match the created object!'.format(object_key, namespace_key, service.name, preset_name))
                except ObjectNotFoundException as ex:
                    amount_of_presets_not_working.append(preset_name)
                    result_handler.failure('Putting object on namespace {0} failed on proxy {1} with preset {2} with error {3}'.format(namespace_key, service.name, preset_name, ex))
                except AlbaException as ex:
                    if ex.alba_command == 'proxy-create-namespace':
                        result_handler.failure('Create namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name))
                    elif ex.alba_command == 'show-namespace':
                        result_handler.failure('Show namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name))
                    elif ex.alba_command == 'proxy-upload-object':
                        result_handler.failure('Uploading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name))
                    elif ex.alba_command == 'proxy-download-object':
                        result_handler.failure('Downloading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'.format(str(ex), namespace_key, service.name, preset_name))
                finally:
                    # Delete the created namespace and preset
                    subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_LOC)], stdout=fnull, stderr=subprocess.STDOUT)
                    subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC)], stdout=fnull, stderr=subprocess.STDOUT)
                    namespaces = AlbaCLI.run(command='list-namespaces', config=abm_config)
                    namespaces_to_remove = []
                    proxy_named_params = {'host': ip, 'port': service.ports[0]}
                    for namespace in namespaces:
                        if namespace['name'].startswith(namespace_key_prefix):
                            namespaces_to_remove.append(namespace['name'])
                    for namespace_name in namespaces_to_remove:
                        if namespace_name == namespace_key:
                            result_handler.info('Deleting namespace {0}.'.format(namespace_name))
                        else:
                            result_handler.warning('Deleting namespace {0} which was left over from a previous run.'.format(namespace_name))
                        AlbaCLI.run(command='proxy-delete-namespace', named_params=proxy_named_params, extra_params=[namespace_name])
                        namespace_delete_start = time.time()
                        while True:
                            try:
                                AlbaCLI.run(command='show-namespace', config=abm_config, extra_params=[namespace_name])  # Will fail once the namespace no longer exists
                            except AlbaException:
                                result_handler.success('Namespace {0} successfully removed.'.format(namespace_name))
                                break
                            if time.time() - namespace_delete_start > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                raise RuntimeError('Namespace deletion has timed out after {0}s'.format(time.time() - namespace_delete_start))
                        # Be tidy and make the proxy forget the namespace
                        try:
                            AlbaCLI.run(command='proxy-statistics', named_params=proxy_named_params, extra_params=['--forget', namespace_name])
                        except Exception:
                            result_handler.warning('Failed to make proxy forget namespace {0}.'.format(namespace_name))
        except subprocess.CalledProcessError as ex:
            # This should stay for the deletion of the remaining files
            amount_of_presets_not_working.append(service.name)
            result_handler.failure('Proxy {0} has some problems. Got {1} as error'.format(service.name, ex))
        except ConfigNotMatchedException as ex:
            amount_of_presets_not_working.append(service.name)
            result_handler.failure('Proxy {0} has some problems. Got {1} as error'.format(service.name, ex))
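Stripped of the proxy plumbing, the object verification above reduces to upload, download and checksum comparison. A self-contained sketch where a local file copy stands in for the proxy round-trip:

import hashlib
import os
import shutil
import tempfile

def _md5(path):
    with open(path, 'rb') as handle:
        return hashlib.md5(handle.read()).hexdigest()

fd, source = tempfile.mkstemp()
fetched = source + '.fetched'
with os.fdopen(fd, 'wb') as handle:
    handle.write(os.urandom(1024 * 1024))  # 1 MiB random test object
shutil.copy(source, fetched)  # stands in for proxy-upload-object followed by proxy-download-object
assert _md5(source) == _md5(fetched), 'Fetched object does not match the created object'
for path in (source, fetched):
    os.remove(path)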
def add_preset(alba_backend_guid, name, compression, policies, encryption, fragment_size=None): """ Adds a preset to Alba :param alba_backend_guid: Guid of the ALBA backend :type alba_backend_guid: str :param name: Name of the preset :type name: str :param compression: Compression type for the preset (none | snappy | bz2) :type compression: str :param policies: Policies for the preset :type policies: list :param encryption: Encryption for the preset (none | aes-cbc-256 | aes-ctr-256) :type encryption: str :param fragment_size: Size of a fragment in bytes (e.g. 1048576) :type fragment_size: int :return: None """ # VALIDATIONS if not re.match(Toolbox.regex_preset, name): raise ValueError('Invalid preset name specified') compression_options = ['snappy', 'bz2', 'none'] if compression not in compression_options: raise ValueError('Invalid compression format specified, please choose from: "{0}"'.format('", "'.join(compression_options))) encryption_options = ['aes-cbc-256', 'aes-ctr-256', 'none'] if encryption not in encryption_options: raise ValueError('Invalid encryption format specified, please choose from: "{0}"'.format('", "'.join(encryption_options))) if fragment_size is not None and (not isinstance(fragment_size, int) or not 16 <= fragment_size <= 1024 ** 3): raise ValueError('Fragment size should be a positive integer smaller than 1 GiB') AlbaPresetController._validate_policies_param(policies=policies) alba_backend = AlbaBackend(alba_backend_guid) if name in [preset['name'] for preset in alba_backend.presets]: raise RuntimeError('Preset with name {0} already exists'.format(name)) # ADD PRESET preset = {'compression': compression, 'object_checksum': {'default': ['crc-32c'], 'verify_upload': True, 'allowed': [['none'], ['sha-1'], ['crc-32c']]}, 'osds': ['all'], 'fragment_size': 16 * 1024 ** 2 if fragment_size is None else int(fragment_size), 'policies': policies, 'fragment_checksum': ['crc-32c'], 'fragment_encryption': ['none'], 'in_use': False, 'name': name} # Generate encryption key temp_key_file = None if encryption != 'none': encryption_key = ''.join(random.choice(chr(random.randint(32, 126))) for _ in range(32)) temp_key_file = tempfile.mktemp() with open(temp_key_file, 'wb') as temp_file: temp_file.write(encryption_key) temp_file.flush() preset['fragment_encryption'] = ['{0}'.format(encryption), '{0}'.format(temp_key_file)] # Dump preset content on filesystem config = Configuration.get_configuration_path(ArakoonInstaller.CONFIG_KEY.format(AlbaController.get_abm_cluster_name(alba_backend=alba_backend))) temp_config_file = tempfile.mktemp() with open(temp_config_file, 'wb') as data_file: data_file.write(json.dumps(preset)) data_file.flush() # Create preset AlbaPresetController._logger.debug('Adding preset {0} with compression {1} and policies {2}'.format(name, compression, policies)) AlbaCLI.run(command='create-preset', config=config, named_params={'input-url': temp_config_file}, extra_params=[name]) # Cleanup alba_backend.invalidate_dynamics() for filename in [temp_key_file, temp_config_file]: if filename and os.path.exists(filename) and os.path.isfile(filename): os.remove(filename)
def check_nsm_load(cls, result_handler, max_load=None, use_total_capacity=False, total_capacity_warning=None, total_capacity_error=None):
    """
    Checks all NSM services registered within the Framework and reports their load
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :param max_load: Maximum load percentage before marking it as overloaded. Defaults to ovs/framework/plugins/alba/config|nsm.maxload
    :type max_load: float
    :param use_total_capacity: Base the NSM load on the total possible capacity (capacity of the NSMs before they are marked as overloaded) instead of checking the least filled NSM. Use the threshold arguments for tuning.
    :type use_total_capacity: bool
    :param total_capacity_warning: Number of remaining namespaces threshold before throwing a warning. Defaults to 20% of the total namespaces
    :type total_capacity_warning: int
    :param total_capacity_error: Number of remaining namespaces threshold before throwing an error. Defaults to 5% of the total namespaces
    :type total_capacity_error: int
    :return: None
    :rtype: NoneType
    """
    max_nsm_load_config = Configuration.get('ovs/framework/plugins/alba/config|nsm.maxload')
    max_load = max_load or max_nsm_load_config
    for alba_backend in AlbaBackendList.get_albabackends():
        if alba_backend.abm_cluster is None:
            result_handler.failure('No ABM cluster found for ALBA Backend {0}'.format(alba_backend.name))
            continue
        if len(alba_backend.abm_cluster.abm_services) == 0:
            result_handler.failure('ALBA Backend {0} does not have any registered ABM services'.format(alba_backend.name))
            continue
        if len(alba_backend.nsm_clusters) == 0:
            result_handler.failure('ALBA Backend {0} does not have any registered NSM services'.format(alba_backend.name))
            continue
        internal = alba_backend.abm_cluster.abm_services[0].service.is_internal
        if use_total_capacity:
            maximum_capacity_before_overload = AlbaHealthCheck._get_nsm_max_capacity_before_overload(alba_backend, max_nsm_load_config)
            total_capacity_warning = total_capacity_warning or math.ceil(maximum_capacity_before_overload * 1.0 / 5)
            total_capacity_error = total_capacity_error or math.ceil(maximum_capacity_before_overload * 1.0 / 20)
            config = Configuration.get_configuration_path(key=alba_backend.abm_cluster.config_location)
            hosts_data = AlbaCLI.run(command='list-nsm-hosts', config=config)
            current_capacity = sum([host['namespaces_count'] for host in hosts_data if not host['lost']])
            remaining_capacity = maximum_capacity_before_overload - current_capacity
            if remaining_capacity > total_capacity_warning and remaining_capacity > total_capacity_error:  # Only error could be specified
                result_handler.success('NSMs for backend {0} have enough capacity remaining ({1}/{2} used)'.format(alba_backend.name, current_capacity, maximum_capacity_before_overload),
                                       code=ErrorCodes.nsm_load_ok)
            elif total_capacity_warning >= remaining_capacity > total_capacity_error:
                result_handler.warning('NSMs for backend {0} have reached the warning threshold ({1} namespaces should remain available, {2}/{3} used)'.format(alba_backend.name, total_capacity_warning, current_capacity, maximum_capacity_before_overload),
                                       code=ErrorCodes.nsm_load_ok)
            else:
                result_handler.failure('NSMs for backend {0} have reached the error threshold ({1} namespaces should remain available, {2}/{3} used)'.format(alba_backend.name, total_capacity_error, current_capacity, maximum_capacity_before_overload),
                                       code=ErrorCodes.nsm_load_ok)
        else:
            nsm_loads = {}
            sorted_nsm_clusters = sorted(alba_backend.nsm_clusters, key=lambda k: k.number)
            for nsm_cluster in sorted_nsm_clusters:
                nsm_loads[nsm_cluster.number] = AlbaController.get_load(nsm_cluster)
            overloaded = min(nsm_loads.values()) >= max_load
            if overloaded is False:
                result_handler.success('NSMs for backend {0} are not overloaded'.format(alba_backend.name), code=ErrorCodes.nsm_load_ok)
            else:
                if internal is True:
                    result_handler.warning('NSMs for backend {0} are overloaded. The NSM checkup will take care of this'.format(alba_backend.name), code=ErrorCodes.nsm_load_warn)
                else:
                    result_handler.failure('NSMs for backend {0} are overloaded. Please add your own NSM clusters to the backend'.format(alba_backend.name), code=ErrorCodes.nsm_load_failure)
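In the total-capacity branch, backends are classified by how many namespaces can still be created before the NSMs overload; the warning and error thresholds default to 20% and 5% of that maximum. The same decision logic as a standalone sketch with made-up numbers:

import math

def classify_remaining(maximum, current, warning=None, error=None):
    """Return 'success', 'warning' or 'failure' for the remaining namespace capacity."""
    warning = warning if warning is not None else math.ceil(maximum / 5.0)   # default: 20% of the maximum
    error = error if error is not None else math.ceil(maximum / 20.0)        # default: 5% of the maximum
    remaining = maximum - current
    if remaining > warning and remaining > error:
        return 'success'
    if warning >= remaining > error:
        return 'warning'
    return 'failure'

assert classify_remaining(maximum=100, current=50) == 'success'
assert classify_remaining(maximum=100, current=85) == 'warning'
assert classify_remaining(maximum=100, current=99) == 'failure'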
def test_arakoon_collapse(self): """ Test the Arakoon collapse functionality """ # Set up the test structure = DalHelper.build_dal_structure( structure={'storagerouters': [1, 2]}) storagerouter_1 = structure['storagerouters'][1] storagerouter_2 = structure['storagerouters'][2] MockedSSHClient._run_returns[storagerouter_1.ip] = {} MockedSSHClient._run_returns[storagerouter_2.ip] = {} # Make sure we cover all Arakoon cluster types clusters_to_create = { ServiceType.ARAKOON_CLUSTER_TYPES.SD: [{ 'name': 'unittest-voldrv', 'internal': True, 'success': True }], ServiceType.ARAKOON_CLUSTER_TYPES.CFG: [{ 'name': 'unittest-cacc', 'internal': True, 'success': True }], ServiceType.ARAKOON_CLUSTER_TYPES.FWK: [{ 'name': 'unittest-ovsdb', 'internal': True, 'success': False }], ServiceType.ARAKOON_CLUSTER_TYPES.ABM: [{ 'name': 'unittest-cluster-1-abm', 'internal': True, 'success': False }, { 'name': 'unittest-random-abm-name', 'internal': False, 'success': True }], ServiceType.ARAKOON_CLUSTER_TYPES.NSM: [{ 'name': 'unittest-cluster-1-nsm_0', 'internal': True, 'success': True }] } self.assertEqual( first=sorted(clusters_to_create.keys()), second=sorted(ServiceType.ARAKOON_CLUSTER_TYPES.keys()), msg= 'An Arakoon cluster type has been removed or added, please update this test accordingly' ) # Create all Arakoon clusters and related services failed_clusters = [] external_clusters = [] successful_clusters = [] for cluster_type, cluster_infos in clusters_to_create.iteritems(): filesystem = cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.CFG for cluster_info in cluster_infos: internal = cluster_info['internal'] cluster_name = cluster_info['name'] base_dir = DalHelper.CLUSTER_DIR.format(cluster_name) arakoon_installer = ArakoonInstaller(cluster_name=cluster_name) arakoon_installer.create_cluster(cluster_type=cluster_type, ip=storagerouter_1.ip, base_dir=base_dir, internal=internal) arakoon_installer.start_cluster() arakoon_installer.extend_cluster(new_ip=storagerouter_2.ip, base_dir=base_dir) service_name = ArakoonInstaller.get_service_name_for_cluster( cluster_name=cluster_name) if cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.ABM: service_type = ServiceTypeList.get_by_name( ServiceType.SERVICE_TYPES.ALBA_MGR) elif cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.NSM: service_type = ServiceTypeList.get_by_name( ServiceType.SERVICE_TYPES.NS_MGR) else: service_type = ServiceTypeList.get_by_name( ServiceType.SERVICE_TYPES.ARAKOON) if internal is True: DalHelper.create_service( service_name=service_name, service_type=service_type, storagerouter=storagerouter_1, ports=arakoon_installer.ports[storagerouter_1.ip]) DalHelper.create_service( service_name=service_name, service_type=service_type, storagerouter=storagerouter_2, ports=arakoon_installer.ports[storagerouter_2.ip]) else: DalHelper.create_service(service_name=service_name, service_type=service_type) external_clusters.append(cluster_name) continue if cluster_info['success'] is True: if filesystem is True: config_path = ArakoonClusterConfig.CONFIG_FILE.format( cluster_name) else: config_path = Configuration.get_configuration_path( ArakoonClusterConfig.CONFIG_KEY.format( cluster_name)) MockedSSHClient._run_returns[storagerouter_1.ip][ 'arakoon --collapse-local 1 2 -config {0}'.format( config_path)] = None MockedSSHClient._run_returns[storagerouter_2.ip][ 'arakoon --collapse-local 2 2 -config {0}'.format( config_path)] = None successful_clusters.append(cluster_name) else: # For successful False clusters we don't emulate the collapse, thus making it fail 
failed_clusters.append(cluster_name) # Start collapse and make it fail for all clusters on StorageRouter 2 SSHClient._raise_exceptions[storagerouter_2.ip] = { 'users': ['ovs'], 'exception': UnableToConnectException('No route to host') } GenericController.collapse_arakoon() # Verify all log messages for each type of cluster generic_logs = Logger._logs.get('lib', {}) for cluster_name in successful_clusters + failed_clusters + external_clusters: collect_msg = ( 'DEBUG', 'Collecting info for cluster {0}'.format(cluster_name)) unreachable_msg = ( 'ERROR', 'Could not collapse any cluster on {0} (not reachable)'.format( storagerouter_2.name)) end_collapse_msg = ( 'DEBUG', 'Collapsing cluster {0} on {1} completed'.format( cluster_name, storagerouter_1.ip)) start_collapse_msg = ('DEBUG', 'Collapsing cluster {0} on {1}'.format( cluster_name, storagerouter_1.ip)) failed_collapse_msg = ( 'ERROR', 'Collapsing cluster {0} on {1} failed'.format( cluster_name, storagerouter_1.ip)) messages_to_validate = [] if cluster_name in successful_clusters: assert_function = self.assertIn messages_to_validate.append(collect_msg) messages_to_validate.append(unreachable_msg) messages_to_validate.append(start_collapse_msg) messages_to_validate.append(end_collapse_msg) elif cluster_name in failed_clusters: assert_function = self.assertIn messages_to_validate.append(collect_msg) messages_to_validate.append(unreachable_msg) messages_to_validate.append(start_collapse_msg) messages_to_validate.append(failed_collapse_msg) else: assert_function = self.assertNotIn messages_to_validate.append(collect_msg) messages_to_validate.append(start_collapse_msg) messages_to_validate.append(end_collapse_msg) for severity, message in messages_to_validate: if assert_function == self.assertIn: assert_message = 'Expected to find log message: {0}'.format( message) else: assert_message = 'Did not expect to find log message: {0}'.format( message) assert_function(member=message, container=generic_logs, msg=assert_message) if assert_function == self.assertIn: self.assertEqual( first=severity, second=generic_logs[message], msg='Log message {0} is of severity {1} expected {2}'. format(message, generic_logs[message], severity)) # Collapse should always have a 'finished' message since each cluster should be attempted to be collapsed for general_message in [ 'Arakoon collapse started', 'Arakoon collapse finished' ]: self.assertIn(member=general_message, container=generic_logs, msg='Expected to find log message: {0}'.format( general_message))
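The log-validation loop above hinges on picking the assertion function per expected outcome and running one shared message list through it. A compact, hypothetical unittest illustrating just that pattern:

import unittest

class LogAssertionDemo(unittest.TestCase):
    def test_expected_messages(self):
        logs = {'Collapsing cluster demo on 10.0.0.1 completed': 'DEBUG'}  # stand-in for Logger._logs.get('lib', {})
        expectations = [('Collapsing cluster demo on 10.0.0.1 completed', True),
                        ('Collapsing cluster other on 10.0.0.1 completed', False)]
        for message, should_be_present in expectations:
            # Select the assertion per expected outcome, exactly like the test above
            assert_function = self.assertIn if should_be_present else self.assertNotIn
            assert_function(member=message, container=logs, msg='Unexpected log presence for: {0}'.format(message))

if __name__ == '__main__':
    unittest.main()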
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (by reference)
    :type error_messages: list
    :return: None
    :rtype: NoneType
    """
    def _verify_mds_config(current_vdisk):
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task is executed by the 'ovs' user, which should be able to write in it
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Example scrub config:
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}
                # Example backend config:
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    try:
        # Empty the queue of vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                vdisk_guid = queue.get(False)
                try:
                    # Check whether the MDS master is local; trigger an MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because the MDS master is still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue
                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit, scratch_dir=scrub_directory, log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)], backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)
    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
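The backend config rewrite above only redirects the connection: host and port are pointed at the scrub proxy that was just deployed locally, and the result is stored as JSON under a separate key. A stdlib-only sketch with sample values:

import json

# Hypothetical backend_connection_manager content; real values come from
# the ovs/vpools/<vpool_guid>/hosts/<storagedriver_id>/config key
backend_config = {'alba_connection_host': '10.100.193.155',
                  'alba_connection_port': 26204,
                  'alba_connection_preset': 'preset',
                  'backend_type': 'ALBA'}
scrub_proxy_port = 56789  # hypothetical free port from the storagedriver range
backend_config['alba_connection_host'] = '127.0.0.1'
backend_config['alba_connection_port'] = scrub_proxy_port
print(json.dumps({'backend_connection_manager': backend_config}, indent=4))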
def _deploy_stack_and_scrub(queue, vpool, scrub_info, error_messages): """ Executes scrub work for a given vDisk queue and vPool, based on scrub_info :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool) :type queue: Queue :param vpool: the vPool object of the vDisks :type vpool: VPool :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub `storage_router` with the StorageRouter that needs to do the work :type scrub_info: dict :param error_messages: A list of error messages to be filled (by reference) :type error_messages: list :return: None :rtype: NoneType """ if len(vpool.storagedrivers ) == 0 or not vpool.storagedrivers[0].storagedriver_id: error_messages.append( 'vPool {0} does not have any valid StorageDrivers configured'. format(vpool.name)) return service_manager = ServiceFactory.get_manager() client = None lock_time = 5 * 60 storagerouter = scrub_info['storage_router'] partition_guid = scrub_info['partition_guid'] alba_proxy_service = 'ovs-albaproxy_{0}_{1}_{2}_scrub'.format( vpool.name, storagerouter.name, partition_guid) scrub_directory = '{0}/scrub_work_{1}_{2}'.format( scrub_info['scrub_path'], vpool.name, partition_guid) scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format( vpool.guid, partition_guid) backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format( vpool.guid, partition_guid) # Deploy a proxy try: with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time): GenericController._logger.info( 'Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}' .format(vpool.name, storagerouter.name, alba_proxy_service)) client = SSHClient(storagerouter, 'root') client.dir_create(scrub_directory) client.dir_chmod( scrub_directory, 0777 ) # Celery task executed by 'ovs' user and should be able to write in it if service_manager.has_service( name=alba_proxy_service, client=client ) is True and service_manager.get_service_status( name=alba_proxy_service, client=client) == 'active': GenericController._logger.info( 'Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}' .format(vpool.name, storagerouter.name, alba_proxy_service)) scrub_config = Configuration.get(scrub_config_key) else: machine_id = System.get_my_machine_id(client) port_range = Configuration.get( '/ovs/framework/hosts/{0}/ports|storagedriver'.format( machine_id)) with volatile_mutex('deploy_proxy_for_scrub_{0}'.format( storagerouter.guid), wait=30): port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0] scrub_config = Configuration.get( 'ovs/vpools/{0}/proxies/scrub/generic_scrub'.format( vpool.guid)) scrub_config['port'] = port scrub_config['transport'] = 'tcp' Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True) params = { 'VPOOL_NAME': vpool.name, 'LOG_SINK': LogHandler.get_sink_path(alba_proxy_service), 'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key) } service_manager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service) service_manager.start_service(name=alba_proxy_service, client=client) GenericController._logger.info( 'Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}' .format(vpool.name, storagerouter.name, alba_proxy_service)) backend_config = Configuration.get( 'ovs/vpools/{0}/hosts/{1}/config'.format( vpool.guid, vpool.storagedrivers[0].storagedriver_id ))['backend_connection_manager'] if 
backend_config.get('backend_type') != 'MULTI': backend_config['alba_connection_host'] = '127.0.0.1' backend_config['alba_connection_port'] = scrub_config[ 'port'] else: for value in backend_config.itervalues(): if isinstance(value, dict): value['alba_connection_host'] = '127.0.0.1' value['alba_connection_port'] = scrub_config[ 'port'] # Copy backend connection manager information in separate key Configuration.set( backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True) except Exception: message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format( vpool.name, storagerouter.name, alba_proxy_service) error_messages.append(message) GenericController._logger.exception(message) if client is not None and service_manager.has_service( name=alba_proxy_service, client=client) is True: if service_manager.get_service_status( name=alba_proxy_service, client=client) == 'active': service_manager.stop_service(name=alba_proxy_service, client=client) service_manager.remove_service(name=alba_proxy_service, client=client) if Configuration.exists(scrub_config_key): Configuration.delete(scrub_config_key) # Execute the actual scrubbing threads = [] threads_key = '/ovs/framework/hosts/{0}/config|scrub_stack_threads'.format( storagerouter.machine_id) amount_threads = Configuration.get( key=threads_key) if Configuration.exists(key=threads_key) else 2 if not isinstance(amount_threads, int): error_messages.append( 'Amount of threads to spawn must be an integer for StorageRouter with ID {0}' .format(storagerouter.machine_id)) return amount_threads = max(amount_threads, 1) # Make sure amount_threads is at least 1 amount_threads = min(min(queue.qsize(), amount_threads), 20) # Make sure amount threads is max 20 GenericController._logger.info( 'Scrubber - vPool {0} - StorageRouter {1} - Spawning {2} threads for proxy service {3}' .format(vpool.name, storagerouter.name, amount_threads, alba_proxy_service)) for index in range(amount_threads): thread = Thread(name='execute_scrub_{0}_{1}_{2}'.format( vpool.guid, partition_guid, index), target=GenericController._execute_scrub, args=(queue, vpool, scrub_info, scrub_directory, error_messages)) thread.start() threads.append(thread) for thread in threads: thread.join() # Delete the proxy again try: with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time): GenericController._logger.info( 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}' .format(vpool.name, storagerouter.name, alba_proxy_service)) client = SSHClient(storagerouter, 'root') client.dir_delete(scrub_directory) if service_manager.has_service(alba_proxy_service, client=client): service_manager.stop_service(alba_proxy_service, client=client) service_manager.remove_service(alba_proxy_service, client=client) if Configuration.exists(scrub_config_key): Configuration.delete(scrub_config_key) GenericController._logger.info( 'Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}' .format(vpool.name, storagerouter.name, alba_proxy_service)) except Exception: message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format( vpool.name, storagerouter.name, alba_proxy_service) error_messages.append(message) GenericController._logger.exception(message)
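The thread fan-out above clamps the configured thread count to at least 1, at most 20, and never more than the number of queued vDisks. The same clamping as a standalone helper with illustrative assertions:

def clamp_thread_count(configured, queued_items, minimum=1, maximum=20):
    """Mirror the scrub thread clamping: respect the configured value within [minimum, maximum] and the queue size."""
    amount = max(configured, minimum)
    return min(min(queued_items, amount), maximum)

assert clamp_thread_count(configured=2, queued_items=50) == 2    # configured value wins
assert clamp_thread_count(configured=0, queued_items=50) == 1    # never fewer than 1 thread
assert clamp_thread_count(configured=64, queued_items=50) == 20  # never more than 20 threads
assert clamp_thread_count(configured=8, queued_items=3) == 3     # never more threads than queued vDisks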