def _fd_worker(queue, clients, result_handler, service_manager):
    """
    Worker method to retrieve file descriptors
    Every queue item is a tuple (cluster_name, node_config, results). The lines reported by 'lsof' for the PID
    of the Arakoon service (header line stripped) are stored under results['result']['fds']
    A failure while resolving the service or running 'lsof' is appended to results['errors'] as ('lsof', exception)
    Any failure is logged as a warning and never aborts the worker
    :param queue: Queue to use
    :param clients: SSHClients to choose from (looked up by the IP of the node config)
    :param result_handler: Logging object
    :param service_manager: Service manager instance
    :return: None
    :rtype: NoneType
    """
    # Imported locally: the 'queue' argument shadows the module name and the module is called 'Queue' on Python 2
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    while True:
        try:
            # Checking 'empty()' first is racy when several workers share the queue, so rely on the exception instead
            cluster_name, _node_config, _results = queue.get(False)
        except Empty:
            break
        errors = _results['errors']
        output = _results['result']
        identifier = 'Arakoon cluster {0} on node {1}'.format(cluster_name, _node_config.ip)
        result_handler.info('Retrieving file descriptor information for {0}'.format(identifier), add_to_result=False)
        try:
            client = clients[_node_config.ip]
            try:
                # Handle config Arakoon
                cluster_name = cluster_name if cluster_name != 'cacc' else 'config'
                service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name)
                pid = service_manager.get_service_pid(service_name, client)
                # First line of the lsof output is the column header
                file_descriptors = client.run(['lsof', '-i', '-a', '-p', pid]).splitlines()[1:]
            except Exception as _ex:
                errors.append(('lsof', _ex))
                raise
            output['fds'] = file_descriptors
        except Exception as _ex:
            result_handler.warning('Could not retrieve the file descriptor information for {0} ({1})'.format(identifier, str(_ex)), add_to_result=False)
        finally:
            # Always mark the fetched item as handled so a 'queue.join()' cannot hang
            queue.task_done()
def ensure_s3_transaction_safety(cls, s3_cluster, available_storagerouters, s3_installer=None):
    # type: (S3TransactionCluster, Dict[StorageRouter, DiskPartition], Optional[S3TransactionInstaller]) -> None
    """
    Extend the S3 transaction cluster onto every available StorageRouter which does not host one of its services yet
    Nothing is done when the cluster has no services, when it has at least as many services as there are
    available StorageRouters or when its Arakoon metadata does not flag the cluster as internal
    :param s3_cluster: S3 transaction cluster object
    :type s3_cluster: S3TransactionCluster
    :param available_storagerouters: All available storagerouters mapped with their DB partition
    :type available_storagerouters: Dict[StorageRouter, DiskPartition]
    :param s3_installer: The S3TransactionInstaller to use. Defaults to creating a new one
    :type s3_installer: S3TransactionInstaller
    :return: None
    :rtype: NoneType
    """
    installer = s3_installer or S3TransactionInstaller()
    metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=s3_cluster.name)

    service_count = len(s3_cluster.s3_transaction_services)
    if not 0 < service_count < len(available_storagerouters):
        return
    if metadata['internal'] is not True:
        return

    # IPs which already host a service of this cluster, extended while new services get added
    covered_ips = [junction.service.storagerouter.ip for junction in s3_cluster.s3_transaction_services]
    for storagerouter, _ in available_storagerouters.iteritems():
        if storagerouter.ip not in covered_ips:
            installer.extend_s3_cluster(storagerouter, s3_cluster)
            covered_ips.append(storagerouter.ip)
def _get_arakoon_clusters(cls, result_handler):
    """
    Retrieves all Arakoon clusters registered in this OVSCluster
    Clusters for which no client could be built are reported through the result handler and left out of the result
    :param result_handler: Logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: Dict with the Arakoon cluster types as key and list with dicts which contain cluster names, pyrakoon clients and configs
    :rtype: dict(str, list[dict])
    """
    result_handler.info('Fetching available arakoon clusters.', add_to_result=False)
    arakoon_clusters = {}
    for cluster_name in list(Configuration.list('/ovs/arakoon')) + ['cacc']:
        # Determine Arakoon type
        is_cacc = cluster_name == 'cacc'
        arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name, load_config=not is_cacc)
        if is_cacc is True:
            # The 'cacc' config is not loaded by the config object itself but read from a local file
            with open(Configuration.CACC_LOCATION) as config_file:
                contents = config_file.read()
            arakoon_config.read_config(contents=contents)
        try:
            arakoon_client = ArakoonInstaller.build_client(arakoon_config)
        except (ArakoonNoMaster, ArakoonNoMasterResult) as ex:
            result_handler.failure('Unable to find a master for Arakoon cluster {0}. (Message: {1})'.format(cluster_name, str(ex)),
                                   code=ErrorCodes.master_none)
            # No client was built: skip the cluster instead of using an unbound client or the one of the previous cluster
            continue
        except Exception as ex:
            msg = 'Unable to connect to Arakoon cluster {0}. (Message: {1})'.format(cluster_name, str(ex))
            result_handler.exception(msg, code=ErrorCodes.unhandled_exception)
            cls.logger.exception(msg)
            continue

        metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY))
        cluster_type = metadata['cluster_type']
        arakoon_clusters.setdefault(cluster_type, []).append({'cluster_name': cluster_name,
                                                              'client': arakoon_client,
                                                              'config': arakoon_config})
    return arakoon_clusters
def _get_arakoon_clusters(cls, result_handler):
    """
    Retrieves all Arakoon clusters registered in this OVSCluster
    A cluster is skipped (after reporting it through the result handler) when building its client fails
    :param result_handler: Logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: Dict with the Arakoon cluster types as key and list with dicts which contain cluster names, pyrakoon clients and configs
    :rtype: dict(str, list[dict])
    """
    result_handler.info('Fetching available arakoon clusters.', add_to_result=False)
    arakoon_clusters = {}
    for cluster_name in list(Configuration.list('/ovs/arakoon')) + ['cacc']:
        # Determine Arakoon type
        is_cacc = cluster_name == 'cacc'
        arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name, load_config=not is_cacc)
        if is_cacc is True:
            # For 'cacc' the config contents come from a local file rather than from the config object's own loading
            with open(Configuration.CACC_LOCATION) as config_file:
                contents = config_file.read()
            arakoon_config.read_config(contents=contents)

        try:
            arakoon_client = ArakoonInstaller.build_client(arakoon_config)
        except (ArakoonNoMaster, ArakoonNoMasterResult) as ex:
            result_handler.failure(
                'Unable to find a master for Arakoon cluster {0}. (Message: {1})'.format(cluster_name, str(ex)),
                code=ErrorCodes.master_none)
            # Falling through would use an unbound client (first cluster) or the client of the previous cluster
            continue
        except Exception as ex:
            msg = 'Unable to connect to Arakoon cluster {0}. (Message: {1})'.format(cluster_name, str(ex))
            result_handler.exception(msg, code=ErrorCodes.unhandled_exception)
            cls.logger.exception(msg)
            continue

        metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY))
        cluster_type = metadata['cluster_type']
        if cluster_type not in arakoon_clusters:
            arakoon_clusters[cluster_type] = []
        arakoon_clusters[cluster_type].append({
            'cluster_name': cluster_name,
            'client': arakoon_client,
            'config': arakoon_config
        })
    return arakoon_clusters
def _on_demote(cluster_ip, master_ip, offline_node_ips=None):
    """
    Handles the demote for the StorageDrivers
    When the node being demoted hosts an internal 'arakoon-voldrv' service, the voldrv Arakoon cluster is shrunk,
    restarted, the service is removed from the model and the volumedriver Arakoon configuration is refreshed
    :param cluster_ip: IP of the node to demote
    :type cluster_ip: str
    :param master_ip: IP of the master node
    :type master_ip: str
    :param offline_node_ips: IPs of nodes which are offline
    :type offline_node_ips: list
    :raises RuntimeError: When no other online node hosts the voldrv cluster
    :return: None
    """
    _ = master_ip  # Not used by this handler
    offline_ips = [] if offline_node_ips is None else offline_node_ips

    arakoon_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
    service_to_remove = None
    surviving_ips = []
    for service in arakoon_type.services:
        # Externally managed arakoon cluster services do not have StorageRouters
        if service.name != 'arakoon-voldrv' or service.is_internal is not True:
            continue
        if service.storagerouter.ip == cluster_ip:
            service_to_remove = service
        elif service.storagerouter.ip not in offline_ips:
            surviving_ips.append(service.storagerouter.ip)

    if service_to_remove is None:
        return
    if not surviving_ips:
        raise RuntimeError('Could not find any remaining arakoon nodes for the voldrv cluster')

    StorageDriverController._logger.debug('* Shrink StorageDriver cluster')
    cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|voldrv'))
    installer = ArakoonInstaller(cluster_name=cluster_name)
    installer.load()
    installer.shrink_cluster(removal_ip=cluster_ip, offline_nodes=offline_ips)
    installer.restart_cluster_after_shrinking()
    service_to_remove.delete()
    StorageDriverController._configure_arakoon_to_volumedriver(cluster_name=cluster_name)
def _fd_worker(queue, clients, result_handler, service_manager):
    """
    Worker method to retrieve file descriptors
    Processes (cluster_name, node_config, results) tuples until the queue is drained. The 'lsof' output lines
    (without the header line) for the PID of the Arakoon service end up in results['result']['fds']
    Failures while resolving the service or running 'lsof' are appended to results['errors'] as ('lsof', exception)
    Every failure is logged as a warning, the worker itself keeps going
    :param queue: Queue to use
    :param clients: SSHClients to choose from (looked up by the IP of the node config)
    :param result_handler: Logging object
    :param service_manager: Service manager instance
    :return: None
    :rtype: NoneType
    """
    # Local import: the module name is shadowed by the 'queue' argument and is spelled 'Queue' on Python 2
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    while True:
        try:
            # 'empty()' followed by a non-blocking 'get' is racy with multiple workers; an empty queue ends the loop
            cluster_name, _node_config, _results = queue.get(False)
        except Empty:
            return
        errors = _results['errors']
        output = _results['result']
        identifier = 'Arakoon cluster {0} on node {1}'.format(
            cluster_name, _node_config.ip)
        result_handler.info(
            'Retrieving file descriptor information for {0}'.format(identifier),
            add_to_result=False)
        try:
            client = clients[_node_config.ip]
            try:
                # Handle config Arakoon
                cluster_name = cluster_name if cluster_name != 'cacc' else 'config'
                service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name)
                pid = service_manager.get_service_pid(service_name, client)
                # Skip the header line of the lsof output
                file_descriptors = client.run(
                    ['lsof', '-i', '-a', '-p', pid]).splitlines()[1:]
            except Exception as _ex:
                errors.append(('lsof', _ex))
                raise
            output['fds'] = file_descriptors
        except Exception as _ex:
            result_handler.warning(
                'Could not retrieve the file descriptor information for {0} ({1})'
                .format(identifier, str(_ex)),
                add_to_result=False)
        finally:
            # The fetched item is always marked as done, also on failure
            queue.task_done()
def manual_alba_arakoon_checkup(alba_backend_guid, nsm_clusters, abm_cluster=None):
    # type: (str, List[str], Optional[str]) -> Union[bool, None]
    """
    Creates a new Arakoon cluster if required and extends cluster if possible on all available master nodes
    An ABM cluster and NSM clusters must either both be provided or both be omitted
    :param alba_backend_guid: Guid of the ALBA Backend
    :type alba_backend_guid: str
    :param nsm_clusters: NSM clusters for this ALBA Backend
                         The code will claim the Arakoon clusters for this backend when provided
    :type nsm_clusters: list[str]
    :param abm_cluster: ABM cluster for this ALBA Backend
                        The code will claim the Arakoon cluster for this backend when provided
    :type abm_cluster: str|None
    :raises ValueError: When only one of both cluster arguments is provided, when a cluster cannot be found
                        or when a cluster has already been claimed
    :return: True if task completed, None if task was discarded (by decorator)
    :rtype: bool|None
    """
    abm_provided = abm_cluster is not None
    nsm_provided = len(nsm_clusters) > 0
    if abm_provided != nsm_provided:
        raise ValueError('Both ABM cluster and NSM clusters must be provided')

    if abm_provided:
        # Check if the requested clusters are available
        requested_clusters = [abm_cluster] + nsm_clusters
        for cluster_name in requested_clusters:
            try:
                metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
                if metadata['in_use'] is True:
                    raise ValueError('Cluster {0} has already been claimed'.format(cluster_name))
            except NotFoundException:
                raise ValueError('Could not find an Arakoon cluster with name: {0}'.format(cluster_name))

    AlbaArakoonController._alba_arakoon_checkup(alba_backend_guid=alba_backend_guid,
                                                abm_cluster=abm_cluster,
                                                nsm_clusters=nsm_clusters)
    return True
def remove_arakoon_cluster(cluster_name, master_storagerouter_ip):
    """
    Remove an entire Arakoon cluster
    :param cluster_name: Name of an existing Arakoon cluster
    :type cluster_name: str
    :param master_storagerouter_ip: Master IP address of an existing Arakoon cluster (only used in the log messages)
    :type master_storagerouter_ip: str
    """
    logger = ArakoonRemover.LOGGER
    start_message = "Starting removing arakoon cluster with name `{0}`, master_ip `{1}`".format(cluster_name, master_storagerouter_ip)
    logger.info(start_message)

    # Load the existing cluster before deleting it
    installer = ArakoonInstaller(cluster_name)
    installer.load()
    installer.delete_cluster()

    end_message = "Finished removing arakoon cluster with name `{0}`, master_ip `{1}`".format(cluster_name, master_storagerouter_ip)
    logger.info(end_message)
def test_node_config_checkup(self):
    """
    Validates correct working of cluster registry checkup
    Three runs are verified: the initial run, an identical second run and a run after moving a StorageRouter
    to another domain while 'server_revision' raises for the config of host 1
    """
    def _node_template(node_id):
        # The expected node configs only differ in the last octet of their hosts
        return {'vrouter_id': node_id,
                'message_host': '10.0.1.{0}'.format(node_id),
                'message_port': 1,
                'xmlrpc_host': '10.0.0.{0}'.format(node_id),
                'xmlrpc_port': 2,
                'failovercache_host': '10.0.1.{0}'.format(node_id),
                'failovercache_port': 3,
                'network_server_uri': 'tcp://10.0.1.{0}:4'.format(node_id),
                'node_distance_map': None}

    base_structure = dict((node_id, _node_template(node_id)) for node_id in ('1', '2'))

    def _validate_node_config(_config, _expected_map):
        expected = copy.deepcopy(base_structure[_config.vrouter_id])
        expected['node_distance_map'] = _expected_map[_config.vrouter_id]
        actual = dict((field, getattr(_config, field)) for field in expected)
        self.assertDictEqual(expected, actual)

    def _run_and_validate(expect_changes, expected_recordings, distance):
        # Start from clean recordings, run the checkup and verify result, recordings and stored node configs
        StorageRouterClient.node_config_recordings = []
        outcome = StorageDriverController.cluster_registry_checkup()
        self.assertDictEqual(outcome, {vpool.guid: {'success': True, 'changes': expect_changes}})
        self.assertListEqual(sorted(StorageRouterClient.node_config_recordings), expected_recordings)
        distance_map = {'1': {'2': distance},
                        '2': {'1': distance}}
        for node_config in vpool.clusterregistry_client.get_node_configs():
            _validate_node_config(node_config, distance_map)

    structure = DalHelper.build_dal_structure(
        {'vpools': [1],
         'domains': [1, 2],
         'storagerouters': [1, 2],
         'storagedrivers': [(1, 1, 1), (2, 1, 2)],  # (<id>, <vpool_id>, <storagerouter_id>)
         'storagerouter_domains': [(1, 1, 1, False), (2, 2, 1, False)]}  # (<id>, <storagerouter_id>, <domain_id>, <backup>)
    )
    storagerouters = structure['storagerouters']
    vpool = structure['vpools'][1]

    installer = ArakoonInstaller(cluster_name='voldrv')
    installer.create_cluster(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                             ip=storagerouters[1].ip,
                             base_dir='/tmp')

    # Initial run, it will now be configured
    _run_and_validate(True, ['1', '2'], StorageDriver.DISTANCES.NEAR)

    # Running it again should not change anything
    _run_and_validate(False, [], StorageDriver.DISTANCES.NEAR)

    # Validate some error paths
    junction = storagerouters[1].domains[0]
    junction.domain = structure['domains'][2]
    junction.save()
    vpool_config_path = 'file://opt/OpenvStorage/config/framework.json?key=/ovs/vpools/{0}/hosts/1/config'.format(vpool.guid)
    StorageRouterClient.exceptions['server_revision'] = {vpool_config_path: Exception('ClusterNotReachableException')}
    _run_and_validate(True, ['2'], StorageDriver.DISTANCES.INFINITE)
def _get_update_information_cluster_alba(cls, client, update_info, package_info):
    """
    In this function the services for each component / package combination are defined
    This service information consists out of:
        * Services to stop (before update) and start (after update of packages) -> 'services_stop_start'
        * Services to restart after update (post-update logic)                  -> 'services_post_update'
        * Down-times which will be caused due to service restarts               -> 'downtime'
        * Prerequisites that have not been met                                  -> 'prerequisites'

    Verify whether all relevant services have the correct binary active
    Whether a service has the correct binary version in use, we use the ServiceFactory.get_service_update_versions functionality
    When a service has an older binary version running, we add this information to the 'update_info'

    This combined information is then stored in the 'package_information' of the StorageRouter DAL object

    Any exception raised while gathering the information is logged and appended to update_info[client.ip]['errors']
    instead of being raised to the caller
    :param client: SSHClient on which to retrieve the service information required for an update
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param update_info: Dictionary passed in by the thread calling this function used to store all update information
    :type update_info: dict
    :param package_info: Dictionary containing the components and packages which have an update available for current SSHClient
    :type package_info: dict
    :return: None
    :rtype: NoneType
    """
    cls._logger.info('StorageRouter {0}: Refreshing ALBA update information'.format(client.ip))
    try:
        binaries = cls._package_manager.get_binary_versions(client=client)
        storagerouter = StorageRouterList.get_by_ip(ip=client.ip)
        cls._logger.debug('StorageRouter {0}: Binary versions: {1}'.format(client.ip, binaries))

        # Retrieve Arakoon information
        # Maps the service name of every internal ABM / NSM Arakoon cluster to its downtime entry (None when no downtime)
        arakoon_info = {}
        for service in storagerouter.services:
            if service.type.name not in [ServiceType.SERVICE_TYPES.ALBA_MGR, ServiceType.SERVICE_TYPES.NS_MGR]:
                continue

            if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
                cluster_name = service.abm_service.abm_cluster.name
                alba_backend_name = service.abm_service.abm_cluster.alba_backend.name
            else:
                cluster_name = service.nsm_service.nsm_cluster.name
                alba_backend_name = service.nsm_service.nsm_cluster.alba_backend.name

            cls._logger.debug('StorageRouter {0}: Retrieving update information for Arakoon cluster {1}'.format(client.ip, cluster_name))
            arakoon_update_info = ArakoonInstaller.get_arakoon_update_info(cluster_name=cluster_name)
            cls._logger.debug('StorageRouter {0}: Arakoon update information for cluster {1}: {2}'.format(client.ip, cluster_name, arakoon_update_info))
            if arakoon_update_info['internal'] is True:
                arakoon_info[arakoon_update_info['service_name']] = ['backend', alba_backend_name] if arakoon_update_info['downtime'] is True else None

        for component, package_names in PackageFactory.get_package_info()['names'].iteritems():
            package_names = sorted(package_names)
            cls._logger.debug('StorageRouter {0}: Validating component {1} and related packages: {2}'.format(client.ip, component, package_names))

            if component not in update_info[client.ip]:
                update_info[client.ip][component] = copy.deepcopy(ServiceFactory.DEFAULT_UPDATE_ENTRY)
            svc_component_info = update_info[client.ip][component]
            pkg_component_info = package_info.get(component, {})

            for package_name in package_names:
                cls._logger.debug('StorageRouter {0}: Validating ALBA plugin related package {1}'.format(client.ip, package_name))
                if package_name == PackageFactory.PKG_OVS_BACKEND and package_name in pkg_component_info:
                    # An update of the backend package restarts the framework services, which takes down GUI and API
                    if ['gui', None] not in svc_component_info['downtime']:
                        svc_component_info['downtime'].append(['gui', None])
                    if ['api', None] not in svc_component_info['downtime']:
                        svc_component_info['downtime'].append(['api', None])
                    svc_component_info['services_stop_start'][10].append('ovs-watcher-framework')
                    svc_component_info['services_stop_start'][20].append('memcached')
                    cls._logger.debug('StorageRouter {0}: Added services "ovs-watcher-framework" and "memcached" to stop-start services'.format(client.ip))
                    cls._logger.debug('StorageRouter {0}: Added GUI and API to downtime'.format(client.ip))

                elif package_name in [PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE]:
                    # Retrieve proxy service information
                    for service in storagerouter.services:
                        if service.type.name != ServiceType.SERVICE_TYPES.ALBA_PROXY or service.alba_proxy is None:
                            continue

                        # The running version is only looked up when the package itself has no update available
                        service_version = None
                        if package_name not in pkg_component_info:
                            service_version = ServiceFactory.get_service_update_versions(client=client,
                                                                                         service_name=service.name,
                                                                                         binary_versions=binaries)

                        cls._logger.debug('StorageRouter {0}: Service {1} is running version {2}'.format(client.ip, service.name, service_version))
                        if package_name in pkg_component_info or service_version is not None:
                            if service_version is not None and package_name not in svc_component_info['packages']:
                                svc_component_info['packages'][package_name] = service_version
                            svc_component_info['services_post_update'][10].append('ovs-{0}'.format(service.name))
                            cls._logger.debug('StorageRouter {0}: Added service {1} to post-update services'.format(client.ip, 'ovs-{0}'.format(service.name)))

                            downtime = ['proxy', service.alba_proxy.storagedriver.vpool.name]
                            if downtime not in svc_component_info['downtime']:
                                svc_component_info['downtime'].append(downtime)
                                cls._logger.debug('StorageRouter {0}: Added ALBA proxy downtime for vPool {1} to downtime'.format(client.ip, service.alba_proxy.storagedriver.vpool.name))

                if package_name in [PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE, PackageFactory.PKG_ARAKOON]:
                    for service_name, downtime in arakoon_info.iteritems():
                        service_version = ServiceFactory.get_service_update_versions(client=client,
                                                                                     service_name=service_name,
                                                                                     binary_versions=binaries,
                                                                                     package_name=package_name)
                        cls._logger.debug('StorageRouter {0}: Arakoon service {1} information: {2}'.format(client.ip, service_name, service_version))

                        if package_name in pkg_component_info or service_version is not None:
                            svc_component_info['services_post_update'][10].append('ovs-{0}'.format(service_name))
                            cls._logger.debug('StorageRouter {0}: Added service {1} to post-update services'.format(client.ip, 'ovs-{0}'.format(service_name)))
                            if service_version is not None and package_name not in svc_component_info['packages']:
                                svc_component_info['packages'][package_name] = service_version
                            if downtime is not None and downtime not in svc_component_info['downtime']:
                                svc_component_info['downtime'].append(downtime)
                                cls._logger.debug('StorageRouter {0}: Added Arakoon cluster for ALBA Backend {1} to downtime'.format(client.ip, downtime[1]))

                # Extend the service information with the package information related to this repository for current StorageRouter
                if package_name in pkg_component_info and package_name not in svc_component_info['packages']:
                    cls._logger.debug('StorageRouter {0}: Adding package {1} because it has an update available'.format(client.ip, package_name))
                    svc_component_info['packages'][package_name] = pkg_component_info[package_name]

            if component == PackageFactory.COMP_ALBA:
                for alba_node in AlbaNodeList.get_albanodes():
                    try:
                        alba_node.client.get_metadata()
                    except Exception:
                        # Only real errors mark a node as unresponsive; KeyboardInterrupt / SystemExit must propagate
                        svc_component_info['prerequisites'].append(['alba_node_unresponsive', alba_node.ip])
                        cls._logger.debug('StorageRouter {0}: Added unresponsive ALBA Node {1} to prerequisites'.format(client.ip, alba_node.ip))

            # Verify whether migration (DAL and extension) code needs to be executed (only if no packages have an update available so far)
            elif component == PackageFactory.COMP_FWK and PackageFactory.PKG_OVS_BACKEND not in svc_component_info['packages']:
                cls._logger.debug('StorageRouter {0}: No updates detected, checking for required migrations'.format(client.ip))
                # Extension migration check
                key = '/ovs/framework/hosts/{0}/versions'.format(System.get_my_machine_id(client=client))
                old_version = Configuration.get(key, default={}).get(PackageFactory.COMP_MIGRATION_ALBA)
                installed_version = str(cls._package_manager.get_installed_versions(client=client, package_names=[PackageFactory.PKG_OVS_BACKEND])[PackageFactory.PKG_OVS_BACKEND])
                migrations_detected = False
                if old_version is not None:
                    cls._logger.debug('StorageRouter {0}: Current running version for {1} extension migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, old_version))
                    with remote(client.ip, [ExtensionMigrator]) as rem:
                        cls._logger.debug('StorageRouter {0}: Available version for {1} extension migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, rem.ExtensionMigrator.THIS_VERSION))
                        if rem.ExtensionMigrator.THIS_VERSION > old_version:
                            migrations_detected = True
                            # Pending migrations are reported as a pseudo update of the backend package
                            svc_component_info['packages'][PackageFactory.PKG_OVS_BACKEND] = {'installed': 'migrations',
                                                                                              'candidate': installed_version}

                # DAL migration check
                if migrations_detected is False:
                    persistent_client = PersistentFactory.get_client()
                    old_version = persistent_client.get('ovs_model_version').get(PackageFactory.COMP_MIGRATION_ALBA) if persistent_client.exists('ovs_model_version') else None
                    if old_version is not None:
                        cls._logger.debug('StorageRouter {0}: Current running version for {1} DAL migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, old_version))
                        with remote(client.ip, [DALMigrator]) as rem:
                            cls._logger.debug('StorageRouter {0}: Available version for {1} DAL migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, rem.DALMigrator.THIS_VERSION))
                            if rem.DALMigrator.THIS_VERSION > old_version:
                                svc_component_info['packages'][PackageFactory.PKG_OVS_BACKEND] = {'installed': 'migrations',
                                                                                                  'candidate': installed_version}
        cls._logger.info('StorageRouter {0}: Refreshed ALBA update information'.format(client.ip))
    except Exception as ex:
        cls._logger.exception('StorageRouter {0}: Refreshing ALBA update information failed'.format(client.ip))
        if 'errors' not in update_info[client.ip]:
            update_info[client.ip]['errors'] = []
        update_info[client.ip]['errors'].append(ex)
def migrate(previous_version):
    """
    Migrates from a given version to the current version. It uses 'previous_version' to be smart
    wherever possible, but the code should be able to migrate any version towards the expected version.
    When this is not possible, the code can set a minimum version and raise when it is not met.
    Version 0 only adds the base model entries (backend type and service types), any other version below
    DALMigrator.THIS_VERSION runs the actual migrations
    :param previous_version: The previous version from which to start the migration
    :type previous_version: float
    :return: DALMigrator.THIS_VERSION
    """

    working_version = previous_version

    if working_version == 0:
        from ovs.dal.hybrids.servicetype import ServiceType
        # Initial version:
        # * Add any basic configuration or model entries

        # Add backends
        for backend_type_info in [('ALBA', 'alba')]:
            code = backend_type_info[1]
            backend_type = BackendTypeList.get_backend_type_by_code(code)
            # Re-use the backend type when one with this code is already modelled
            if backend_type is None:
                backend_type = BackendType()
            backend_type.name = backend_type_info[0]
            backend_type.code = code
            backend_type.save()

        # Add service types
        for service_type_info in [
                ServiceType.SERVICE_TYPES.NS_MGR,
                ServiceType.SERVICE_TYPES.ALBA_MGR,
                ServiceType.SERVICE_TYPES.ALBA_S3_TRANSACTION
        ]:
            service_type = ServiceType()
            service_type.name = service_type_info
            service_type.save()

    # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
    elif working_version < DALMigrator.THIS_VERSION:
        import hashlib
        from ovs.dal.exceptions import ObjectNotFoundException
        from ovs.dal.helpers import HybridRunner, Descriptor
        from ovs.dal.hybrids.albaabmcluster import ABMCluster
        from ovs.dal.hybrids.albaosd import AlbaOSD
        from ovs.dal.hybrids.albansmcluster import NSMCluster
        from ovs.dal.hybrids.j_abmservice import ABMService
        from ovs.dal.hybrids.j_nsmservice import NSMService
        from ovs.dal.hybrids.service import Service
        from ovs.dal.hybrids.servicetype import ServiceType
        from ovs.dal.lists.albabackendlist import AlbaBackendList
        from ovs.dal.lists.albanodelist import AlbaNodeList
        from ovs.dal.lists.servicetypelist import ServiceTypeList
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.extensions.db.arakooninstaller import ArakoonClusterConfig, ArakoonInstaller
        from ovs.extensions.generic.configuration import Configuration, NotFoundException
        from ovs_extensions.generic.toolbox import ExtensionsToolbox
        from ovs.extensions.plugins.albacli import AlbaCLI
        from ovs.extensions.storage.persistentfactory import PersistentFactory

        # Migrate unique constraints & indexes
        client = PersistentFactory.get_client()
        hybrid_structure = HybridRunner.get_hybrids()
        for class_descriptor in hybrid_structure.values():
            cls = Descriptor().load(class_descriptor).get_object()
            classname = cls.__name__.lower()
            # Key templates: the class name is filled in here, the property name (and value hash) later on
            unique_key = 'ovs_unique_{0}_{{0}}_'.format(classname)
            index_prefix = 'ovs_index_{0}|{{0}}|'.format(classname)
            index_key = 'ovs_index_{0}|{{0}}|{{1}}'.format(classname)
            uniques = []
            indexes = []
            # Collect the unique / indexed properties for which no key is stored under their prefix yet
            # noinspection PyProtectedMember
            for prop in cls._properties:
                if prop.unique is True and len([
                        k for k in client.prefix(unique_key.format(prop.name))
                ]) == 0:
                    uniques.append(prop.name)
                if prop.indexed is True and len([
                        k for k in client.prefix(index_prefix.format(prop.name))
                ]) == 0:
                    indexes.append(prop.name)
            if len(uniques) > 0 or len(indexes) > 0:
                prefix = 'ovs_data_{0}_'.format(classname)
                for key, data in client.prefix_entries(prefix):
                    # The unique key (suffixed with the SHA1 of the property value) points to the data key
                    for property_name in uniques:
                        ukey = '{0}{1}'.format(
                            unique_key.format(property_name),
                            hashlib.sha1(str(data[property_name])).hexdigest())
                        client.set(ukey, key)
                    # The index key holds the list of data keys sharing the same property value
                    for property_name in indexes:
                        if property_name not in data:
                            continue  # This is the case when there's a new indexed property added.
                        ikey = index_key.format(
                            property_name,
                            hashlib.sha1(str(data[property_name])).hexdigest())
                        index = list(client.get_multi([ikey], must_exist=False))[0]
                        # NOTE(review): assert_value presumably makes the transaction fail when the index changed
                        # since it was read - confirm against the persistent client implementation
                        transaction = client.begin_transaction()
                        if index is None:
                            client.assert_value(ikey, None, transaction=transaction)
                            client.set(ikey, [key], transaction=transaction)
                        elif key not in index:
                            client.assert_value(ikey, index[:], transaction=transaction)
                            client.set(ikey, index + [key], transaction=transaction)
                        client.apply_transaction(transaction)

        #############################################
        # Introduction of ABMCluster and NSMCluster #
        #############################################
        # Verify presence of unchanged ALBA Backends
        alba_backends = AlbaBackendList.get_albabackends()
        changes_required = False
        for alba_backend in alba_backends:
            if alba_backend.abm_cluster is None or len(
                    alba_backend.nsm_clusters) == 0:
                changes_required = True
                break

        if changes_required:
            # Retrieve ABM and NSM clusters
            abm_cluster_info = []
            nsm_cluster_info = []
            for cluster_name in Configuration.list('/ovs/arakoon'):
                try:
                    metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                        cluster_name=cluster_name)
                    if metadata[
                            'cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.ABM:
                        abm_cluster_info.append(metadata)
                    elif metadata[
                            'cluster_type'] == ServiceType.ARAKOON_CLUSTER_TYPES.NSM:
                        nsm_cluster_info.append(metadata)
                except NotFoundException:
                    continue

            # Retrieve NSM Arakoon cluster information
            # The exported config is stored per cluster name for both the ABM and the NSM clusters
            cluster_arakoon_map = {}
            for cluster_info in abm_cluster_info + nsm_cluster_info:
                cluster_name = cluster_info['cluster_name']
                arakoon_config = ArakoonClusterConfig(
                    cluster_id=cluster_name)
                cluster_arakoon_map[
                    cluster_name] = arakoon_config.export_dict()

            storagerouter_map = dict(
                (storagerouter.machine_id, storagerouter) for storagerouter in
                StorageRouterList.get_storagerouters())
            alba_backend_id_map = dict((alba_backend.alba_id, alba_backend)
                                       for alba_backend in alba_backends)
            for cluster_info in abm_cluster_info:
                internal = cluster_info['internal']
                cluster_name = cluster_info['cluster_name']
                config_location = Configuration.get_configuration_path(
                    key=ArakoonClusterConfig.CONFIG_KEY.format(
                        cluster_name))
                # Clusters for which the ALBA CLI calls fail are skipped
                try:
                    alba_id = AlbaCLI.run(command='get-alba-id',
                                          config=config_location,
                                          named_params={'attempts': 3})['id']
                    nsm_hosts = AlbaCLI.run(command='list-nsm-hosts',
                                            config=config_location,
                                            named_params={'attempts': 3})
                except RuntimeError:
                    continue

                alba_backend = alba_backend_id_map.get(alba_id)
                if alba_backend is None:  # ALBA Backend with ID not found in model
                    continue
                if alba_backend.abm_cluster is not None and len(
                        alba_backend.nsm_clusters
                ) > 0:  # Clusters already exist
                    continue

                # Create ABM Cluster
                if alba_backend.abm_cluster is None:
                    abm_cluster = ABMCluster()
                    abm_cluster.name = cluster_name
                    abm_cluster.alba_backend = alba_backend
                    abm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(
                        cluster_name)
                    abm_cluster.save()
                else:
                    abm_cluster = alba_backend.abm_cluster

                # Create ABM Services
                abm_arakoon_config = cluster_arakoon_map[cluster_name]
                # After dropping the 'global' section the remaining keys are treated as node identifiers
                abm_arakoon_config.pop('global')
                # NOTE(review): the result of keys() is indexed below, which requires a list (Python 2 semantics)
                arakoon_nodes = abm_arakoon_config.keys()
                if internal is False:
                    # A single service without ports or StorageRouter is modelled for a non-internal cluster
                    services_to_create = 1
                else:
                    # Skip the cluster when one of its nodes is not a known StorageRouter machine ID
                    if set(arakoon_nodes).difference(
                            set(storagerouter_map.keys())):
                        continue
                    services_to_create = len(arakoon_nodes)
                for index in range(services_to_create):
                    service = Service()
                    service.name = 'arakoon-{0}-abm'.format(
                        alba_backend.name)
                    service.type = ServiceTypeList.get_by_name(
                        ServiceType.SERVICE_TYPES.ALBA_MGR)
                    if internal is True:
                        arakoon_node_config = abm_arakoon_config[
                            arakoon_nodes[index]]
                        service.ports = [
                            arakoon_node_config['client_port'],
                            arakoon_node_config['messaging_port']
                        ]
                        service.storagerouter = storagerouter_map[
                            arakoon_nodes[index]]
                    else:
                        service.ports = []
                        service.storagerouter = None
                    service.save()

                    abm_service = ABMService()
                    abm_service.service = service
                    abm_service.abm_cluster = abm_cluster
                    abm_service.save()

                # Create NSM Clusters
                for cluster_index, nsm_host in enumerate(
                        sorted(nsm_hosts,
                               key=lambda host: ExtensionsToolbox.
                               advanced_sort(host['cluster_id'], '_'))):
                    nsm_cluster_name = nsm_host['cluster_id']
                    nsm_arakoon_config = cluster_arakoon_map.get(
                        nsm_cluster_name)
                    if nsm_arakoon_config is None:
                        continue

                    # Internal clusters take their number from the name suffix, others from the sorted position
                    number = cluster_index if internal is False else int(
                        nsm_cluster_name.split('_')[-1])
                    nsm_cluster = NSMCluster()
                    nsm_cluster.name = nsm_cluster_name
                    nsm_cluster.number = number
                    nsm_cluster.alba_backend = alba_backend
                    nsm_cluster.config_location = ArakoonClusterConfig.CONFIG_KEY.format(
                        nsm_cluster_name)
                    nsm_cluster.save()

                    # Create NSM Services
                    nsm_arakoon_config.pop('global')
                    arakoon_nodes = nsm_arakoon_config.keys()
                    if internal is False:
                        services_to_create = 1
                    else:
                        if set(arakoon_nodes).difference(
                                set(storagerouter_map.keys())):
                            continue
                        services_to_create = len(arakoon_nodes)
                    for service_index in range(services_to_create):
                        service = Service()
                        service.name = 'arakoon-{0}-nsm_{1}'.format(
                            alba_backend.name, number)
                        service.type = ServiceTypeList.get_by_name(
                            ServiceType.SERVICE_TYPES.NS_MGR)
                        if internal is True:
                            arakoon_node_config = nsm_arakoon_config[
                                arakoon_nodes[service_index]]
                            service.ports = [
                                arakoon_node_config['client_port'],
                                arakoon_node_config['messaging_port']
                            ]
                            service.storagerouter = storagerouter_map[
                                arakoon_nodes[service_index]]
                        else:
                            service.ports = []
                            service.storagerouter = None
                        service.save()

                        nsm_service = NSMService()
                        nsm_service.service = service
                        nsm_service.nsm_cluster = nsm_cluster
                        nsm_service.save()

        # Clean up all junction services no longer linked to an ALBA Backend
        all_nsm_services = [
            service.nsm_service for service in ServiceTypeList.get_by_name(
                ServiceType.SERVICE_TYPES.NS_MGR).services
            if service.nsm_service.nsm_cluster is None
        ]
        all_abm_services = [
            service.abm_service for service in ServiceTypeList.get_by_name(
                ServiceType.SERVICE_TYPES.ALBA_MGR).services
            if service.abm_service.abm_cluster is None
        ]
        # The junction is deleted before the service it refers to
        for abm_service in all_abm_services:
            abm_service.delete()
            abm_service.service.delete()
        for nsm_service in all_nsm_services:
            nsm_service.delete()
            nsm_service.service.delete()

        ################################
        # Introduction of Active Drive #
        ################################
        # Update slot_id and Alba Node relation for all OSDs
        client = PersistentFactory.get_client()
        # Maps the guid of an ALBA disk to the guids of the OSDs which referred to it
        disk_osd_map = {}
        for key, data in client.prefix_entries('ovs_data_albaosd_'):
            alba_disk_guid = data.get('alba_disk', {}).get('guid')
            if alba_disk_guid is not None:
                if alba_disk_guid not in disk_osd_map:
                    disk_osd_map[alba_disk_guid] = []
                disk_osd_map[alba_disk_guid].append(
                    key.replace('ovs_data_albaosd_', ''))
                try:
                    value = client.get(key)
                    value.pop('alba_disk', None)
                    client.set(key=key, value=value)
                except Exception:
                    pass  # We don't care if we would have any leftover AlbaDisk information in _data, but its cleaner not to

        alba_guid_node_map = dict(
            (an.guid, an) for an in AlbaNodeList.get_albanodes())
        for key, data in client.prefix_entries('ovs_data_albadisk_'):
            alba_disk_guid = key.replace('ovs_data_albadisk_', '')
            alba_node_guid = data.get('alba_node', {}).get('guid')
            if alba_disk_guid in disk_osd_map and alba_node_guid in alba_guid_node_map and len(
                    data.get('aliases', [])) > 0:
                # The slot ID is the last path component of the first alias of the disk
                slot_id = data['aliases'][0].split('/')[-1]
                for osd_guid in disk_osd_map[alba_disk_guid]:
                    try:
                        osd = AlbaOSD(osd_guid)
                    except ObjectNotFoundException:
                        continue
                    osd.slot_id = slot_id
                    osd.alba_node = alba_guid_node_map[alba_node_guid]
                    osd.save()
            client.delete(key=key, must_exist=False)

        # Remove unique constraints for AlbaNode IP
        for key in client.prefix('ovs_unique_albanode_ip_'):
            client.delete(key=key, must_exist=False)

        # Remove relation for all Alba Disks
        for key in client.prefix('ovs_reverseindex_albadisk_'):
            client.delete(key=key, must_exist=False)

        # Remove the relation between AlbaNode and AlbaDisk
        for key in client.prefix('ovs_reverseindex_albanode_'):
            if '|disks|' in key:
                client.delete(key=key, must_exist=False)

    return DALMigrator.THIS_VERSION
def add_arakoon(cluster_name, storagerouter_ip, cluster_basedir, service_type=ServiceType.ARAKOON_CLUSTER_TYPES.FWK):
    """
    Create, start and unclaim an externally managed Arakoon cluster on a StorageRouter

    :param cluster_name: name of the new arakoon cluster
    :type cluster_name: str
    :param storagerouter_ip: ip of a storagerouter
    :type storagerouter_ip: str
    :param cluster_basedir: absolute path for the new arakoon cluster
    :type cluster_basedir: str
    :param service_type: type of plugin for arakoon (DEFAULT=ServiceType.ARAKOON_CLUSTER_TYPES.FWK)
        * FWK
        * ABM
        * NSM
    :type service_type: ovs.dal.hybrids.ServiceType.ARAKOON_CLUSTER_TYPES
    :raises RuntimeError: when the service type is not one of FWK, ABM or NSM
    :return: None
    """
    cluster_types = ServiceType.ARAKOON_CLUSTER_TYPES
    client = SSHClient(storagerouter_ip, username='******')

    # The base directory has to exist before a cluster can be deployed into it
    if not client.dir_exists(cluster_basedir):
        client.dir_create(cluster_basedir)

    # Decide once which plugin (if any) the cluster needs and which shared library backs it
    if service_type == cluster_types.FWK:
        plugins = None
        plugin_library = None
    elif service_type == cluster_types.ABM:
        plugins = {AlbaController.ABM_PLUGIN: AlbaController.ALBA_VERSION_GET}
        plugin_library = '/usr/lib/alba/albamgr_plugin.cmxs'
    elif service_type == cluster_types.NSM:
        plugins = {AlbaController.NSM_PLUGIN: AlbaController.ALBA_VERSION_GET}
        plugin_library = '/usr/lib/alba/nsm_host_plugin.cmxs'
    else:
        raise RuntimeError("Incompatible Arakoon cluster type selected: {0}".format(service_type))

    log_details = (cluster_name, service_type, storagerouter_ip, cluster_basedir)
    ArakoonSetup.LOGGER.info(
        "Starting creation of new arakoon cluster with name `{0}`, servicetype `{1}`, ip `{2}`, base_dir `{3}`".format(*log_details))

    installer = ArakoonInstaller(cluster_name)
    installer.create_cluster(cluster_type=service_type,
                             ip=storagerouter_ip,
                             base_dir=cluster_basedir,
                             plugins=plugins,
                             locked=False,
                             internal=False,
                             log_sinks=Logger.get_sink_path('automation_lib_arakoon_server'),
                             crash_log_sinks=Logger.get_sink_path('automation_lib_arakoon_server_crash'))

    # ABM and NSM clusters get their plugin library symlinked into the cluster's db directory
    if plugin_library is not None:
        client.run(['ln', '-s', plugin_library, '{0}/arakoon/{1}/db'.format(cluster_basedir, cluster_name)])

    installer.start_cluster()
    installer.unclaim_cluster()
    ArakoonSetup.LOGGER.info(
        "Finished creation of new arakoon cluster with name `{0}`, servicetype `{1}`, ip `{2}`, base_dir `{3}`".format(*log_details))
def test_alba_arakoon_checkup(self):
    """
    Validates whether the ALBA Arakoon checkup works (Manual and Scheduled)

    Scenario covered:
        * Scheduled checkup on 1 StorageRouter: ABM and NSM keep 1 service each
        * Scheduled checkup after adding 2 StorageRouters: the ABM grows to 3 services, the NSM stays at 1
        * Scheduled checkup with 1 unreachable StorageRouter: a message is expected in the 'lib' logs
        * Manual checkup without clusters specified: nothing changes
        * Manual checkup error paths: incomplete arguments, already claimed cluster, non-existing cluster
        * Manual checkup claiming externally managed clusters: only the requested clusters get claimed
    """
    ovs_structure = DalHelper.build_dal_structure(
        structure={'storagerouters': [1]})
    alba_structure = AlbaDalHelper.build_dal_structure(
        structure={'alba_backends': [[1, 'LOCAL']]})

    #############################
    # SCHEDULED_ARAKOON_CHECKUP #
    #############################
    # Create an ABM and NSM cluster for ALBA Backend 1 and do some basic validations
    sr_1 = ovs_structure['storagerouters'][1]
    ab_1 = alba_structure['alba_backends'][1]
    # Register the return values of the 'ln -s' commands for which the mocked SSH client will be asked
    MockedSSHClient._run_returns[sr_1.ip] = {}
    MockedSSHClient._run_returns[sr_1.ip][
        'ln -s /usr/lib/alba/albamgr_plugin.cmxs /tmp/unittest/sr_1/disk_1/partition_1/arakoon/backend_1-abm/db'] = None
    MockedSSHClient._run_returns[sr_1.ip][
        'ln -s /usr/lib/alba/nsm_host_plugin.cmxs /tmp/unittest/sr_1/disk_1/partition_1/arakoon/backend_1-nsm_0/db'] = None
    AlbaController.add_cluster(ab_1.guid)

    abm_cluster_name = '{0}-abm'.format(ab_1.name)
    nsm_cluster_name = '{0}-nsm_0'.format(ab_1.name)
    # NOTE(review): 'arakoon_clusters' is listed only once, right here. The later assertListEqual calls
    # compare against this same snapshot and thus cannot detect clusters added or removed by a checkup - confirm whether re-listing was intended
    arakoon_clusters = sorted(Configuration.list('/ovs/arakoon'))
    self.assertListEqual(list1=[abm_cluster_name, nsm_cluster_name],
                         list2=arakoon_clusters)

    abm_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
        cluster_name=abm_cluster_name)
    nsm_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
        cluster_name=nsm_cluster_name)
    self.assertTrue(expr=abm_metadata['in_use'])
    self.assertTrue(expr=nsm_metadata['in_use'])

    # Run scheduled Arakoon checkup and validate amount of Arakoon clusters did not change
    AlbaArakoonController.scheduled_alba_arakoon_checkup()
    self.assertListEqual(list1=[abm_cluster_name, nsm_cluster_name],
                         list2=arakoon_clusters)
    self.assertEqual(first=len(ab_1.abm_cluster.abm_services), second=1)
    self.assertEqual(first=len(ab_1.nsm_clusters), second=1)
    self.assertEqual(first=len(ab_1.nsm_clusters[0].nsm_services), second=1)

    # Create 2 additional StorageRouters
    srs = DalHelper.build_dal_structure(
        structure={'storagerouters': [2, 3]},
        previous_structure=ovs_structure)['storagerouters']
    sr_2 = srs[2]
    sr_3 = srs[3]

    # Run scheduled checkup again and do some validations
    # Both new StorageRouters need mocked results for linking the ABM plugin and for the Arakoon catchup command
    MockedSSHClient._run_returns[sr_2.ip] = {}
    MockedSSHClient._run_returns[sr_3.ip] = {}
    MockedSSHClient._run_returns[sr_2.ip][
        'ln -s /usr/lib/alba/albamgr_plugin.cmxs /tmp/unittest/sr_2/disk_1/partition_1/arakoon/backend_1-abm/db'] = None
    MockedSSHClient._run_returns[sr_3.ip][
        'ln -s /usr/lib/alba/albamgr_plugin.cmxs /tmp/unittest/sr_3/disk_1/partition_1/arakoon/backend_1-abm/db'] = None
    MockedSSHClient._run_returns[sr_2.ip][
        'arakoon --node {0} -config file://opt/OpenvStorage/config/framework.json?key=/ovs/arakoon/backend_1-abm/config -catchup-only'
        .format(sr_2.machine_id)] = None
    MockedSSHClient._run_returns[sr_3.ip][
        'arakoon --node {0} -config file://opt/OpenvStorage/config/framework.json?key=/ovs/arakoon/backend_1-abm/config -catchup-only'
        .format(sr_3.machine_id)] = None
    AlbaArakoonController.scheduled_alba_arakoon_checkup()
    self.assertListEqual(list1=[abm_cluster_name, nsm_cluster_name],
                         list2=arakoon_clusters)
    self.assertEqual(first=len(ab_1.abm_cluster.abm_services),
                     second=3)  # Gone up from 1 to 3
    self.assertEqual(first=len(ab_1.nsm_clusters), second=1)
    self.assertEqual(first=len(ab_1.nsm_clusters[0].nsm_services),
                     second=1)  # Still 1 since NSM checkup hasn't ran yet

    # Make sure 1 StorageRouter is unreachable
    SSHClient._raise_exceptions[sr_3.ip] = {
        'users': ['ovs'],
        'exception': UnableToConnectException('No route to host')
    }
    AlbaArakoonController.scheduled_alba_arakoon_checkup()
    alba_logs = Logger._logs.get('lib', [])
    self.assertIn(
        member='Storage Router with IP {0} is not reachable'.format(
            sr_3.ip),
        container=alba_logs)

    ##########################
    # MANUAL_ARAKOON_CHECKUP #
    ##########################
    AlbaDalHelper.setup()  # Clear everything
    ovs_structure = DalHelper.build_dal_structure(
        structure={'storagerouters': [1]})
    alba_structure = AlbaDalHelper.build_dal_structure(
        structure={'alba_backends': [[1, 'LOCAL']]})
    sr_1 = ovs_structure['storagerouters'][1]
    ab_1 = alba_structure['alba_backends'][1]
    MockedSSHClient._run_returns[sr_1.ip] = {}
    MockedSSHClient._run_returns[sr_1.ip][
        'ln -s /usr/lib/alba/albamgr_plugin.cmxs /tmp/unittest/sr_1/disk_1/partition_1/arakoon/backend_1-abm/db'] = None
    MockedSSHClient._run_returns[sr_1.ip][
        'ln -s /usr/lib/alba/nsm_host_plugin.cmxs /tmp/unittest/sr_1/disk_1/partition_1/arakoon/backend_1-nsm_0/db'] = None
    AlbaController.add_cluster(ab_1.guid)

    # Run manual Arakoon checkup and validate amount of Arakoon clusters did not change
    AlbaArakoonController.manual_alba_arakoon_checkup(
        alba_backend_guid=ab_1.guid, nsm_clusters=[], abm_cluster=None)
    self.assertListEqual(list1=[abm_cluster_name, nsm_cluster_name],
                         list2=arakoon_clusters)
    self.assertEqual(first=len(ab_1.abm_cluster.abm_services), second=1)
    self.assertEqual(first=len(ab_1.nsm_clusters), second=1)
    self.assertEqual(first=len(ab_1.nsm_clusters[0].nsm_services), second=1)

    # Test some error paths
    # NSM clusters passed without an ABM cluster
    with self.assertRaises(ValueError) as raise_info:
        AlbaArakoonController.manual_alba_arakoon_checkup(
            alba_backend_guid=ab_1.guid,
            nsm_clusters=['no_abm_cluster_passed'])
    self.assertEqual(
        first=raise_info.exception.message,
        second='Both ABM cluster and NSM clusters must be provided')
    # ABM cluster passed without NSM clusters
    with self.assertRaises(ValueError) as raise_info:
        AlbaArakoonController.manual_alba_arakoon_checkup(
            alba_backend_guid=ab_1.guid,
            nsm_clusters=[],
            abm_cluster='no_nsm_clusters_passed')
    self.assertEqual(
        first=raise_info.exception.message,
        second='Both ABM cluster and NSM clusters must be provided')
    # Clusters which are in use by the ALBA Backend already
    with self.assertRaises(ValueError) as raise_info:
        AlbaArakoonController.manual_alba_arakoon_checkup(
            alba_backend_guid=ab_1.guid,
            nsm_clusters=[nsm_cluster_name],
            abm_cluster=abm_cluster_name)
    self.assertEqual(first=raise_info.exception.message,
                     second='Cluster {0} has already been claimed'.format(
                         abm_cluster_name))
    # Clusters which do not exist at all
    with self.assertRaises(ValueError) as raise_info:
        AlbaArakoonController.manual_alba_arakoon_checkup(
            alba_backend_guid=ab_1.guid,
            nsm_clusters=['non-existing-nsm-cluster'],
            abm_cluster='non-existing-abm-cluster')
    self.assertEqual(
        first=raise_info.exception.message,
        second=
        'Could not find an Arakoon cluster with name: non-existing-abm-cluster'
    )

    # Recreate ALBA Backend with Arakoon clusters
    AlbaDalHelper.setup()  # Clear everything
    ovs_structure = DalHelper.build_dal_structure(
        structure={'storagerouters': [1]})
    alba_structure = AlbaDalHelper.build_dal_structure(
        structure={'alba_backends': [[1, 'LOCAL']]})
    sr_1 = ovs_structure['storagerouters'][1]
    ab_1 = alba_structure['alba_backends'][1]

    # Create some Arakoon clusters to be claimed by the manual checkup
    # Every cluster is created with internal=False, started and unclaimed again
    for cluster_name, cluster_type in {
            'manual-abm-1': ServiceType.ARAKOON_CLUSTER_TYPES.ABM,
            'manual-abm-2': ServiceType.ARAKOON_CLUSTER_TYPES.ABM,
            'manual-nsm-1': ServiceType.ARAKOON_CLUSTER_TYPES.NSM,
            'manual-nsm-2': ServiceType.ARAKOON_CLUSTER_TYPES.NSM,
            'manual-nsm-3': ServiceType.ARAKOON_CLUSTER_TYPES.NSM
    }.iteritems():
        arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
        arakoon_installer.create_cluster(
            cluster_type=cluster_type,
            ip=sr_1.ip,
            base_dir=DalHelper.CLUSTER_DIR.format(cluster_name),
            internal=False)
        arakoon_installer.start_cluster()
        arakoon_installer.unclaim_cluster()
    AlbaArakoonController.manual_alba_arakoon_checkup(
        alba_backend_guid=ab_1.guid,
        nsm_clusters=['manual-nsm-1', 'manual-nsm-3'],
        abm_cluster='manual-abm-2')

    # Validate the correct clusters have been claimed by the manual checkup
    # Only the clusters which were not passed to the checkup should be reported as unused
    unused_abms = ArakoonInstaller.get_unused_arakoon_clusters(
        cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.ABM)
    unused_nsms = ArakoonInstaller.get_unused_arakoon_clusters(
        cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM)
    self.assertEqual(first=len(unused_abms), second=1)
    self.assertEqual(first=len(unused_nsms), second=1)
    self.assertEqual(first=unused_abms[0]['cluster_name'],
                     second='manual-abm-1')
    self.assertEqual(first=unused_nsms[0]['cluster_name'],
                     second='manual-nsm-2')
def nsm_checkup(alba_backend_guid=None, min_internal_nsms=1, external_nsm_cluster_names=None):
    # type: (Optional[str], Optional[int], Optional[List[str]]) -> None
    """
    Verify the NSM setup of 1 or all ALBA Backends and take action where required.
    The arguments and the state of every ALBA Backend are validated up front (raising on the first problem),
    afterwards every ALBA Backend is processed individually and a failure for 1 ALBA Backend is only logged
    Assumptions:
        * A 2 node NSM is considered safer than a 1 node NSM.
        * When adding an NSM, the nodes with the least amount of NSM participation are preferred

    :param alba_backend_guid: Run for a specific ALBA Backend (all ALBA Backends with status RUNNING when omitted)
    :type alba_backend_guid: str
    :param min_internal_nsms: Minimum amount of NSM hosts that need to be provided
    :type min_internal_nsms: int
    :param external_nsm_cluster_names: Information about the additional clusters to claim (only for externally managed Arakoon clusters)
    :type external_nsm_cluster_names: list
    :raises ValueError: when the arguments are invalid or an ALBA Backend is not in a state to be checked
    :raises RuntimeError: when a required StorageRouter is not reachable
    :return: None
    :rtype: NoneType
    """
    logger = AlbaArakoonController._logger

    ###############
    # Validations #
    ###############
    external_names = [] if external_nsm_cluster_names is None else external_nsm_cluster_names
    logger.info('NSM checkup started')
    if min_internal_nsms < 1:
        raise ValueError('Minimum amount of NSM clusters must be 1 or more')
    if not isinstance(external_names, list):
        raise ValueError("'external_nsm_cluster_names' must be of type 'list'")

    if external_names:
        if alba_backend_guid is None:
            raise ValueError('Additional NSMs can only be configured for a specific ALBA Backend')
        if min_internal_nsms > 1:
            raise ValueError("'min_internal_nsms' and 'external_nsm_cluster_names' are mutually exclusive")

        external_names = list(set(external_names))  # Remove duplicate cluster names
        for name in external_names:
            try:
                ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=name)
            except NotFoundException:
                raise ValueError('Arakoon cluster with name {0} does not exist'.format(name))

    if alba_backend_guid is not None:
        alba_backends = [AlbaBackend(alba_backend_guid)]
    else:
        alba_backends = [backend for backend in AlbaBackendList.get_albabackends()
                         if backend.backend.status == 'RUNNING']

    # Validate every ALBA Backend and gather the StorageRouters which have to be reachable
    masters = StorageRouterList.get_masters()
    required_storagerouters = set()
    for alba_backend in alba_backends:
        if alba_backend.abm_cluster is None:
            raise ValueError('No ABM cluster found for ALBA Backend {0}'.format(alba_backend.name))
        if len(alba_backend.abm_cluster.abm_services) == 0:
            raise ValueError('ALBA Backend {0} does not have any registered ABM services'.format(alba_backend.name))
        if len(alba_backend.nsm_clusters) + len(external_names) > MAX_NSM_AMOUNT:
            raise ValueError(
                'The maximum of {0} NSM Arakoon clusters will be exceeded. Amount of clusters that can be deployed for this ALBA Backend: {1}'
                .format(MAX_NSM_AMOUNT, MAX_NSM_AMOUNT - len(alba_backend.nsm_clusters)))

        if alba_backend.abm_cluster.abm_services[0].service.is_internal is False:
            # Externally managed: the requested clusters must still be unclaimed and a master node must be available
            unclaimed_names = {info['cluster_name'] for info in
                               ArakoonInstaller.get_unused_arakoon_clusters(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM)}
            if set(external_names) - unclaimed_names:
                raise ValueError('Some of the provided cluster_names have already been claimed before')
            required_storagerouters.update(masters)
        else:
            # Internally managed: all StorageRouters hosting an ABM or NSM service must be online
            required_storagerouters.update(abm_service.service.storagerouter
                                           for abm_service in alba_backend.abm_cluster.abm_services)
            for nsm_cluster in alba_backend.nsm_clusters:
                required_storagerouters.update(nsm_service.service.storagerouter
                                               for nsm_service in nsm_cluster.nsm_services)

    ssh_clients = {}
    for storagerouter in required_storagerouters:
        try:
            ssh_clients[storagerouter] = SSHClient(endpoint=storagerouter)
        except UnableToConnectException:
            raise RuntimeError('StorageRouter {0} with IP {1} is not reachable'.format(storagerouter.name, storagerouter.ip))

    version_str = AlbaArakoonInstaller.get_alba_version_string()
    nsm_installer = NSMInstaller(version_str=version_str, ssh_clients=ssh_clients)

    ##################
    # Check Clusters #
    ##################
    safety = Configuration.get('/ovs/framework/plugins/alba/config|nsm.safety')
    maxload = Configuration.get('/ovs/framework/plugins/alba/config|nsm.maxload')
    logger.debug('NSM safety is configured at: {0}'.format(safety))
    logger.debug('NSM max load is configured at: {0}'.format(maxload))

    master_client = None
    # NOTE(review): the names gathered in 'failed_backends' are not used anywhere within this function
    failed_backends = []
    for alba_backend in alba_backends:
        try:
            # Gather information
            logger.info('ALBA Backend {0} - Ensuring NSM safety'.format(alba_backend.name))
            internal = AlbaArakoonInstaller.is_internally_managed(alba_backend)
            nsm_loads = AlbaArakoonController.get_nsm_loads(alba_backend)
            nsms_per_storagerouter = AlbaArakoonController.get_nsms_per_storagerouter(alba_backend)
            nsm_clusters_by_number = sorted(alba_backend.nsm_clusters, key=lambda cluster: cluster.number)

            if not internal and external_names:
                # Clusters to claim were specified for an externally managed setup, this requires an online MASTER node
                master_client = next((ssh_client for sr, ssh_client in ssh_clients.iteritems()
                                      if sr.node_type == 'MASTER'),
                                     master_client)
                if master_client is None:
                    raise ValueError('Could not find an online master node')

            logger.debug('ALBA Backend {0} - Arakoon clusters are {1} managed'.format(
                alba_backend.name, 'internally' if internal is True else 'externally'))
            for nsm_number, nsm_load in nsm_loads.iteritems():
                logger.debug('ALBA Backend {0} - NSM Cluster {1} - Load {2}'.format(alba_backend.name, nsm_number, nsm_load))
            for sr, nsm_count in nsms_per_storagerouter.iteritems():
                logger.debug('ALBA Backend {0} - StorageRouter {1} - NSM Services {2}'.format(alba_backend.name, sr.name, nsm_count))

            if internal:
                # Extend existing NSM clusters if safety not met
                for nsm_cluster in nsm_clusters_by_number:
                    logger.debug(
                        'ALBA Backend {0} - Processing NSM {1} - Expected safety {2} - Current safety {3}'
                        .format(alba_backend.name, nsm_cluster.number, safety, len(nsm_cluster.nsm_services)))
                    AlbaArakoonController.ensure_nsm_cluster_safety(nsm_cluster,
                                                                    nsms_per_storagerouter,
                                                                    nsm_installer=nsm_installer)

            AlbaArakoonController.ensure_nsm_clusters_load(alba_backend,
                                                           nsms_per_storagerouter=nsms_per_storagerouter,
                                                           ssh_clients=ssh_clients,
                                                           version_str=version_str,
                                                           min_internal_nsms=min_internal_nsms,
                                                           external_nsm_cluster_names=external_names)
        except Exception:
            logger.exception('NSM Checkup failed for Backend {0}'.format(alba_backend.name))
            failed_backends.append(alba_backend.name)
def ensure_nsm_clusters_load(cls, alba_backend, nsms_per_storagerouter=None, min_internal_nsms=1,
                             external_nsm_cluster_names=None, version_str=None, ssh_clients=None):
    # type: (AlbaBackend, Optional[Dict[StorageRouter, int]], Optional[int], Optional[List[str]], Optional[str], Optional[Dict[StorageRouter, SSHClient]]) -> None
    """
    Ensure that all NSM clusters are not overloaded
    The NSM setup is considered overloaded when the lowest NSM load reaches the configured 'nsm.maxload'
        * Internally managed: new NSM clusters are deployed on the StorageRouters hosting the least NSM services
        * Externally managed: the clusters specified in 'external_nsm_cluster_names' are claimed
    After every deployed or claimed cluster, the NSM is registered with the ABM cluster of the ALBA Backend
    Remark: 'nsms_per_storagerouter' is updated in place for every NSM service that gets deployed

    :param alba_backend: Alba Backend to ensure NSM Cluster load for
    :type alba_backend: AlbaBackend
    :param nsms_per_storagerouter: Amount of NSMs mapped by StorageRouter
    :type nsms_per_storagerouter: Dict[StorageRouter, int]
    :param min_internal_nsms: Minimum amount of NSM hosts that need to be provided
    :type min_internal_nsms: int
    :param external_nsm_cluster_names: Information about the additional clusters to claim (only for externally managed Arakoon clusters)
    :type external_nsm_cluster_names: list
    :param version_str: Alba version string
    :type version_str: str
    :param ssh_clients: SSHClients to use
    :type ssh_clients: Dict[Storagerouter, SSHClient]
    :raises ValueError: when clusters need to be claimed and no online master node could be found
    :return: None
    :rtype: NoneType
    """
    if ssh_clients is None:
        ssh_clients = {}
    if external_nsm_cluster_names is None:
        external_nsm_cluster_names = []
    if nsms_per_storagerouter is None:
        nsms_per_storagerouter = cls.get_nsms_per_storagerouter(alba_backend)
    version_str = version_str or AlbaArakoonInstaller.get_alba_version_string()

    nsm_loads = cls.get_nsm_loads(alba_backend)
    internal = AlbaArakoonInstaller.is_internally_managed(alba_backend)
    abm_cluster_name = alba_backend.abm_cluster.name

    safety = Configuration.get('/ovs/framework/plugins/alba/config|nsm.safety')
    maxload = Configuration.get('/ovs/framework/plugins/alba/config|nsm.maxload')

    overloaded = min(nsm_loads.values()) >= maxload
    if not overloaded:
        # At least 1 NSM is not overloaded yet
        cls._logger.debug('ALBA Backend {0} - NSM load OK'.format(alba_backend.name))
        if internal:
            # Load is OK, so only deploy what is missing to reach the requested minimum amount of NSMs
            nsms_to_add = max(0, min_internal_nsms - len(nsm_loads))
        else:
            nsms_to_add = len(external_nsm_cluster_names)
        if nsms_to_add == 0:
            return
    else:
        cls._logger.warning('ALBA Backend {0} - NSM load is NOT OK'.format(alba_backend.name))
        if internal:
            # When load is not OK, deploy at least 1 additional NSM
            nsms_to_add = max(1, min_internal_nsms - len(nsm_loads))
        else:
            # For externally managed clusters we only claim the specified clusters, if none provided, we just log it
            nsms_to_add = len(external_nsm_cluster_names)
            if nsms_to_add == 0:
                cls._logger.critical('ALBA Backend {0} - All NSM clusters are overloaded'.format(alba_backend.name))
                return

    # Deploy new (internal) or claim existing (external) NSM clusters
    cls._logger.debug('ALBA Backend {0} - Currently {1} NSM cluster{2}'.format(
        alba_backend.name, len(nsm_loads), '' if len(nsm_loads) == 1 else 's'))
    cls._logger.debug('ALBA Backend {0} - Trying to add {1} NSM cluster{2}'.format(
        alba_backend.name, nsms_to_add, '' if nsms_to_add == 1 else 's'))

    master_client = None
    if not internal:
        # Registering a claimed cluster requires a client towards a master node.
        # The lookup does not depend on the cluster being claimed, so it is done once, up front
        if not ssh_clients:
            for storagerouter in StorageRouterList.get_masters():
                try:
                    master_client = SSHClient(storagerouter)
                    break
                except UnableToConnectException:
                    cls._logger.warning('StorageRouter {0} with IP {1} is not reachable'.format(
                        storagerouter.name, storagerouter.ip))
        else:
            for storagerouter, ssh_client in ssh_clients.iteritems():
                if storagerouter.node_type == 'MASTER':
                    master_client = ssh_client
                    break
        if not master_client:
            raise ValueError('Could not find an online master node')

    base_number = max(nsm_loads.keys()) + 1
    for index, number in enumerate(xrange(base_number, base_number + nsms_to_add)):
        if not internal:
            # External clusters
            # 'index' stays within bounds: for external clusters 'nsms_to_add' equals len(external_nsm_cluster_names)
            nsm_cluster_name = external_nsm_cluster_names[index]
            cls._logger.debug('ALBA Backend {0} - Claiming NSM cluster {1}'.format(alba_backend.name, nsm_cluster_name))
            metadata = ArakoonInstaller.get_unused_arakoon_metadata_and_claim(
                cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM,
                cluster_name=nsm_cluster_name)
            if metadata is None:
                cls._logger.critical('ALBA Backend {0} - NSM cluster with name {1} could not be found'.format(
                    alba_backend.name, nsm_cluster_name))
                continue

            cls._logger.debug('ALBA Backend {0} - Modeling services'.format(alba_backend.name))
            AlbaArakoonInstaller.model_arakoon_service(alba_backend=alba_backend,
                                                       cluster_name=nsm_cluster_name,
                                                       number=number)
            cls._logger.debug('ALBA Backend {0} - Registering NSM'.format(alba_backend.name))
            NSMInstaller.register_nsm(abm_name=abm_cluster_name,
                                      nsm_name=nsm_cluster_name,
                                      ip=master_client.ip)
            cls._logger.debug('ALBA Backend {0} - Extended cluster'.format(alba_backend.name))
        else:
            # Internal clusters
            nsm_cluster_name = '{0}-nsm_{1}'.format(alba_backend.name, number)
            cls._logger.debug('ALBA Backend {0} - Adding NSM cluster {1}'.format(alba_backend.name, nsm_cluster_name))

            # One of the NSM nodes is overloaded. This means the complete NSM is considered overloaded
            # Figure out which StorageRouters are the least occupied
            # Sorting the StorageRouters on their NSM count guarantees the least occupied ones are picked,
            # regardless of the iteration order of the dict
            storagerouters = sorted(nsms_per_storagerouter,
                                    key=lambda sr: nsms_per_storagerouter[sr])[:safety]

            # Creating a new NSM cluster
            for sub_index, storagerouter in enumerate(storagerouters):
                nsms_per_storagerouter[storagerouter] += 1
                partition = AlbaArakoonInstaller.get_db_partition(storagerouter)
                arakoon_installer = ArakoonInstaller(cluster_name=nsm_cluster_name)
                # @todo Use deploy and extend code. (Disable register nsm in those parts)
                if sub_index == 0:
                    arakoon_installer.create_cluster(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM,
                                                     ip=storagerouter.ip,
                                                     base_dir=partition.folder,
                                                     plugins={NSM_PLUGIN: version_str})
                else:
                    cls._logger.debug('ALBA Backend {0} - Extending NSM cluster {1}'.format(
                        alba_backend.name, nsm_cluster_name))
                    arakoon_installer.load()
                    arakoon_installer.extend_cluster(new_ip=storagerouter.ip,
                                                     base_dir=partition.folder,
                                                     plugins={NSM_PLUGIN: version_str})

                cls._logger.debug('ALBA Backend {0} - Linking plugins'.format(alba_backend.name))
                # Fall back to a new client towards this StorageRouter when none was handed in
                ssh_client = ssh_clients.get(storagerouter) or SSHClient(storagerouter)
                AlbaArakoonInstaller.link_plugins(client=ssh_client,
                                                  data_dir=partition.folder,
                                                  plugins=[NSM_PLUGIN],
                                                  cluster_name=nsm_cluster_name)

                cls._logger.debug('ALBA Backend {0} - Modeling services'.format(alba_backend.name))
                AlbaArakoonInstaller.model_arakoon_service(alba_backend=alba_backend,
                                                           cluster_name=nsm_cluster_name,
                                                           ports=arakoon_installer.ports[storagerouter.ip],
                                                           storagerouter=storagerouter,
                                                           number=number)
                if sub_index == 0:
                    cls._logger.debug('ALBA Backend {0} - Starting cluster'.format(alba_backend.name))
                    arakoon_installer.start_cluster()
                else:
                    cls._logger.debug('ALBA Backend {0} - Restarting cluster'.format(alba_backend.name))
                    arakoon_installer.restart_cluster_after_extending(new_ip=storagerouter.ip)

            cls._logger.debug('ALBA Backend {0} - Registering NSM'.format(alba_backend.name))
            NSMInstaller.register_nsm(abm_name=abm_cluster_name,
                                      nsm_name=nsm_cluster_name,
                                      ip=storagerouters[0].ip)
            cls._logger.debug('ALBA Backend {0} - Added NSM cluster {1}'.format(alba_backend.name, nsm_cluster_name))
def _voldrv_arakoon_checkup(create_cluster):
    """
    Create and/or extend the Arakoon cluster registered under 'voldrv' in '/ovs/framework/arakoon_clusters'
        * When 'create_cluster' is True and no services are modelled yet, an unused cluster of type SD is claimed.
          When none could be claimed, a new cluster named 'voldrv' is deployed on a master StorageRouter with a DB role
        * An internally managed cluster is extended to every master StorageRouter with a DB role not yet part of it

    :param create_cluster: Claim or deploy the cluster when no services are modelled for it yet
    :type create_cluster: bool
    :raises RuntimeError: when a cluster must be deployed and no master StorageRouter has a DB role
    :return: None
    :rtype: NoneType
    """
    def _add_service(service_storagerouter, arakoon_ports, service_name):
        """
        Model a service of the Arakoon service type
        :param service_storagerouter: StorageRouter hosting the service (None for externally managed clusters)
        :param arakoon_ports: Ports used by the service
        :param service_name: Name of the service
        :return: The newly modelled service
        """
        new_service = Service()
        new_service.name = service_name
        new_service.type = service_type  # Resolved from the enclosing scope at call time
        new_service.ports = arakoon_ports
        new_service.storagerouter = service_storagerouter
        new_service.save()
        return new_service

    # Gather the services (and IPs for internally managed services) currently modelled for the cluster
    current_ips = []
    current_services = []
    service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
    cluster_name = Configuration.get('/ovs/framework/arakoon_clusters').get('voldrv')
    if cluster_name is not None:
        arakoon_service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
        for service in service_type.services:
            if service.name == arakoon_service_name:
                current_services.append(service)
                if service.is_internal is True:
                    current_ips.append(service.storagerouter.ip)

    # Only master StorageRouters with a DB role can host the cluster
    available_storagerouters = {}
    for storagerouter in StorageRouterList.get_masters():
        storagerouter.invalidate_dynamics(['partition_config'])
        if len(storagerouter.partition_config[DiskPartition.ROLES.DB]) > 0:
            available_storagerouters[storagerouter] = DiskPartition(
                storagerouter.partition_config[DiskPartition.ROLES.DB][0])

    if create_cluster is True and len(current_services) == 0:  # Create new cluster
        metadata = ArakoonInstaller.get_unused_arakoon_metadata_and_claim(
            cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD)
        if metadata is None:  # No externally managed cluster found, we create 1 ourselves
            if not available_storagerouters:
                raise RuntimeError('Could not find any Storage Router with a DB role')

            storagerouter, partition = available_storagerouters.items()[0]
            arakoon_voldrv_cluster = 'voldrv'
            arakoon_installer = ArakoonInstaller(cluster_name=arakoon_voldrv_cluster)
            arakoon_installer.create_cluster(
                cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                ip=storagerouter.ip,
                base_dir=partition.folder,
                log_sinks=LogHandler.get_sink_path('arakoon-server_{0}'.format(arakoon_voldrv_cluster)),
                crash_log_sinks=LogHandler.get_sink_path('arakoon-server-crash_{0}'.format(arakoon_voldrv_cluster)))
            arakoon_installer.start_cluster()
            ports = arakoon_installer.ports[storagerouter.ip]
            metadata = arakoon_installer.metadata
            current_ips.append(storagerouter.ip)
        else:
            # Externally managed cluster: the service is modelled without ports and StorageRouter
            ports = []
            storagerouter = None

        cluster_name = metadata['cluster_name']
        Configuration.set('/ovs/framework/arakoon_clusters|voldrv', cluster_name)
        StorageDriverController._logger.info('Claiming {0} managed arakoon cluster: {1}'.format(
            'externally' if storagerouter is None else 'internally', cluster_name))
        StorageDriverController._configure_arakoon_to_volumedriver(cluster_name=cluster_name)
        current_services.append(
            _add_service(service_storagerouter=storagerouter,
                         arakoon_ports=ports,
                         service_name=ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)))

    cluster_name = Configuration.get('/ovs/framework/arakoon_clusters').get('voldrv')
    if cluster_name is None:
        return

    # Extend an internally managed cluster to the available StorageRouters not yet part of it
    metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
    if 0 < len(current_services) < len(available_storagerouters) and metadata['internal'] is True:
        for storagerouter, partition in available_storagerouters.iteritems():
            if storagerouter.ip in current_ips:
                continue

            arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
            arakoon_installer.load()
            arakoon_installer.extend_cluster(
                new_ip=storagerouter.ip,
                base_dir=partition.folder,
                log_sinks=LogHandler.get_sink_path('arakoon-server_{0}'.format(cluster_name)),
                crash_log_sinks=LogHandler.get_sink_path('arakoon-server-crash_{0}'.format(cluster_name)))
            _add_service(service_storagerouter=storagerouter,
                         arakoon_ports=arakoon_installer.ports[storagerouter.ip],
                         service_name=ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name))
            current_ips.append(storagerouter.ip)
            arakoon_installer.restart_cluster_after_extending(new_ip=storagerouter.ip)
        StorageDriverController._configure_arakoon_to_volumedriver(cluster_name=cluster_name)
def services_running(self):
    # type: () -> bool
    """
    Check if all services relevant for the target of this watcher are running
    Which tests are executed depends on self.target:
        * CONFIG and FWK: The configuration store is listed and the local config file (CACC_LOCATION) is
          refreshed with the contents stored under ArakoonInstaller.INTERNAL_CONFIG_KEY when these differ
        * FWK: self._test_store is invoked for the 'volatile' and the 'persistent' store
        * VOLDRV: self._test_store is invoked for the 'arakoon_voldrv' store
        * FWK and VOLDRV: A test message is published to every configured RabbitMQ endpoint, at least 1 must succeed
    Exceptions deriving from Exception are not propagated, they are logged and result in False
    :return: True when every executed test succeeded, False otherwise
    :rtype: bool
    """
    try:
        key = 'ovs-watcher-{0}'.format(str(uuid.uuid4()))
        value = str(time.time())

        if self.target in [WatcherTypes.CONFIG, WatcherTypes.FWK]:
            self.log_message('Testing configuration store...')
            try:
                Configuration.list('/')
            except Exception as ex:
                self.log_message(' Error during configuration store test: {0}'.format(ex), 2)
                return False

            # Build a client based on the locally cached config and fetch the config as stored in the cluster
            with open(CACC_LOCATION) as config_file:
                contents = config_file.read()
            config = ArakoonClusterConfig(cluster_id=ARAKOON_NAME, load_config=False)
            config.read_config(contents=contents)
            client = ArakoonInstaller.build_client(config)
            contents = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY, consistency=NoGuarantee())
            if Watcher.LOG_CONTENTS != contents:
                try:
                    config.read_config(contents=contents)  # Validate whether the contents are not corrupt
                except Exception as ex:
                    self.log_message(' Configuration stored in configuration store seems to be corrupt: {0}'.format(ex), 2)
                    return False
                # Write to a temporary file first and rename afterwards, so the config file is replaced in 1 step
                temp_filename = '{0}~'.format(CACC_LOCATION)
                with open(temp_filename, 'w') as config_file:
                    config_file.write(contents)
                    config_file.flush()
                    os.fsync(config_file)
                os.rename(temp_filename, CACC_LOCATION)
                Watcher.LOG_CONTENTS = contents
            self.log_message(' Configuration store OK', 0)

        if self.target == WatcherTypes.FWK:
            self._test_store('volatile', key, value)
            self._test_store('persistent')

        if self.target == WatcherTypes.VOLDRV:
            # Arakoon, voldrv cluster
            self._test_store('arakoon_voldrv')

        if self.target in [WatcherTypes.FWK, WatcherTypes.VOLDRV]:
            # RabbitMQ
            self.log_message('Test rabbitMQ...', 0)
            messagequeue = Configuration.get('/ovs/framework/messagequeue')
            rmq_servers = messagequeue['endpoints']
            good_node = False
            for server in rmq_servers:
                connection = None
                try:
                    connection_string = '{0}://{1}:{2}@{3}/%2F'.format(messagequeue['protocol'],
                                                                       messagequeue['user'],
                                                                       messagequeue['password'],
                                                                       server)
                    connection = pika.BlockingConnection(pika.URLParameters(connection_string))
                    channel = connection.channel()
                    channel.basic_publish('', 'ovs-watcher', str(time.time()),
                                          pika.BasicProperties(content_type='text/plain', delivery_mode=1))
                    connection.close()
                    good_node = True
                except Exception as message:
                    self.log_message(' Error during rabbitMQ test on node {0}: {1}'.format(server, message), 2)
                    if connection is not None:
                        # The failure occurred after the connection was opened: close it (best effort) so the
                        # connection is not leaked. The actual problem has been logged above already.
                        try:
                            connection.close()
                        except Exception:
                            pass
            if good_node is False:
                self.log_message(' No working rabbitMQ node could be found', 2)
                return False
            self.log_message(' RabbitMQ test OK')

        self.log_message('All tests OK')
        return True
    except Exception as ex:
        self.log_message('Unexpected exception: {0}'.format(ex), 2)
        return False
def promote_node(cluster_ip, master_ip, ip_client_map, unique_id, configure_memcached, configure_rabbitmq):
    """
    Promotes a given node to MASTER
    The node joins the Arakoon configuration cluster (unless an external config is used) and the Arakoon OVS DB
    cluster (when internally managed). Optionally memcached and RabbitMQ are configured on the node, after which
    the services are (re)started and the promote hooks are executed
    :param cluster_ip: IP of the node to promote, must be a key of ip_client_map
    :type cluster_ip: str
    :param master_ip: IP of a current master node, must be a key of ip_client_map
    :type master_ip: str
    :param ip_client_map: Clients to execute commands with, indexed by IP
    :type ip_client_map: dict
    :param unique_id: Machine ID used to look up the StorageRouter which is being promoted
    :type unique_id: str
    :param configure_memcached: Configure memcached on the node and register its endpoint
    :type configure_memcached: bool
    :param configure_rabbitmq: Configure RabbitMQ on the node, join the RabbitMQ cluster of the master and register its endpoint
    :type configure_rabbitmq: bool
    :raises RuntimeError: When not all memcache nodes are reachable or when no other master node is found
    :return: None
    :rtype: NoneType
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.servicetypelist import ServiceTypeList
    from ovs.dal.lists.servicelist import ServiceList
    from ovs.dal.hybrids.service import Service

    Toolbox.log(logger=NodeTypeController._logger, messages='Promoting node', title=True)
    service_manager = ServiceFactory.get_manager()
    if configure_memcached is True:
        if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False:
            raise RuntimeError('Not all memcache nodes can be reached which is required for promoting a node.')

    target_client = ip_client_map[cluster_ip]
    machine_id = System.get_my_machine_id(target_client)
    node_name, _ = target_client.get_hostname()
    master_client = ip_client_map[master_ip]

    storagerouter = StorageRouterList.get_by_machine_id(unique_id)
    storagerouter.node_type = 'MASTER'
    storagerouter.save()

    external_config = Configuration.get('/ovs/framework/external_config')
    if external_config is None:
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon configuration cluster')
        arakoon_installer = ArakoonInstaller(cluster_name='config')
        arakoon_installer.load(ip=master_ip)
        arakoon_installer.extend_cluster(new_ip=cluster_ip,
                                         base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
        arakoon_installer.restart_cluster_after_extending(new_ip=cluster_ip)
        service_manager.register_service(node_name=machine_id,
                                         service_metadata=arakoon_installer.service_metadata[cluster_ip])

    # Find other (arakoon) master nodes
    arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
    arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name)
    config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
    master_node_ips = [node.ip for node in config.nodes]
    if cluster_ip in master_node_ips:
        master_node_ips.remove(cluster_ip)
    if len(master_node_ips) == 0:
        raise RuntimeError('There should be at least one other master node')

    arakoon_ports = []
    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon OVS DB cluster')
        arakoon_installer = ArakoonInstaller(cluster_name=arakoon_cluster_name)
        arakoon_installer.load()
        arakoon_installer.extend_cluster(new_ip=cluster_ip,
                                         base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
        arakoon_installer.restart_cluster_after_extending(new_ip=cluster_ip)
        arakoon_ports = arakoon_installer.ports[cluster_ip]

    if configure_memcached is True:
        NodeTypeController.configure_memcached(client=target_client, logger=NodeTypeController._logger)
    NodeTypeController.add_services(client=target_client, node_type='master', logger=NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations')
    if configure_memcached is True:
        endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
        endpoint = '{0}:11211'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
            Configuration.set('/ovs/framework/memcache|endpoints', endpoints)
    if configure_rabbitmq is True:
        endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints')
        endpoint = '{0}:5672'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
            Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints)

    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services')
        # Drop the cached store clients
        # NOTE(review): presumably so they get rebuilt against the extended cluster - confirm in the factories
        PersistentFactory.store = None
        VolatileFactory.store = None

        if 'arakoon-ovsdb' not in [s.name for s in ServiceList.get_services()
                                   if s.is_internal is False or s.storagerouter.ip == cluster_ip]:
            service = Service()
            service.name = 'arakoon-ovsdb'
            service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service.ports = arakoon_ports
            service.storagerouter = storagerouter
            service.save()

    if configure_rabbitmq is True:
        NodeTypeController.configure_rabbitmq(client=target_client, logger=NodeTypeController._logger)
        # Copy rabbitmq cookie
        rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie'

        Toolbox.log(logger=NodeTypeController._logger, messages='Copying RabbitMQ cookie')
        contents = master_client.file_read(rabbitmq_cookie_file)
        master_hostname, _ = master_client.get_hostname()
        target_client.dir_create(os.path.dirname(rabbitmq_cookie_file))
        target_client.file_write(rabbitmq_cookie_file, contents)
        # '0o' octal notation: same value as the legacy '0400', but valid on Python 2.6+ as well as Python 3
        target_client.file_chmod(rabbitmq_cookie_file, mode=0o400)
        target_client.run(['rabbitmq-server', '-detached'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop_app'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'join_cluster', 'rabbit@{0}'.format(master_hostname)])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop'])
        time.sleep(5)

        # Enable HA for the rabbitMQ queues
        ServiceFactory.change_service_state(target_client, 'rabbitmq-server', 'start', NodeTypeController._logger)
        NodeTypeController.check_rabbitmq_and_enable_ha_mode(client=target_client, logger=NodeTypeController._logger)

    NodeTypeController._configure_amqp_to_volumedriver()

    Toolbox.log(logger=NodeTypeController._logger, messages='Starting services')
    services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server']
    if arakoon_metadata['internal'] is True:
        # Has been (re)started already by 'restart_cluster_after_extending'
        services.remove('arakoon-ovsdb')
    for service in services:
        if service_manager.has_service(service, client=target_client):
            ServiceFactory.change_service_state(target_client, service, 'start', NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
    NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger)

    # A second restart is only performed when run_hooks returns a truthy value
    if Toolbox.run_hooks(component='nodetype',
                         sub_component='promote',
                         logger=NodeTypeController._logger,
                         cluster_ip=cluster_ip,
                         master_ip=master_ip):
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger)

    if NodeTypeController.avahi_installed(client=target_client, logger=NodeTypeController._logger) is True:
        NodeTypeController.configure_avahi(client=target_client, node_name=node_name, node_type='master', logger=NodeTypeController._logger)
    Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id), 'MASTER')
    target_client.run(['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config'])
    Configuration.set('/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id), True)

    # The promote succeeded, so a rollback marker which might be left behind is no longer relevant
    if target_client.file_exists('/tmp/ovs_rollback'):
        target_client.file_delete('/tmp/ovs_rollback')

    Toolbox.log(logger=NodeTypeController._logger, messages='Promote complete')
def extend_arakoon(cluster_name,
                   master_storagerouter_ip,
                   storagerouter_ip,
                   cluster_basedir,
                   service_type=ServiceType.ARAKOON_CLUSTER_TYPES.FWK,
                   clustered_nodes=None):
    """
    Extend an already existing Arakoon cluster towards an additional StorageRouter

    :param cluster_name: name of the already existing arakoon cluster
    :type cluster_name: str
    :param master_storagerouter_ip: master ip address of the existing arakoon cluster (only used for logging)
        e.g. 10.100.199.11
    :type master_storagerouter_ip: str
    :param storagerouter_ip: ip of the StorageRouter to extend the cluster to
        e.g. 10.100.199.12
    :type storagerouter_ip: str
    :param cluster_basedir: absolute base directory for the arakoon on the new StorageRouter, created when missing
    :type cluster_basedir: str
    :param service_type: type of plugin for arakoon (DEFAULT=ServiceType.ARAKOON_CLUSTER_TYPES.FWK)
        * FWK: no plugin is linked
        * ABM: the albamgr plugin is linked in the db directory
        * NSM: the nsm_host plugin is linked in the db directory
    :type service_type: ovs.dal.hybrids.ServiceType.ARAKOON_CLUSTER_TYPES
    :param clustered_nodes: nodes of the arakoon cluster (including the new one), the cluster is only restarted
        when at least 1 node is given
        e.g. ['10.100.199.11', '10.100.199.12'] (DEFAULT=[])
    :type clustered_nodes: list
    :return: None
    :rtype: NoneType
    """
    nodes_to_restart = [] if clustered_nodes is None else clustered_nodes
    summary = (cluster_name, master_storagerouter_ip, storagerouter_ip, cluster_basedir)

    client = SSHClient(storagerouter_ip, username='******')

    # The base directory must exist on the new node before the cluster can be extended
    if not client.dir_exists(cluster_basedir):
        client.dir_create(cluster_basedir)

    ArakoonSetup.LOGGER.info(
        "Starting extending arakoon cluster with name `{0}`, master_ip `{1}`, slave_ip `{2}`, base_dir `{3}`".format(*summary))

    installer = ArakoonInstaller(cluster_name)
    installer.load()
    installer.extend_cluster(new_ip=storagerouter_ip,
                             base_dir=cluster_basedir,
                             locked=False,
                             log_sinks=Logger.get_sink_path('automation_lib_arakoon_server'),
                             crash_log_sinks=Logger.get_sink_path('automation_lib_arakoon_server_crash'))

    # ABM and NSM clusters need their plugin linked inside the db directory of the new node
    plugin_path = None
    if service_type == ServiceType.ARAKOON_CLUSTER_TYPES.ABM:
        plugin_path = '/usr/lib/alba/albamgr_plugin.cmxs'
    elif service_type == ServiceType.ARAKOON_CLUSTER_TYPES.NSM:
        plugin_path = '/usr/lib/alba/nsm_host_plugin.cmxs'
    if plugin_path is not None:
        client.run(['ln', '-s', plugin_path, '{0}/arakoon/{1}/db'.format(cluster_basedir, cluster_name)])

    # checking if we need to restart the given nodes
    if len(nodes_to_restart) != 0:
        ArakoonSetup.LOGGER.info("Trying to restart all given nodes of arakoon: {0}".format(nodes_to_restart))
        installer.restart_cluster_after_extending(new_ip=storagerouter_ip)
        ArakoonSetup.LOGGER.info("Finished restarting all given nodes of arakoon: {0}".format(nodes_to_restart))

    ArakoonSetup.LOGGER.info(
        "Finished extending arakoon cluster with name `{0}`, master_ip `{1}`, slave_ip `{2}`, base_dir `{3}`".format(*summary))
def demote_node(cluster_ip, master_ip, ip_client_map, unique_id, unconfigure_memcached, unconfigure_rabbitmq, offline_nodes=None):
    """
    Demotes a given node to EXTRA
    The node leaves the Arakoon OVS DB cluster (when internally managed) and the Arakoon configuration cluster
    (unless an external config is used). Optionally memcached and RabbitMQ are unconfigured, master-only
    services are removed, after which the services are restarted and the demote hooks are executed
    Most clean-up steps are best effort: failures are logged and the demote continues
    :param cluster_ip: IP of the node to demote
    :type cluster_ip: str
    :param master_ip: IP of a master node, its client is used to make RabbitMQ forget an offline node
    :type master_ip: str
    :param ip_client_map: Clients to execute commands with, indexed by IP
    :type ip_client_map: dict
    :param unique_id: Machine ID used to look up the StorageRouter which is being demoted
    :type unique_id: str
    :param unconfigure_memcached: Remove the memcached endpoint of the node and stop memcached
    :type unconfigure_memcached: bool
    :param unconfigure_rabbitmq: Remove the RabbitMQ endpoint of the node and remove the node from the RabbitMQ cluster
    :type unconfigure_rabbitmq: bool
    :param offline_nodes: StorageRouters which are currently unreachable (DEFAULT=[])
    :type offline_nodes: list
    :raises RuntimeError: When not all memcache nodes are reachable or when no other master node is found
    :return: None
    :rtype: NoneType
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList

    Toolbox.log(logger=NodeTypeController._logger, messages='Demoting node', title=True)
    service_manager = ServiceFactory.get_manager()
    if offline_nodes is None:
        offline_nodes = []

    if unconfigure_memcached is True and len(offline_nodes) == 0:
        if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False:
            raise RuntimeError('Not all memcache nodes can be reached which is required for demoting a node.')

    # Find other (arakoon) master nodes
    arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
    arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name)
    config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
    master_node_ips = [node.ip for node in config.nodes]
    shrink = False
    if cluster_ip in master_node_ips:
        shrink = True
        master_node_ips.remove(cluster_ip)
    if len(master_node_ips) == 0:
        raise RuntimeError('There should be at least one other master node')

    storagerouter = StorageRouterList.get_by_machine_id(unique_id)
    storagerouter.node_type = 'EXTRA'
    storagerouter.save()

    offline_node_ips = [node.ip for node in offline_nodes]
    if arakoon_metadata['internal'] is True and shrink is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Leaving Arakoon {0} cluster'.format(arakoon_cluster_name))
        arakoon_installer = ArakoonInstaller(cluster_name=arakoon_cluster_name)
        arakoon_installer.load()
        arakoon_installer.shrink_cluster(removal_ip=cluster_ip,
                                         offline_nodes=offline_node_ips)
        arakoon_installer.restart_cluster_after_shrinking()
    try:
        external_config = Configuration.get('/ovs/framework/external_config')
        if external_config is None and shrink is True:
            Toolbox.log(logger=NodeTypeController._logger, messages='Leaving Arakoon config cluster')
            arakoon_installer = ArakoonInstaller(cluster_name='config')
            arakoon_installer.load(ip=master_node_ips[0])
            arakoon_installer.shrink_cluster(removal_ip=cluster_ip,
                                             offline_nodes=offline_node_ips)
            arakoon_installer.restart_cluster_after_shrinking()
    except Exception as ex:
        Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to leave configuration cluster', ex], loglevel='exception')

    Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations')
    try:
        if unconfigure_memcached is True:
            endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
            endpoint = '{0}:{1}'.format(cluster_ip, 11211)
            if endpoint in endpoints:
                endpoints.remove(endpoint)
            Configuration.set('/ovs/framework/memcache|endpoints', endpoints)
        if unconfigure_rabbitmq is True:
            endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints')
            endpoint = '{0}:{1}'.format(cluster_ip, 5672)
            if endpoint in endpoints:
                endpoints.remove(endpoint)
            Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints)
    except Exception as ex:
        Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to update configurations', ex], loglevel='exception')

    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services')

        # Drop the cached store clients
        # NOTE(review): presumably so they get rebuilt against the shrunken cluster - confirm in the factories
        PersistentFactory.store = None
        VolatileFactory.store = None

        for service in storagerouter.services:
            if service.name == 'arakoon-ovsdb':
                service.delete()

    target_client = None
    if storagerouter in offline_nodes:
        # The node itself cannot be reached, the only thing which can be done is making the cluster forget it
        if unconfigure_rabbitmq is True:
            Toolbox.log(logger=NodeTypeController._logger, messages='Removing/unconfiguring offline RabbitMQ node')
            client = ip_client_map[master_ip]
            try:
                client.run(['rabbitmqctl', 'forget_cluster_node', 'rabbit@{0}'.format(storagerouter.name)])
            except Exception as ex:
                Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to forget RabbitMQ cluster node', ex], loglevel='exception')
    else:
        target_client = ip_client_map[cluster_ip]
        if unconfigure_rabbitmq is True:
            Toolbox.log(logger=NodeTypeController._logger, messages='Removing/unconfiguring RabbitMQ')
            try:
                if service_manager.has_service('rabbitmq-server', client=target_client):
                    ServiceFactory.change_service_state(target_client, 'rabbitmq-server', 'stop', NodeTypeController._logger)
                    target_client.run(['rabbitmq-server', '-detached'])
                    time.sleep(5)
                    target_client.run(['rabbitmqctl', 'stop_app'])
                    time.sleep(5)
                    target_client.run(['rabbitmqctl', 'reset'])
                    time.sleep(5)
                    target_client.run(['rabbitmqctl', 'stop'])
                    time.sleep(5)
                    target_client.file_unlink("/var/lib/rabbitmq/.erlang.cookie")
                    ServiceFactory.change_service_state(target_client, 'rabbitmq-server', 'stop', NodeTypeController._logger)  # To be sure
            except Exception as ex:
                Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to remove/unconfigure RabbitMQ', ex], loglevel='exception')

        Toolbox.log(logger=NodeTypeController._logger, messages='Stopping services')
        services = ['memcached', 'rabbitmq-server']
        if unconfigure_rabbitmq is False:
            services.remove('rabbitmq-server')
        if unconfigure_memcached is False:
            services.remove('memcached')
        for service in services:
            if service_manager.has_service(service, client=target_client):
                Toolbox.log(logger=NodeTypeController._logger, messages='Stopping service {0}'.format(service))
                try:
                    ServiceFactory.change_service_state(target_client, service, 'stop', NodeTypeController._logger)
                except Exception as ex:
                    Toolbox.log(logger=NodeTypeController._logger,
                                messages=['\nFailed to stop service {0}'.format(service), ex],
                                loglevel='exception')

        Toolbox.log(logger=NodeTypeController._logger, messages='Removing services')
        services = ['scheduled-tasks', 'webapp-api', 'volumerouter-consumer']
        for service in services:
            if service_manager.has_service(service, client=target_client):
                Toolbox.log(logger=NodeTypeController._logger, messages='Removing service {0}'.format(service))
                try:
                    ServiceFactory.change_service_state(target_client, service, 'stop', NodeTypeController._logger)
                    service_manager.remove_service(service, client=target_client)
                except Exception as ex:
                    Toolbox.log(logger=NodeTypeController._logger,
                                messages=['\nFailed to remove service {0}'.format(service), ex],
                                loglevel='exception')

        # Re-add the workers service with a queue named after the machine ID
        if service_manager.has_service('workers', client=target_client):
            service_manager.add_service(name='workers',
                                        client=target_client,
                                        params={'WORKER_QUEUE': '{0}'.format(unique_id)})
    try:
        NodeTypeController._configure_amqp_to_volumedriver()
    except Exception as ex:
        Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to configure AMQP to Storage Driver', ex], loglevel='exception')

    Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
    NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger, offline_node_ips=offline_node_ips)

    # A second restart is only performed when run_hooks returns a truthy value
    if Toolbox.run_hooks(component='nodetype',
                         sub_component='demote',
                         logger=NodeTypeController._logger,
                         cluster_ip=cluster_ip,
                         master_ip=master_ip,
                         offline_node_ips=offline_node_ips):
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger, offline_node_ips=offline_node_ips)

    if storagerouter not in offline_nodes:
        target_client = ip_client_map[cluster_ip]
        node_name, _ = target_client.get_hostname()
        if NodeTypeController.avahi_installed(client=target_client, logger=NodeTypeController._logger) is True:
            NodeTypeController.configure_avahi(client=target_client, node_name=node_name, node_type='extra', logger=NodeTypeController._logger)
    Configuration.set('/ovs/framework/hosts/{0}/type'.format(storagerouter.machine_id), 'EXTRA')

    # An existing rollback marker on a reachable node gets its contents replaced by 'rollback'
    if target_client is not None and target_client.file_exists('/tmp/ovs_rollback'):
        target_client.file_write('/tmp/ovs_rollback', 'rollback')

    Toolbox.log(logger=NodeTypeController._logger, messages='Demote complete', title=True)
def test_arakoon_collapse(self):
    """
    Validate the log messages produced by GenericController.collapse_arakoon
    2 StorageRouters are modeled with an Arakoon cluster of every cluster type on both of them. StorageRouter 2 is
    made unreachable, so collapsing can only be attempted on StorageRouter 1:
        * Clusters for which the collapse command is mocked are expected to collapse successfully
        * Clusters for which the collapse command is not mocked are expected to fail
        * Externally managed clusters are expected not to be collapsed at all
    """
    # Set up the test
    structure = DalHelper.build_dal_structure(structure={'storagerouters': [1, 2]})
    storagerouter_1 = structure['storagerouters'][1]
    storagerouter_2 = structure['storagerouters'][2]
    for storagerouter in (storagerouter_1, storagerouter_2):
        MockedSSHClient._run_returns[storagerouter.ip] = {}

    # Make sure we cover all Arakoon cluster types
    cluster_types = ServiceType.ARAKOON_CLUSTER_TYPES
    clusters_to_create = {cluster_types.SD: [{'name': 'unittest-voldrv', 'internal': True, 'success': True}],
                          cluster_types.CFG: [{'name': 'unittest-cacc', 'internal': True, 'success': True}],
                          cluster_types.FWK: [{'name': 'unittest-ovsdb', 'internal': True, 'success': False}],
                          cluster_types.ABM: [{'name': 'unittest-cluster-1-abm', 'internal': True, 'success': False},
                                              {'name': 'unittest-random-abm-name', 'internal': False, 'success': True}],
                          cluster_types.NSM: [{'name': 'unittest-cluster-1-nsm_0', 'internal': True, 'success': True}]}
    self.assertEqual(first=sorted(clusters_to_create.keys()),
                     second=sorted(ServiceType.ARAKOON_CLUSTER_TYPES.keys()),
                     msg='An Arakoon cluster type has been removed or added, please update this test accordingly')

    # Create all Arakoon clusters and related services
    failed_clusters = []
    external_clusters = []
    successful_clusters = []
    for cluster_type, cluster_infos in clusters_to_create.iteritems():
        filesystem = cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.CFG
        for cluster_info in cluster_infos:
            internal = cluster_info['internal']
            cluster_name = cluster_info['name']

            base_dir = DalHelper.CLUSTER_DIR.format(cluster_name)
            arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
            arakoon_installer.create_cluster(cluster_type=cluster_type,
                                             ip=storagerouter_1.ip,
                                             base_dir=base_dir,
                                             internal=internal)
            arakoon_installer.start_cluster()
            arakoon_installer.extend_cluster(new_ip=storagerouter_2.ip,
                                             base_dir=base_dir)

            service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
            if cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.ABM:
                service_type_name = ServiceType.SERVICE_TYPES.ALBA_MGR
            elif cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.NSM:
                service_type_name = ServiceType.SERVICE_TYPES.NS_MGR
            else:
                service_type_name = ServiceType.SERVICE_TYPES.ARAKOON
            service_type = ServiceTypeList.get_by_name(service_type_name)

            if internal is not True:
                # Externally managed clusters are modeled without StorageRouter and must not be collapsed
                DalHelper.create_service(service_name=service_name,
                                         service_type=service_type)
                external_clusters.append(cluster_name)
                continue

            for storagerouter in (storagerouter_1, storagerouter_2):
                DalHelper.create_service(service_name=service_name,
                                         service_type=service_type,
                                         storagerouter=storagerouter,
                                         ports=arakoon_installer.ports[storagerouter.ip])

            if cluster_info['success'] is not True:
                # For successful False clusters we don't emulate the collapse, thus making it fail
                failed_clusters.append(cluster_name)
                continue

            if filesystem is True:
                config_path = ArakoonClusterConfig.CONFIG_FILE.format(cluster_name)
            else:
                config_path = Configuration.get_configuration_path(ArakoonClusterConfig.CONFIG_KEY.format(cluster_name))
            collapse_command_1 = 'arakoon --collapse-local 1 2 -config {0}'.format(config_path)
            collapse_command_2 = 'arakoon --collapse-local 2 2 -config {0}'.format(config_path)
            MockedSSHClient._run_returns[storagerouter_1.ip][collapse_command_1] = None
            MockedSSHClient._run_returns[storagerouter_2.ip][collapse_command_2] = None
            successful_clusters.append(cluster_name)

    # Start collapse and make it fail for all clusters on StorageRouter 2
    SSHClient._raise_exceptions[storagerouter_2.ip] = {'users': ['ovs'],
                                                       'exception': UnableToConnectException('No route to host')}
    GenericController.collapse_arakoon()

    # Verify all log messages for each type of cluster
    generic_logs = Logger._logs.get('lib', {})
    for cluster_name in successful_clusters + failed_clusters + external_clusters:
        collect_msg = ('DEBUG', 'Collecting info for cluster {0}'.format(cluster_name))
        unreachable_msg = ('ERROR', 'Could not collapse any cluster on {0} (not reachable)'.format(storagerouter_2.name))
        end_collapse_msg = ('DEBUG', 'Collapsing cluster {0} on {1} completed'.format(cluster_name, storagerouter_1.ip))
        start_collapse_msg = ('DEBUG', 'Collapsing cluster {0} on {1}'.format(cluster_name, storagerouter_1.ip))
        failed_collapse_msg = ('ERROR', 'Collapsing cluster {0} on {1} failed'.format(cluster_name, storagerouter_1.ip))

        if cluster_name in successful_clusters:
            expect_logged = True
            messages_to_validate = [collect_msg, unreachable_msg, start_collapse_msg, end_collapse_msg]
        elif cluster_name in failed_clusters:
            expect_logged = True
            messages_to_validate = [collect_msg, unreachable_msg, start_collapse_msg, failed_collapse_msg]
        else:
            # External cluster: none of the collapse messages may have been logged
            expect_logged = False
            messages_to_validate = [collect_msg, start_collapse_msg, end_collapse_msg]

        for severity, message in messages_to_validate:
            if expect_logged is True:
                self.assertIn(member=message,
                              container=generic_logs,
                              msg='Expected to find log message: {0}'.format(message))
                self.assertEqual(first=severity,
                                 second=generic_logs[message],
                                 msg='Log message {0} is of severity {1} expected {2}'.format(message, generic_logs[message], severity))
            else:
                self.assertNotIn(member=message,
                                 container=generic_logs,
                                 msg='Did not expect to find log message: {0}'.format(message))

    # Collapse should always have a 'finished' message since each cluster should be attempted to be collapsed
    for general_message in ['Arakoon collapse started', 'Arakoon collapse finished']:
        self.assertIn(member=general_message,
                      container=generic_logs,
                      msg='Expected to find log message: {0}'.format(general_message))
def promote_or_demote_node(node_action, cluster_ip=None, execute_rollback=False):
    """
    Promote or demote a node
    Without cluster_ip (or when promoting) the local node is handled and its password is requested. Demoting with a
    cluster_ip is meant for nodes which are offline: the node must be unreachable and another reachable MASTER
    must be present.
    Apart from an invalid node_action, every failure is logged and terminates the process with exit code 1
    :param node_action: Demote or promote
    :type node_action: str
    :param cluster_ip: IP of node to promote or demote
    :type cluster_ip: str
    :param execute_rollback: In case of failure revert the changes made by executing the opposite action. When False,
                             the opposite action is written to /tmp/ovs_rollback on the local node instead
    :type execute_rollback: bool
    :raises ValueError: When node_action is neither 'promote' nor 'demote'
    :return: None
    """
    if node_action not in ('promote', 'demote'):
        raise ValueError('Nodes can only be promoted or demoted')

    logger = NodeTypeController._logger
    Toolbox.log(logger=logger, messages='Open vStorage Setup - {0}'.format(node_action.capitalize()), boxed=True)
    try:
        Toolbox.log(logger=logger, messages='Collecting information', title=True)

        machine_id = System.get_my_machine_id()
        setup_completed = Configuration.get('/ovs/framework/hosts/{0}/setupcompleted'.format(machine_id))
        if setup_completed is False:
            raise RuntimeError('No local OVS setup found.')

        if cluster_ip:
            if not re.match(Toolbox.regex_ip, cluster_ip):
                raise RuntimeError('Incorrect IP provided ({0})'.format(cluster_ip))
            # Validate the type of the node which has been specified instead of the local node
            machine_id = System.get_my_machine_id(SSHClient(endpoint=cluster_ip))

        node_type = Configuration.get('/ovs/framework/hosts/{0}/type'.format(machine_id))
        if node_action == 'promote' and node_type == 'MASTER':
            raise RuntimeError('This node is already master.')
        if node_action == 'demote' and node_type == 'EXTRA':
            raise RuntimeError('This node should be a master.')
        if node_type not in ['MASTER', 'EXTRA']:
            raise RuntimeError('This node is not correctly configured.')

        master_ip = None
        offline_nodes = []

        online = True
        target_client = None
        if node_action == 'demote' and cluster_ip:  # Demote an offline node
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.lib.storagedriver import StorageDriverController

            ip = cluster_ip
            unique_id = None
            ip_client_map = {}
            for storage_router in StorageRouterList.get_storagerouters():
                try:
                    sr_client = SSHClient(storage_router.ip, username='******')
                    if storage_router.node_type == 'MASTER':
                        master_ip = storage_router.ip
                    ip_client_map[storage_router.ip] = sr_client
                except UnableToConnectException:
                    if storage_router.ip == cluster_ip:
                        online = False
                        unique_id = storage_router.machine_id
                        StorageDriverController.mark_offline(storagerouter_guid=storage_router.guid)
                    offline_nodes.append(storage_router)
            if online is True:
                raise RuntimeError("If the node is online, please use 'ovs setup demote' executed on the node you wish to demote")
            if master_ip is None:
                raise RuntimeError('Failed to retrieve another responsive MASTER node')
        else:
            target_password = Toolbox.ask_validate_password(ip='127.0.0.1', logger=logger)
            target_client = SSHClient('127.0.0.1', username='******', password=target_password)

            unique_id = System.get_my_machine_id(target_client)
            ip = Configuration.get('/ovs/framework/hosts/{0}/ip'.format(unique_id))

            storagerouter_info = NodeTypeController.retrieve_storagerouter_info_via_host(ip=target_client.ip,
                                                                                         password=target_password)
            node_ips = [sr_info['ip'] for sr_info in storagerouter_info.itervalues()]
            master_node_ips = [sr_info['ip'] for sr_info in storagerouter_info.itervalues()
                               if sr_info['type'] == 'master' and sr_info['ip'] != ip]
            if len(master_node_ips) == 0:
                if node_action == 'promote':
                    raise RuntimeError('No master node could be found')
                raise RuntimeError('It is not possible to remove the only master')

            master_ip = master_node_ips[0]
            ip_client_map = {}
            for node_ip in node_ips:
                ip_client_map[node_ip] = SSHClient(node_ip, username='******')

        if node_action == 'demote':
            # An internally managed Arakoon cluster which only lives on this node would lose its only member
            for cluster_name in Configuration.list('/ovs/arakoon'):
                config = ArakoonClusterConfig(cluster_id=cluster_name)
                arakoon_client = ArakoonInstaller.build_client(config)
                metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY))
                single_node_on_target = len(config.nodes) == 1 and config.nodes[0].ip == ip
                if single_node_on_target and metadata.get('internal') is True:
                    raise RuntimeError('Demote is not supported when single node Arakoon cluster(s) are present on the node to be demoted.')

        configure_rabbitmq = Toolbox.is_service_internally_managed(service='rabbitmq')
        configure_memcached = Toolbox.is_service_internally_managed(service='memcached')

        promote_kwargs = {'cluster_ip': ip,
                          'master_ip': master_ip,
                          'ip_client_map': ip_client_map,
                          'unique_id': unique_id,
                          'configure_memcached': configure_memcached,
                          'configure_rabbitmq': configure_rabbitmq}
        demote_kwargs = {'cluster_ip': ip,
                         'master_ip': master_ip,
                         'ip_client_map': ip_client_map,
                         'unique_id': unique_id,
                         'unconfigure_memcached': configure_memcached,
                         'unconfigure_rabbitmq': configure_rabbitmq,
                         'offline_nodes': offline_nodes}
        if node_action == 'promote':
            action, action_kwargs = NodeTypeController.promote_node, promote_kwargs
            revert, revert_kwargs = NodeTypeController.demote_node, demote_kwargs
            revert_name = 'demote'
        else:
            action, action_kwargs = NodeTypeController.demote_node, demote_kwargs
            revert, revert_kwargs = NodeTypeController.promote_node, promote_kwargs
            revert_name = 'promote'

        try:
            action(**action_kwargs)
        except Exception:
            # Either revert right away or leave a marker behind stating which action reverts the changes
            if execute_rollback is True:
                revert(**revert_kwargs)
            elif target_client is not None:
                target_client.file_write('/tmp/ovs_rollback', revert_name)
            raise

        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger, messages='{0} complete.'.format(node_action.capitalize()), boxed=True)
    except Exception as exception:
        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger,
                    messages=['An unexpected error occurred:', str(exception)],
                    boxed=True,
                    loglevel='exception')
        sys.exit(1)
    except KeyboardInterrupt:
        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger,
                    messages='This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                    boxed=True,
                    loglevel='error')
        sys.exit(1)
def shrink_vpool(cls, storagedriver_guid, offline_storage_router_guids=None):
    """
    Removes a StorageDriver (if its the last StorageDriver for a vPool, the vPool is removed as well)

    The removal is executed in the following order:
        * Validate the StorageDriver, the reachability of the related StorageRouters and the vPool mount point
        * Put the vPool in SHRINKING or DELETING status
        * Remove stale vDisks and move the MDS services away from the StorageRouter
        * Stop the services and clean up configuration, model and directories
        * Remove the vPool (last StorageDriver) or update its metadata and run a DTL checkup
        * Remove the internally managed voldrv Arakoon cluster when no vPools are left
        * Remove the watcher volumedriver service when no StorageDrivers are left on the StorageRouter
    :param storagedriver_guid: Guid of the StorageDriver to remove
    :type storagedriver_guid: str
    :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                         WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS
                                         Defaults to no offline StorageRouters
    :type offline_storage_router_guids: list
    :raises RuntimeError: - When the offline StorageRouters do not match the specified offline guids
                          - When processes keep the vPool mount point occupied
                          - When no responsive StorageDriver could be found
                          - When not all MDS services could be migrated away
                          - When 1 or more errors occurred during the actual removal
    :return: None
    :rtype: NoneType
    """
    # TODO: Add logging
    # TODO: Unit test individual pieces of code
    if offline_storage_router_guids is None:
        # A fresh list per call instead of a mutable default which is shared between calls
        offline_storage_router_guids = []

    # Validations
    storagedriver = StorageDriver(storagedriver_guid)
    storagerouter = storagedriver.storagerouter
    cls._logger.info('StorageDriver {0} - Deleting StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
    vp_installer.validate(storagedriver=storagedriver)

    sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                          storagedriver=storagedriver)

    cls._logger.info('StorageDriver {0} - Checking availability of related StorageRouters'.format(storagedriver.guid))
    sr_client_map = SSHClient.get_clients(endpoints=[sd.storagerouter for sd in vp_installer.vpool.storagedrivers],
                                          user_names=['root'])
    sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(storagerouter, {}).get('root'),
                                          storagerouter=storagerouter,
                                          vp_installer=vp_installer,
                                          sd_installer=sd_installer)

    # The unreachable StorageRouters must be exactly the ones the caller declared offline
    offline_srs = sr_client_map.pop('offline')
    if sorted([sr.guid for sr in offline_srs]) != sorted(offline_storage_router_guids):
        raise RuntimeError('Not all StorageRouters are reachable')

    if storagerouter not in offline_srs:
        # '|| true' makes the command succeed when lsof does not find any process
        mtpt_pids = sr_installer.root_client.run("lsof -t +D '/mnt/{0}' || true".format(vp_installer.name.replace(r"'", r"'\''")),
                                                 allow_insecure=True).splitlines()
        if len(mtpt_pids) > 0:
            raise RuntimeError('vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'.format(', '.join(mtpt_pids)))

    # Retrieve reachable StorageDrivers
    reachable_storagedrivers = []
    for sd in vp_installer.vpool.storagedrivers:
        if sd.storagerouter not in sr_client_map:
            # StorageRouter is offline
            continue

        sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vp_installer.vpool.guid, sd.storagedriver_id)
        if Configuration.exists(sd_key) is True:
            path = Configuration.get_configuration_path(sd_key)
            with remote(sd.storagerouter.ip, [LocalStorageRouterClient]) as rem:
                try:
                    lsrc = rem.LocalStorageRouterClient(path)
                    lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                    cls._logger.info('StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'.format(storagedriver.guid, sd.name, sd.storagerouter.ip))
                    reachable_storagedrivers.append(sd)
                except Exception as exception:
                    # Connection failures only mean this StorageDriver is not responsive, anything else is unexpected
                    if not is_connection_failure(exception):
                        raise

    if len(reachable_storagedrivers) == 0:
        raise RuntimeError('Could not find any responsive node in the cluster')

    # Start removal
    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
    else:
        vp_installer.update_status(status=VPool.STATUSES.DELETING)

    # Clean up stale vDisks
    cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(storagedriver.guid))
    VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

    # Reconfigure the MDSes
    cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(storagedriver.guid))
    for vdisk_guid in storagerouter.vdisks_guids:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk_guid,
                                               excluded_storagerouter_guids=[storagerouter.guid] + offline_storage_router_guids)
        except Exception:
            cls._logger.exception('StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'.format(storagedriver.guid, vdisk_guid))

    # Validate that all MDSes on current StorageRouter have been moved away
    # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
    vdisks = []
    for mds in vp_installer.mds_services:
        for junction in mds.vdisks:
            vdisk = junction.vdisk
            if vdisk in vdisks:
                continue
            vdisks.append(vdisk)
            cls._logger.critical('StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'.format(storagedriver.guid, vdisk.guid, vdisk.name))
    if len(vdisks) > 0:
        # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        raise RuntimeError('Not all MDS Services have been successfully migrated away')

    # Start with actual removal
    # Every step is executed regardless of the outcome of the previous ones, failures are accumulated with '|='
    # The former '&=' could never turn the initial False into True, which silently discarded every reported error
    # NOTE(review): the installer helpers are assumed to return True when they ran into an error - confirm against the installers
    errors_found = False
    if storagerouter not in offline_srs:
        errors_found |= bool(sd_installer.stop_services())

    errors_found |= bool(vp_installer.configure_cluster_registry(exclude=[storagedriver], apply_on=reachable_storagedrivers))
    errors_found |= bool(vp_installer.update_node_distance_map())
    errors_found |= bool(vp_installer.remove_mds_services())
    errors_found |= bool(sd_installer.clean_config_management())
    errors_found |= bool(sd_installer.clean_model())

    if storagerouter not in offline_srs:
        errors_found |= bool(sd_installer.clean_directories(mountpoints=StorageRouterController.get_mountpoints(client=sr_installer.root_client)))

        try:
            DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)
        except Exception:
            cls._logger.exception('StorageDriver {0} - Synchronizing disks with reality failed'.format(storagedriver.guid))
            errors_found = True

    if vp_installer.storagedriver_amount > 1:
        # Update the vPool metadata and run DTL checkup
        vp_installer.vpool.metadata['caching_info'].pop(sr_installer.storagerouter.guid, None)
        vp_installer.vpool.save()

        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
        except Exception:
            cls._logger.exception('StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'.format(storagedriver.guid, vp_installer.name, vp_installer.vpool.guid))
    else:
        cls._logger.info('StorageDriver {0} - Removing vPool from model'.format(storagedriver.guid))
        # Clean up model
        try:
            vp_installer.vpool.delete()
        except Exception:
            errors_found = True
            cls._logger.exception('StorageDriver {0} - Cleaning up vPool from the model failed'.format(storagedriver.guid))
        Configuration.delete('/ovs/vpools/{0}'.format(vp_installer.vpool.guid))

    cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(storagedriver.guid))
    try:
        MDSServiceController.mds_checkup()
    except Exception:
        cls._logger.exception('StorageDriver {0} - MDS checkup failed'.format(storagedriver.guid))

    # Update vPool status
    if errors_found is True:
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise RuntimeError('1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information')

    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('StorageDriver {0} - Deleted StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    # Remove the voldrv Arakoon cluster when the last vPool is gone and the cluster is internally managed
    if len(VPoolList.get_vpools()) == 0:
        cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
        if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)['internal'] is True:
            cls._logger.debug('StorageDriver {0} - Removing Arakoon cluster {1}'.format(storagedriver.guid, cluster_name))
            try:
                installer = ArakoonInstaller(cluster_name=cluster_name)
                installer.load()
                installer.delete_cluster()
            except Exception:
                cls._logger.exception('StorageDriver {0} - Delete voldrv Arakoon cluster failed'.format(storagedriver.guid))
            service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
            # Iterate over a copy, services are deleted while looping
            for service in list(service_type.services):
                if service.name == service_name:
                    service.delete()

    # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
    if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
        try:
            if cls._service_manager.has_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client):
                cls._service_manager.stop_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
                cls._service_manager.remove_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
        except Exception:
            cls._logger.exception('StorageDriver {0} - {1} service deletion failed'.format(storagedriver.guid, ServiceFactory.SERVICE_WATCHER_VOLDRV))
def services_running(self, target):
    """
    Check all services are running

    Which checks are executed depends on the target:
        * 'config' and 'framework': The configuration store can be listed and the Arakoon configuration
          stored in it is written to Configuration.CACC_LOCATION whenever it differs from Watcher.LOG_CONTENTS
        * 'framework': The volatile and the persistent store respond (maximum 5 tries each)
        * 'volumedriver': The voldrv Arakoon cluster responds (maximum 5 tries)
        * 'framework' and 'volumedriver': At least 1 RabbitMQ endpoint accepts a test message
    Every failure is logged through self.log_message, exceptions never leave this method
    :param target: Target to check
    :type target: str
    :return: Boolean, True when all checks for the target succeeded
    :rtype: bool
    """
    try:
        key = 'ovs-watcher-{0}'.format(str(uuid.uuid4()))
        value = str(time.time())

        if target in ['config', 'framework']:
            self.log_message(target, 'Testing configuration store...', 0)
            from ovs.extensions.generic.configuration import Configuration
            try:
                Configuration.list('/')
            except Exception as ex:
                self.log_message(target, ' Error during configuration store test: {0}'.format(ex), 2)
                return False

            from ovs.extensions.db.arakooninstaller import ArakoonInstaller, ArakoonClusterConfig
            from ovs_extensions.db.arakoon.pyrakoon.pyrakoon.compat import NoGuarantee

            # Build a client based on the locally stored configuration and fetch the configuration stored in the cluster itself
            with open(Configuration.CACC_LOCATION) as config_file:
                contents = config_file.read()
            config = ArakoonClusterConfig(cluster_id=Configuration.ARAKOON_NAME, load_config=False)
            config.read_config(contents=contents)
            client = ArakoonInstaller.build_client(config)
            contents = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY, consistency=NoGuarantee())
            if Watcher.LOG_CONTENTS != contents:
                try:
                    config.read_config(contents=contents)  # Validate whether the contents are not corrupt
                except Exception as ex:
                    self.log_message(target, ' Configuration stored in configuration store seems to be corrupt: {0}'.format(ex), 2)
                    return False
                # Write to a temporary file and rename it afterwards, so the local file is never partially written
                temp_filename = '{0}~'.format(Configuration.CACC_LOCATION)
                with open(temp_filename, 'w') as config_file:
                    config_file.write(contents)
                    config_file.flush()
                    os.fsync(config_file)
                os.rename(temp_filename, Configuration.CACC_LOCATION)
                Watcher.LOG_CONTENTS = contents
            self.log_message(target, ' Configuration store OK', 0)

        if target == 'framework':
            # Volatile
            self.log_message(target, 'Testing volatile store...', 0)
            max_tries = 5
            tries = 0
            while tries < max_tries:
                try:
                    try:
                        logging.disable(logging.WARNING)
                        from ovs.extensions.storage.volatilefactory import VolatileFactory
                        VolatileFactory.store = None
                        volatile = VolatileFactory.get_client()
                        volatile.set(key, value)
                        if volatile.get(key) == value:
                            volatile.delete(key)
                            break
                        volatile.delete(key)
                    finally:
                        logging.disable(logging.NOTSET)
                except Exception as message:
                    self.log_message(target, ' Error during volatile store test: {0}'.format(message), 2)
                key = 'ovs-watcher-{0}'.format(str(uuid.uuid4()))  # Get another key
                time.sleep(1)
                tries += 1
            if tries == max_tries:
                self.log_message(target, ' Volatile store not working correctly', 2)
                return False
            self.log_message(target, ' Volatile store OK after {0} tries'.format(tries), 0)

            # Persistent
            self.log_message(target, 'Testing persistent store...', 0)
            max_tries = 5
            tries = 0
            while tries < max_tries:
                try:
                    try:
                        logging.disable(logging.WARNING)
                        persistent = PersistentFactory.get_client()
                        persistent.nop()
                        break
                    finally:
                        logging.disable(logging.NOTSET)
                except Exception as message:
                    self.log_message(target, ' Error during persistent store test: {0}'.format(message), 2)
                time.sleep(1)
                tries += 1
            if tries == max_tries:
                self.log_message(target, ' Persistent store not working correctly', 2)
                return False
            self.log_message(target, ' Persistent store OK after {0} tries'.format(tries), 0)

        if target == 'volumedriver':
            # Arakoon, voldrv cluster
            self.log_message(target, 'Testing arakoon (voldrv)...', 0)
            max_tries = 5
            tries = 0
            while tries < max_tries:
                try:
                    from ovs.extensions.generic.configuration import Configuration
                    from ovs_extensions.storage.persistent.pyrakoonstore import PyrakoonStore
                    cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|voldrv'))
                    configuration = Configuration.get('/ovs/arakoon/{0}/config'.format(cluster_name), raw=True)
                    client = PyrakoonStore(cluster=cluster_name, configuration=configuration)
                    client.nop()
                    break
                except Exception as message:
                    self.log_message(target, ' Error during arakoon (voldrv) test: {0}'.format(message), 2)
                time.sleep(1)
                tries += 1
            if tries == max_tries:
                self.log_message(target, ' Arakoon (voldrv) not working correctly', 2)
                return False
            self.log_message(target, ' Arakoon (voldrv) OK', 0)

        if target in ['framework', 'volumedriver']:
            # RabbitMQ
            self.log_message(target, 'Test rabbitMQ...', 0)
            import pika
            from ovs.extensions.generic.configuration import Configuration
            messagequeue = Configuration.get('/ovs/framework/messagequeue')
            rmq_servers = messagequeue['endpoints']
            good_node = False
            # Every endpoint is tested, a single working endpoint is sufficient
            for server in rmq_servers:
                try:
                    connection_string = '{0}://{1}:{2}@{3}/%2F'.format(messagequeue['protocol'],
                                                                       messagequeue['user'],
                                                                       messagequeue['password'],
                                                                       server)
                    connection = pika.BlockingConnection(pika.URLParameters(connection_string))
                    published = False
                    try:
                        channel = connection.channel()
                        channel.basic_publish('', 'ovs-watcher', str(time.time()),
                                              pika.BasicProperties(content_type='text/plain', delivery_mode=1))
                        published = True
                    finally:
                        # Always release the connection, it used to leak whenever publishing failed
                        try:
                            connection.close()
                        except Exception:
                            # When publishing failed, that failure is the one to report. Only a close failure
                            # after a successful publish is raised, just like before
                            if published is True:
                                raise
                    good_node = True
                except Exception as message:
                    self.log_message(target, ' Error during rabbitMQ test on node {0}: {1}'.format(server, message), 2)
            if good_node is False:
                self.log_message(target, ' No working rabbitMQ node could be found', 2)
                return False
            self.log_message(target, ' RabbitMQ test OK', 0)
            self.log_message(target, 'All tests OK', 0)
            return True

        self.log_message(target, 'All tests OK', 0)
        return True
    except Exception as ex:
        self.log_message(target, 'Unexpected exception: {0}'.format(ex), 2)
        return False
def migrate(): """ Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed. This code will typically contain: * "dangerous" migration code (it needs certain running services) * Migration code depending on a cluster-wide state * ... * Successfully finishing a piece of migration code, should create an entry in /ovs/framework/migration in case it should not be executed again * Eg: /ovs/framework/migration|stats_monkey_integration: True """ MigrationController._logger.info('Preparing out of band migrations...') from ovs.dal.lists.servicetypelist import ServiceTypeList from ovs.dal.lists.storagedriverlist import StorageDriverList from ovs.dal.lists.storagerouterlist import StorageRouterList from ovs.dal.lists.vpoollist import VPoolList from ovs.extensions.db.arakooninstaller import ArakoonInstaller from ovs.extensions.generic.configuration import Configuration from ovs.extensions.generic.sshclient import SSHClient from ovs_extensions.generic.toolbox import ExtensionsToolbox from ovs.extensions.migration.migration.ovsmigrator import ExtensionMigrator from ovs.extensions.packages.packagefactory import PackageFactory from ovs_extensions.services.interfaces.systemd import Systemd from ovs.extensions.services.servicefactory import ServiceFactory from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration from ovs.lib.helpers.storagedriver.installer import StorageDriverInstaller MigrationController._logger.info('Start out of band migrations...') service_manager = ServiceFactory.get_manager() sr_client_map = {} for storagerouter in StorageRouterList.get_storagerouters(): sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter.ip, # Is triggered during post-update code too during which the ovs-watcher-framework service is still down and thus not refreshing the heartbeat --> use IP i/o StorageRouter username='******') ######################################################### # 
Addition of 'ExecReload' for AlbaProxy SystemD services if ServiceFactory.get_service_type() == 'systemd': changed_clients = set() for storagedriver in StorageDriverList.get_storagedrivers(): root_client = sr_client_map[storagedriver.storagerouter_guid] for alba_proxy in storagedriver.alba_proxies: service = alba_proxy.service service_name = 'ovs-{0}'.format(service.name) if not service_manager.has_service(name=service_name, client=root_client): continue if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)): continue try: service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name) changed_clients.add(root_client) except: MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name)) for root_client in changed_clients: root_client.run(['systemctl', 'daemon-reload']) ################################################################## # Adjustment of open file descriptors for Arakoon services to 8192 changed_clients = set() for storagerouter in StorageRouterList.get_storagerouters(): root_client = sr_client_map[storagerouter.guid] for service_name in service_manager.list_services(client=root_client): if not service_name.startswith('ovs-arakoon-'): continue if ServiceFactory.get_service_type() == 'systemd': path = '/lib/systemd/system/{0}.service'.format(service_name) check = 'LimitNOFILE=8192' else: path = '/etc/init/{0}.conf'.format(service_name) check = 'limit nofile 8192 8192' if not root_client.file_exists(path): continue if check in root_client.file_read(path): continue try: service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name) changed_clients.add(root_client) ExtensionsToolbox.edit_version_file(client=root_client, package_name='arakoon', old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name)) except: 
MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name)) for root_client in changed_clients: root_client.run(['systemctl', 'daemon-reload']) ############################# # Migrate to multiple proxies for storagedriver in StorageDriverList.get_storagedrivers(): vpool = storagedriver.vpool root_client = sr_client_map[storagedriver.storagerouter_guid] for alba_proxy in storagedriver.alba_proxies: # Rename alba_proxy service in model service = alba_proxy.service old_service_name = 'albaproxy_{0}'.format(vpool.name) new_service_name = 'albaproxy_{0}_0'.format(vpool.name) if old_service_name != service.name: continue service.name = new_service_name service.save() if not service_manager.has_service(name=old_service_name, client=root_client): continue old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name) if not Configuration.exists(key=old_configuration_key): continue # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service) ExtensionsToolbox.edit_version_file(client=root_client, package_name='alba', old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, old_service_name), new_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, new_service_name)) # Register new service and remove old service service_manager.add_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, params=Configuration.get(old_configuration_key), target_name='ovs-{0}'.format(new_service_name)) # Update scrub proxy config proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid) proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key) if proxy_config is not None: fragment_cache = proxy_config.get(StorageDriverConfiguration.CACHE_FRAGMENT, ['none', {}]) if fragment_cache[0] == 'alba' and 
fragment_cache[1].get('cache_on_write') is True: # Accelerated ALBA configured fragment_cache_scrub_info = copy.deepcopy(fragment_cache) fragment_cache_scrub_info[1]['cache_on_read'] = False proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid) proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key) if proxy_scrub_config is not None and proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] == ['none']: proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] = fragment_cache_scrub_info Configuration.set(key=proxy_scrub_config_key, value=proxy_scrub_config) # Update 'backend_connection_manager' section changes = False storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id) if 'backend_connection_manager' not in storagedriver_config.configuration: continue current_config = storagedriver_config.configuration['backend_connection_manager'] if current_config.get('backend_type') != 'MULTI': changes = True backend_connection_manager = {'backend_type': 'MULTI'} for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])): backend_connection_manager[str(index)] = copy.deepcopy(current_config) # noinspection PyUnresolvedReferences backend_connection_manager[str(index)]['alba_connection_use_rora'] = True # noinspection PyUnresolvedReferences backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000 # noinspection PyUnresolvedReferences for key, value in backend_connection_manager[str(index)].items(): if key.startswith('backend_interface'): backend_connection_manager[key] = value # noinspection PyUnresolvedReferences del backend_connection_manager[str(index)][key] for key, value in {'backend_interface_retries_on_error': 5, 'backend_interface_retry_interval_secs': 1, 'backend_interface_retry_backoff_multiplier': 2.0}.iteritems(): if key not 
in backend_connection_manager: backend_connection_manager[key] = value else: backend_connection_manager = current_config for value in backend_connection_manager.values(): if isinstance(value, dict): for key, val in value.items(): if key.startswith('backend_interface'): backend_connection_manager[key] = val changes = True del value[key] for key, value in {'backend_interface_retries_on_error': 5, 'backend_interface_retry_interval_secs': 1, 'backend_interface_retry_backoff_multiplier': 2.0}.iteritems(): if key not in backend_connection_manager: changes = True backend_connection_manager[key] = value if changes is True: storagedriver_config.clear_backend_connection_manager() storagedriver_config.configure_backend_connection_manager(**backend_connection_manager) storagedriver_config.save(root_client) # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section) ExtensionsToolbox.edit_version_file(client=root_client, package_name='volumedriver', old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name))) if service_manager.__class__ == Systemd: root_client.run(['systemctl', 'daemon-reload']) ######################################## # Update metadata_store_bits information vpools = VPoolList.get_vpools() for vpool in vpools: bits = None for storagedriver in vpool.storagedrivers: key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name) if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key): if bits is None: entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name), client=sr_client_map[storagedriver.storagerouter_guid], entries=['METADATASTORE_BITS=']) if len(entries) == 1: bits = entries[0].split('=')[-1] bits = int(bits) if bits.isdigit() else 5 if bits is not None: try: content = Configuration.get(key=key) content['METADATASTORE_BITS'] = bits 
Configuration.set(key=key, value=content) except: MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name)) if bits is not None: vpool.metadata_store_bits = bits vpool.save() ##################################### # Update the vPool metadata structure def _update_metadata_structure(metadata): metadata = copy.deepcopy(metadata) cache_structure = {'read': False, 'write': False, 'is_backend': False, 'quota': None, 'backend_info': {'name': None, # Will be filled in when is_backend is true 'backend_guid': None, 'alba_backend_guid': None, 'policies': None, 'preset': None, 'arakoon_config': None, 'connection_info': {'client_id': None, 'client_secret': None, 'host': None, 'port': None, 'local': None}} } structure_map = {StorageDriverConfiguration.CACHE_BLOCK: {'read': 'block_cache_on_read', 'write': 'block_cache_on_write', 'quota': 'quota_bc', 'backend_prefix': 'backend_bc_{0}'}, StorageDriverConfiguration.CACHE_FRAGMENT: {'read': 'fragment_cache_on_read', 'write': 'fragment_cache_on_write', 'quota': 'quota_fc', 'backend_prefix': 'backend_aa_{0}'}} if 'arakoon_config' in metadata['backend']: # Arakoon config should be placed under the backend info metadata['backend']['backend_info']['arakoon_config'] = metadata['backend'].pop('arakoon_config') if 'connection_info' in metadata['backend']: # Connection info sohuld be placed under the backend info metadata['backend']['backend_info']['connection_info'] = metadata['backend'].pop('connection_info') if 'caching_info' not in metadata: # Caching info is the new key would_be_caching_info = {} metadata['caching_info'] = would_be_caching_info # Extract all caching data for every storagerouter current_caching_info = metadata['backend'].pop('caching_info') # Pop to mutate metadata for storagerouter_guid in current_caching_info.iterkeys(): current_cache_data = current_caching_info[storagerouter_guid] storagerouter_caching_info = {} 
would_be_caching_info[storagerouter_guid] = storagerouter_caching_info for cache_type, cache_type_mapping in structure_map.iteritems(): new_cache_structure = copy.deepcopy(cache_structure) storagerouter_caching_info[cache_type] = new_cache_structure for new_structure_key, old_structure_key in cache_type_mapping.iteritems(): if new_structure_key == 'backend_prefix': # Get possible backend related info metadata_key = old_structure_key.format(storagerouter_guid) if metadata_key not in metadata: continue backend_data = metadata.pop(metadata_key) # Pop to mutate metadata new_cache_structure['is_backend'] = True # Copy over the old data new_cache_structure['backend_info']['arakoon_config'] = backend_data['arakoon_config'] new_cache_structure['backend_info'].update(backend_data['backend_info']) new_cache_structure['backend_info']['connection_info'].update(backend_data['connection_info']) else: new_cache_structure[new_structure_key] = current_cache_data.get(old_structure_key) return metadata vpools = VPoolList.get_vpools() for vpool in vpools: try: new_metadata = _update_metadata_structure(vpool.metadata) vpool.metadata = new_metadata vpool.save() except KeyError: MigrationController._logger.exception('Exceptions occurred when updating the metadata for vPool {0}'.format(vpool.name)) ############################################## # Always use indent=4 during Configuration set def _resave_all_config_entries(config_path='/ovs'): """ Recursive functions which checks every config management key if its a directory or not. 
If not a directory, we retrieve the config and just save it again using the new indentation logic """ for item in Configuration.list(config_path): new_path = config_path + '/' + item print new_path if Configuration.dir_exists(new_path) is True: _resave_all_config_entries(config_path=new_path) else: try: _config = Configuration.get(new_path) Configuration.set(new_path, _config) except: _config = Configuration.get(new_path, raw=True) Configuration.set(new_path, _config, raw=True) if ExtensionMigrator.THIS_VERSION <= 13: # There is no way of checking whether this new indentation logic has been applied, so we only perform this for version 13 and lower MigrationController._logger.info('Re-saving every configuration setting with new indentation rules') _resave_all_config_entries() ############################ # Update some default values def _update_manifest_cache_size(_proxy_config_key): updated = False manifest_cache_size = 500 * 1024 * 1024 if Configuration.exists(key=_proxy_config_key): _proxy_config = Configuration.get(key=_proxy_config_key) for cache_type in [StorageDriverConfiguration.CACHE_BLOCK, StorageDriverConfiguration.CACHE_FRAGMENT]: if cache_type in _proxy_config and _proxy_config[cache_type][0] == 'alba': if _proxy_config[cache_type][1]['manifest_cache_size'] != manifest_cache_size: updated = True _proxy_config[cache_type][1]['manifest_cache_size'] = manifest_cache_size if _proxy_config['manifest_cache_size'] != manifest_cache_size: updated = True _proxy_config['manifest_cache_size'] = manifest_cache_size if updated is True: Configuration.set(key=_proxy_config_key, value=_proxy_config) return updated for storagedriver in StorageDriverList.get_storagedrivers(): try: vpool = storagedriver.vpool root_client = sr_client_map[storagedriver.storagerouter_guid] _update_manifest_cache_size('/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)) # Generic scrub proxy is deployed every time scrubbing kicks in, so no need to restart these services for 
alba_proxy in storagedriver.alba_proxies: if _update_manifest_cache_size('/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)) is True: # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service) ExtensionsToolbox.edit_version_file(client=root_client, package_name='alba', old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, alba_proxy.service.name)) # Update 'backend_connection_manager' section changes = False storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id) if 'backend_connection_manager' not in storagedriver_config.configuration: continue current_config = storagedriver_config.configuration['backend_connection_manager'] for key, value in current_config.iteritems(): if key.isdigit() is True: if value.get('alba_connection_asd_connection_pool_capacity') != 10: changes = True value['alba_connection_asd_connection_pool_capacity'] = 10 if value.get('alba_connection_timeout') != 30: changes = True value['alba_connection_timeout'] = 30 if value.get('alba_connection_rora_manifest_cache_capacity') != 25000: changes = True value['alba_connection_rora_manifest_cache_capacity'] = 25000 if changes is True: storagedriver_config.clear_backend_connection_manager() storagedriver_config.configure_backend_connection_manager(**current_config) storagedriver_config.save(root_client) # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section) ExtensionsToolbox.edit_version_file(client=root_client, package_name='volumedriver', old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name))) except Exception: MigrationController._logger.exception('Updating default configuration values failed for StorageDriver {0}'.format(storagedriver.storagedriver_id)) #################################################### # Adding proxy fail fast as env variable for proxies changed_clients = set() for 
storagerouter in StorageRouterList.get_storagerouters(): root_client = sr_client_map[storagerouter.guid] for service_name in service_manager.list_services(client=root_client): if not service_name.startswith('ovs-albaproxy_'): continue if ServiceFactory.get_service_type() == 'systemd': path = '/lib/systemd/system/{0}.service'.format(service_name) check = 'Environment=ALBA_FAIL_FAST=true' else: path = '/etc/init/{0}.conf'.format(service_name) check = 'env ALBA_FAIL_FAST=true' if not root_client.file_exists(path): continue if check in root_client.file_read(path): continue try: service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name) changed_clients.add(root_client) ExtensionsToolbox.edit_version_file(client=root_client, package_name='alba', old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name)) except: MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name)) for root_client in changed_clients: root_client.run(['systemctl', 'daemon-reload']) ###################################### # Integration of stats monkey (2.10.2) if Configuration.get(key='/ovs/framework/migration|stats_monkey_integration', default=False) is False: try: # Get content of old key into new key old_stats_monkey_key = '/statsmonkey/statsmonkey' if Configuration.exists(key=old_stats_monkey_key) is True: Configuration.set(key='/ovs/framework/monitoring/stats_monkey', value=Configuration.get(key=old_stats_monkey_key)) Configuration.delete(key=old_stats_monkey_key) # Make sure to disable the stats monkey by default or take over the current schedule if it was configured manually before celery_key = '/ovs/framework/scheduling/celery' current_value = None scheduling_config = Configuration.get(key=celery_key, default={}) if 'statsmonkey.run_all_stats' in scheduling_config: # Old celery task name of the stats monkey current_value = 
scheduling_config.pop('statsmonkey.run_all_stats') scheduling_config['ovs.stats_monkey.run_all'] = current_value scheduling_config['alba.stats_monkey.run_all'] = current_value Configuration.set(key=celery_key, value=scheduling_config) support_key = '/ovs/framework/support' support_config = Configuration.get(key=support_key) support_config['support_agent'] = support_config.pop('enabled', True) support_config['remote_access'] = support_config.pop('enablesupport', False) Configuration.set(key=support_key, value=support_config) # Make sure once this finished, it never runs again by setting this key to True Configuration.set(key='/ovs/framework/migration|stats_monkey_integration', value=True) except Exception: MigrationController._logger.exception('Integration of stats monkey failed') ###################################################### # Write away cluster ID to a file for back-up purposes try: cluster_id = Configuration.get(key='/ovs/framework/cluster_id', default=None) with open(Configuration.CONFIG_STORE_LOCATION, 'r') as config_file: config = json.load(config_file) if cluster_id is not None and config.get('cluster_id', None) is None: config['cluster_id'] = cluster_id with open(Configuration.CONFIG_STORE_LOCATION, 'w') as config_file: json.dump(config, config_file, indent=4) except Exception: MigrationController._logger.exception('Writing cluster id to a file failed.') ######################################################### # Additional string formatting in Arakoon services (2.11) try: if Configuration.get(key='/ovs/framework/migration|arakoon_service_update', default=False) is False: arakoon_service_names = [ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name) for cluster_name in Configuration.list(key='ovs/arakoon')] for storagerouter in StorageRouterList.get_masters(): for service_name in arakoon_service_names: config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name) if 
Configuration.exists(key=config_key): config = Configuration.get(key=config_key) config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR config['ARAKOON_PKG_NAME'] = PackageFactory.PKG_ARAKOON config['ARAKOON_VERSION_CMD'] = PackageFactory.VERSION_CMD_ARAKOON Configuration.set(key=config_key, value=config) # Make sure once this finished, it never runs again by setting this key to True Configuration.set(key='/ovs/framework/migration|arakoon_service_update', value=True) except Exception: MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed') ############################################################ # Additional string formatting in ALBA proxy services (2.11) changed_clients = set() try: if Configuration.get(key='/ovs/framework/migration|alba_proxy_service_update', default=False) is False: alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA) for service in ServiceTypeList.get_by_name('AlbaProxy').services: root_client = sr_client_map[service.storagerouter_guid] config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(service.storagerouter.machine_id, service.name) if Configuration.exists(key=config_key): config = Configuration.get(key=config_key) config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR config['ALBA_PKG_NAME'] = alba_pkg_name config['ALBA_VERSION_CMD'] = alba_version_cmd Configuration.set(key=config_key, value=config) service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name='ovs-{0}'.format(service.name)) changed_clients.add(root_client) # Make sure once this finished, it never runs again by setting this key to True Configuration.set(key='/ovs/framework/migration|alba_proxy_service_update', value=True) except Exception: MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed') ############################################################ # 
Additional string formatting in DTL/VOLDRV services (2.11) try: if Configuration.get(key='/ovs/framework/migration|voldrv_service_update', default=False) is False: sd_pkg_name, sd_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD) for vpool in VPoolList.get_vpools(): for storagedriver in vpool.storagedrivers: root_client = sr_client_map[storagedriver.storagerouter_guid] for entry in ['dtl', 'volumedriver']: service_name = '{0}_{1}'.format(entry, vpool.name) service_template = StorageDriverInstaller.SERVICE_TEMPLATE_DTL if entry == 'dtl' else StorageDriverInstaller.SERVICE_TEMPLATE_SD config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagedriver.storagerouter.machine_id, service_name) if Configuration.exists(key=config_key): config = Configuration.get(key=config_key) config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR config['VOLDRV_PKG_NAME'] = sd_pkg_name config['VOLDRV_VERSION_CMD'] = sd_version_cmd Configuration.set(key=config_key, value=config) service_manager.regenerate_service(name=service_template, client=root_client, target_name='ovs-{0}'.format(service_name)) changed_clients.add(root_client) # Make sure once this finished, it never runs again by setting this key to True Configuration.set(key='/ovs/framework/migration|voldrv_service_update', value=True) except Exception: MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed') ####################################################### # Storing actual package name in version files (2.11.0) (https://github.com/openvstorage/framework/issues/1876) if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file', default=False) is False: try: voldrv_pkg_name, _ = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD) for storagerouter in StorageRouterList.get_storagerouters(): root_client = sr_client_map.get(storagerouter.guid) if root_client is None: continue for 
file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR): if not file_name.endswith('.version'): continue file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name) contents = root_client.file_read(filename=file_path) regenerate = False if voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER: if 'volumedriver-server' in contents: regenerate = True contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER) root_client.file_write(filename=file_path, contents=contents) elif voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER_EE: if 'volumedriver-server' in contents or PackageFactory.PKG_VOLDRV_SERVER in contents: regenerate = True contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER_EE) contents = contents.replace(PackageFactory.PKG_VOLDRV_SERVER, PackageFactory.PKG_VOLDRV_SERVER_EE) root_client.file_write(filename=file_path, contents=contents) if regenerate is True: service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_DTL if file_name.startswith('dtl') else StorageDriverInstaller.SERVICE_TEMPLATE_SD, client=root_client, target_name='ovs-{0}'.format(file_name.split('.')[0])) # Leave out .version changed_clients.add(root_client) Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file', value=True) except Exception: MigrationController._logger.exception('Updating actual package name for version files failed') for root_client in changed_clients: try: root_client.run(['systemctl', 'daemon-reload']) except Exception: MigrationController._logger.exception('Executing command "systemctl daemon-reload" failed') ######################################################### # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for AlbaProxy SystemD services if ServiceFactory.get_service_type() == 'systemd': changed_clients = set() for storagedriver in StorageDriverList.get_storagedrivers(): root_client = 
sr_client_map[storagedriver.storagerouter_guid] for alba_proxy in storagedriver.alba_proxies: service = alba_proxy.service service_name = 'ovs-{0}'.format(service.name) if not service_manager.has_service(name=service_name, client=root_client): continue if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)): continue try: service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name) changed_clients.add(root_client) except: MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name)) for root_client in changed_clients: root_client.run(['systemctl', 'daemon-reload']) ######################################################### # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for Arakoon SystemD services if ServiceFactory.get_service_type() == 'systemd': changed_clients = set() for storagerouter in StorageRouterList.get_storagerouters(): root_client = sr_client_map[storagerouter.guid] for service_name in service_manager.list_services(client=root_client): if not service_name.startswith('ovs-arakoon-'): continue if not service_manager.has_service(name=service_name, client=root_client): continue if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)): continue try: service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name) changed_clients.add(root_client) except: MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name)) for root_client in changed_clients: root_client.run(['systemctl', 'daemon-reload']) MigrationController._logger.info('Finished out of band migrations')
def test_nsm_checkup_external(self):
    """
    Exercise the NSM checkup against externally managed Arakoon clusters
    Scenarios covered, in order:
        * Invalid argument combinations raise a ValueError with a specific message
        * 'add_cluster' claims an external ABM cluster and 1 external NSM cluster
        * A subsequent NSM checkup changes nothing, even when the only NSM is overloaded
        * No additional NSM cluster can be added once 50 NSM clusters are modeled
        * An already claimed NSM cluster cannot be claimed again
        * A 2nd external NSM cluster can be claimed through the NSM checkup
    """
    Configuration.set('/ovs/framework/plugins/alba/config|nsm.safety', 1)
    Configuration.set('/ovs/framework/plugins/alba/config|nsm.maxload', 10)

    dal_structure = DalHelper.build_dal_structure(structure={'storagerouters': [1, 2, 3]})
    alba_dal_structure = AlbaDalHelper.build_dal_structure(structure={'alba_backends': [[1, 'LOCAL']]})

    alba_backend = alba_dal_structure['alba_backends'][1]
    first_storagerouter = dal_structure['storagerouters'][1]
    second_storagerouter = dal_structure['storagerouters'][2]

    def _expect_checkup_error(expected_message, **checkup_arguments):
        # The NSM checkup has to raise a ValueError carrying exactly the expected message
        with self.assertRaises(ValueError) as raised:
            AlbaArakoonController.nsm_checkup(**checkup_arguments)
        self.assertEqual(first=str(raised.exception), second=expected_message)

    def _log_contains(fragment):
        # Look for the given text in the log records captured under 'lib'
        return any(fragment in record for record in Logger._logs.get('lib', []))

    def _expected_metadata(name, kind, in_use):
        # Metadata expected for an externally managed Arakoon cluster
        return {'cluster_name': name, 'cluster_type': kind, 'internal': False, 'in_use': in_use}

    def _assert_external_layout(nsm_cluster_count):
        # 1 ABM service, the expected amount of NSM clusters with 1 service each
        # and none of the modeled services linked to a StorageRouter
        self.assertEqual(first=1, second=len(alba_backend.abm_cluster.abm_services))
        self.assertEqual(first=nsm_cluster_count, second=len(alba_backend.nsm_clusters))
        for index in range(nsm_cluster_count):
            self.assertEqual(first=1, second=len(alba_backend.nsm_clusters[index].nsm_services))
        self.assertIsNone(obj=alba_backend.abm_cluster.abm_services[0].service.storagerouter)
        for index in range(nsm_cluster_count):
            self.assertIsNone(obj=alba_backend.nsm_clusters[index].nsm_services[0].service.storagerouter)

    # Argument validation for externally managed Arakoons during the NSM checkup
    # No ALBA Backend specified
    _expect_checkup_error('Additional NSMs can only be configured for a specific ALBA Backend',
                          external_nsm_cluster_names=['test'])
    _expect_checkup_error("'min_internal_nsms' and 'external_nsm_cluster_names' are mutually exclusive",
                          alba_backend_guid=alba_backend.guid,
                          min_internal_nsms=2,
                          external_nsm_cluster_names=['test'])
    # NSM cluster names must be a list
    _expect_checkup_error("'external_nsm_cluster_names' must be of type 'list'",
                          alba_backend_guid=alba_backend.guid,
                          external_nsm_cluster_names={})
    # Non-existing cluster names should raise
    _expect_checkup_error("Arakoon cluster with name non-existing-cluster does not exist",
                          alba_backend_guid=alba_backend.guid,
                          external_nsm_cluster_names=['non-existing-cluster'])

    # Set up 1 external ABM and 2 external NSM Arakoon clusters, all of them unclaimed
    external_abm_1 = 'backend_1-abm'
    external_nsm_1 = 'backend_1-nsm_0'
    external_nsm_2 = 'backend_1-nsm_1'
    cluster_types = {external_abm_1: 'ABM', external_nsm_1: 'NSM', external_nsm_2: 'NSM'}
    for name, kind in cluster_types.iteritems():
        installer = ArakoonInstaller(cluster_name=name)
        installer.create_cluster(cluster_type=kind, ip=first_storagerouter.ip, base_dir='/tmp', internal=False)
        installer.extend_cluster(new_ip=second_storagerouter.ip, base_dir='/tmp')
        installer.start_cluster()
        installer.unclaim_cluster()
        self.assertDictEqual(d1=_expected_metadata(name, kind, in_use=False),
                             d2=installer.get_arakoon_metadata_by_cluster_name(cluster_name=name))

    # 'add_cluster' claims the ABM cluster and external_nsm_1 only and models the services
    Logger._logs = {}
    AlbaController.add_cluster(alba_backend_guid=alba_backend.guid,
                               abm_cluster=external_abm_1,
                               nsm_clusters=[external_nsm_1])
    for name, kind in cluster_types.iteritems():
        installer = ArakoonInstaller(cluster_name=name)
        self.assertDictEqual(d1=_expected_metadata(name, kind, in_use=name != external_nsm_2),
                             d2=installer.get_arakoon_metadata_by_cluster_name(cluster_name=name))
    self.assertTrue(expr=_log_contains('NSM load OK'))
    _assert_external_layout(nsm_cluster_count=1)
    expected_run_log = [['update_abm_client_config'],
                        ['add_nsm_host', 'backend_1-nsm_0'],
                        ['update_maintenance_config', '--eviction-type-random'],
                        ['update_maintenance_config', 'enable-auto-cleanup-deleted-namespaces-days']]
    self.assertListEqual(VirtualAlbaBackend.run_log['backend_1-abm'], expected_run_log)

    # 'add_cluster' already triggered an NSM checkup, so another run should not issue any command
    VirtualAlbaBackend.run_log['backend_1-abm'] = []
    AlbaArakoonController.nsm_checkup()
    self.assertListEqual(list1=[], list2=VirtualAlbaBackend.run_log['backend_1-abm'])

    # Overload the single NSM: the checkup is expected to log a message, but to leave everything as is
    VirtualAlbaBackend.data['backend_1-abm']['nsms'][0]['namespaces_count'] = 25
    Logger._logs = {}
    AlbaArakoonController.nsm_checkup()
    self.assertTrue(expr=_log_contains('All NSM clusters are overloaded'))
    _assert_external_layout(nsm_cluster_count=1)
    self.assertListEqual(list1=[], list2=VirtualAlbaBackend.run_log['backend_1-abm'])

    # Model 50 NSM clusters for the ALBA Backend to verify the maximum is enforced
    current_numbers = [nsm_cluster.number for nsm_cluster in alba_backend.nsm_clusters]
    alba_dal_structure = AlbaDalHelper.build_dal_structure(
        structure={'alba_nsm_clusters': [(1, 50)]},  # (<alba_backend_id>, <amount_of_nsm_clusters>)
        previous_structure=alba_dal_structure
    )
    # Adding 1 more NSM cluster has to be refused
    _expect_checkup_error('The maximum of 50 NSM Arakoon clusters will be exceeded. Amount of clusters that can be deployed for this ALBA Backend: 0',
                          alba_backend_guid=alba_backend.guid,
                          external_nsm_cluster_names=[external_nsm_2])

    # Clean up the NSM clusters which were only modeled for the maximum validation
    surplus_nsm_clusters = alba_dal_structure['alba_nsm_clusters'][1][len(current_numbers):]
    for surplus_cluster in surplus_nsm_clusters:
        for surplus_service in surplus_cluster.nsm_services:
            surplus_service.delete()
            surplus_service.service.delete()
        surplus_cluster.delete()

    # The provided cluster_name to claim has already been claimed
    _expect_checkup_error('Some of the provided cluster_names have already been claimed before',
                          alba_backend_guid=alba_backend.guid,
                          external_nsm_cluster_names=[external_nsm_1])

    # Claim the 2nd external NSM cluster through the NSM checkup
    AlbaArakoonController.nsm_checkup(alba_backend_guid=alba_backend.guid,
                                      external_nsm_cluster_names=[external_nsm_2])
    _assert_external_layout(nsm_cluster_count=2)
    self.assertListEqual(list1=[['add_nsm_host', 'backend_1-nsm_1']],
                         list2=VirtualAlbaBackend.run_log['backend_1-abm'])
    for name, kind in cluster_types.iteritems():
        installer = ArakoonInstaller(cluster_name=name)
        self.assertDictEqual(d1=_expected_metadata(name, kind, in_use=True),
                             d2=installer.get_arakoon_metadata_by_cluster_name(cluster_name=name))