def check_rabbitmq_and_enable_ha_mode(client, logger):
    """
    Make sure RabbitMQ is up and running, then switch it to HA mode
    :param client: Client on which to check RabbitMQ
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: None
    """
    manager = ServiceFactory.get_manager()
    if not manager.has_service('rabbitmq-server', client):
        raise RuntimeError('Service rabbitmq-server has not been added on node {0}'.format(client.ip))
    running, same_process = manager.is_rabbitmq_running(client=client)
    if running is False or same_process is False:
        # Broker is down or not owned by the expected process: bounce it and give it time to boot
        ServiceFactory.change_service_state(client, 'rabbitmq-server', 'restart', logger)
        time.sleep(5)
        # Mirror the framework queues across all nodes (HA mode)
        client.run(['rabbitmqctl', 'set_policy', 'ha-all', '^(volumerouter|ovs_.*)$', '{"ha-mode":"all"}'])
def configure_avahi(client, node_name, node_type, logger):
    """
    Configure Avahi
    :param client: Client on which to configure avahi
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param node_name: Name of the node to set in Avahi
    :type node_name: str
    :param node_type: Type of the node ('master' or 'extra')
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: None
    :raises RuntimeError: when the cluster name could not be validated through Avahi
    """
    valid_avahi = NodeTypeController.validate_avahi_cluster_name(ip=client.ip,
                                                                 cluster_name=Configuration.get('/ovs/framework/cluster_name'),
                                                                 node_name=node_name)
    if valid_avahi[0] is False:
        raise RuntimeError(valid_avahi[1])
    Toolbox.log(logger=logger, messages='Announcing service')
    client.file_write(NodeTypeController.avahi_filename, """<?xml version="1.0" standalone='no'?>
<!--*-nxml-*-->
<!DOCTYPE service-group SYSTEM "avahi-service.dtd">
<!-- $Id$ -->
<service-group>
    <name replace-wildcards="yes">{0}</name>
    <service>
        <type>_ovs_{1}_node._tcp</type>
        <port>443</port>
    </service>
</service-group>""".format(valid_avahi[1], node_type))
    # Fix: use the logger handed in by the caller instead of NodeTypeController._logger,
    # consistent with the rest of this function (and with sibling functions in this file)
    ServiceFactory.change_service_state(client, 'avahi-daemon', 'restart', logger)
def __init__(self): """ Initializes the client """ # Safe calls self._node_id = System.get_my_machine_id().replace(r"'", r"'\''") # Alba is currently always installed but the Alba version/package info is located in the SDM section self._package_manager = PackageFactory.get_manager() self._service_manager = ServiceFactory.get_manager() self._service_type = ServiceFactory.get_service_type() if self._service_type != 'systemd': raise NotImplementedError('Only Systemd is supported') # Potential failing calls self._cluster_id = self.get_config_key( self.LOCATION_CLUSTER_ID, fallback=[CONFIG_STORE_LOCATION, 'cluster_id']) self.interval = self.get_config_key( self.LOCATION_INTERVAL, fallback=[self.FALLBACK_CONFIG, self.KEY_INTERVAL], default=self.DEFAULT_INTERVAL) self._openvpn_service_name = 'openvpn@ovs_{0}-{1}'.format( self._cluster_id, self._node_id) # Calls to look out for. These could still be None when using them self._storagerouter = None self._client = None self._set_storagerouter() self._set_client() # Safe call, start caching self.caching = SupportAgentCache(self)
def override_scheduletasks(configuration):
    """
    Override the scheduled tasks crontab with your own configuration
    :param configuration: configuration to override scheduled tasks
    :type configuration: dict
    :return: True when the config was applied and all watchers restarted, False otherwise
    :rtype: bool
    """
    service_name = 'ovs-watcher-framework'
    Configuration.set(CelerySetup.SCHEDULED_TASK_CFG, configuration)
    fetched_cfg = Configuration.get(CelerySetup.SCHEDULED_TASK_CFG, configuration)
    if fetched_cfg == configuration:  # Equivalent to cmp(...) == 0, but also valid on Python 3
        # Restart ovs-watcher-framework on all nodes
        service_manager = ServiceFactory.get_manager()  # Hoisted out of the loop: manager is node-independent
        for sr_ip in StoragerouterHelper.get_storagerouter_ips():
            client = SSHClient(sr_ip, username='******')
            try:
                service_manager.restart_service(service_name, client)
            except Exception:
                # Was a bare 'except: return False': log the reason instead of failing silently
                # (a bare except also swallowed SystemExit/KeyboardInterrupt)
                CelerySetup.LOGGER.exception("Failed to restart `{0}` on {1}".format(service_name, sr_ip))
                return False
        CelerySetup.LOGGER.info("Successfully restarted all `{0}` services!".format(service_name))
        return True
    else:
        CelerySetup.LOGGER.warning("`{0}` config is `{1}` but should be `{2}`".format(CelerySetup.SCHEDULED_TASK_CFG, fetched_cfg, configuration))
        return False
def _proxy_summary(self):
    """
    Returns a summary of the proxies of this StorageDriver
    :return: summary of the proxies (counts per state color: 'red', 'orange', 'green')
    :rtype: dict
    """
    proxy_info = {'red': 0, 'orange': 0, 'green': 0}
    summary = {'proxies': proxy_info}
    try:
        service_manager = ServiceFactory.get_manager()
        client = SSHClient(self.storagerouter)
    except Exception:
        self._logger.exception('Unable to retrieve necessary clients')
        # Without a client nothing can be inspected: report the all-zero summary
        return summary
    for alba_proxy in self.alba_proxies:
        try:
            service_status = service_manager.get_service_status(alba_proxy.service.name, client)
        except Exception:
            # A ValueError can occur when the services are still being deployed (the model will be updated before the actual deployment)
            self._logger.exception('Unable to retrieve the service status for service {0} of StorageDriver {1}'.format(alba_proxy.service.name, self.guid))
            proxy_info['red'] += 1
            continue
        if service_status == 'active':
            proxy_info['green'] += 1
        elif service_status == 'inactive':
            proxy_info['orange'] += 1
        else:
            proxy_info['red'] += 1
    # Fix: the original returned from a 'finally' block, which silently discarded any
    # unexpected exception raised in the loop above. A plain return keeps the same
    # result on the success path while no longer masking genuine errors.
    return summary
def check_alba_processes(result_handler):
    """
    Verify that every local ALBA service is in the 'active' state
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    result_handler.info('Checking LOCAL ALBA services: ', add_to_result=False)
    client = SSHClient(AlbaHealthCheck.LOCAL_SR)
    service_manager = ServiceFactory.get_manager()
    alba_services = [name for name in service_manager.list_services(client=client)
                     if name.startswith(AlbaHealthCheck.MODULE)]
    if not alba_services:
        result_handler.skip('Found no LOCAL ALBA services.')
        return
    for name in alba_services:
        if service_manager.get_service_status(name, client) == 'active':
            result_handler.success('Service {0} is running!'.format(name))
        else:
            result_handler.failure('Service {0} is NOT running! '.format(name))
def wait_for_service(client, name, status, logger):
    """
    Poll a service until it reaches the requested status, giving up after 10 attempts
    :param client: SSHClient to run commands
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param name: Name of service
    :type name: str
    :param status: 'active' if running, 'inactive' if halted
    :type status: str
    :param logger: Logging object
    :type logger: ovs.log.log_handler.LogHandler
    :return: None
    :rtype: NoneType
    :raises RuntimeError: when the service never reached the expected status
    """
    service_manager = ServiceFactory.get_manager()
    service_status = service_manager.get_service_status(name, client)
    # Back-off grows each attempt: 1s after the first check, up to 10s after the last
    for attempts_left in xrange(10, 0, -1):
        if service_status == status:
            return
        logger.debug('... waiting for service {0}'.format(name))
        time.sleep(10 - (attempts_left - 1))
        service_status = service_manager.get_service_status(name, client)
    raise RuntimeError('Service {0} does not have expected status: Expected: {1} - Actual: {2}'.format(name, status, service_status))
def check_ovs_processes(logger):
    """
    Verify that every local Open vStorage service is in the 'active' state
    :param logger: logging object
    :type logger: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    logger.info('Checking local ovs services.')
    client = SSHClient(OpenvStorageHealthCheck.LOCAL_SR)
    service_manager = ServiceFactory.get_manager()
    ovs_services = [name for name in service_manager.list_services(client=client)
                    if name.startswith(OpenvStorageHealthCheck.MODULE)]
    if not ovs_services:
        logger.warning('Found no local ovs services.')
    for name in ovs_services:
        if service_manager.get_service_status(name, client) == 'active':
            logger.success('Service {0} is running!'.format(name))
        else:
            logger.failure('Service {0} is not running, please check this.'.format(name))
def idle_till_ovs_is_up(ip, username, password=None, connection_timeout=300, service_timeout=60, logger=LOGGER):
    """
    wait until a node is back up and all ovs related are running (or potentially stuck)
    :param ip: ip of the node
    :param username: username to login with
    :param password: password to login with
    :param connection_timeout: raise when not online after these seconds
    :param service_timeout: poll for x seconds when checking services
    :param logger: logging instance
    :raise RuntimeError: when the timeout has been reached
    :return: dict with services mapped by their state
    """
    # neutral_states = ['inactive', 'deactivating']
    failed_states = ['failed', 'error']
    active_states = ['active', 'reloading']
    activating_state = 'activating'
    start_time = time.time()
    client = None
    while client is None:
        delta = time.time() - start_time
        if delta > connection_timeout:
            raise RuntimeError('Idling has timed out after {0}s'.format(delta))
        try:
            client = SSHClient(ip, username=username, password=password)
        except Exception:  # Fix: was a bare except, which also swallowed KeyboardInterrupt/SystemExit
            logger.debug('Could not establish a connection yet to {0} after {1}s'.format(ip, delta))
            time.sleep(1)
    service_manager = ServiceFactory.get_manager()
    ovs_services = [service for service in service_manager.list_services(client) if service.startswith('ovs-')]
    active_services = []
    failed_services = []
    activating_services = []
    # Initially classify these services
    for service in ovs_services:
        logger.debug('Initially classifying {0}'.format(service))
        service_state = service_manager.get_service_status(service, client)
        logger.debug('Service {0} - State {1}'.format(service, service_state))
        if service_state in failed_states:
            failed_services.append(service)
        elif service_state in active_states:
            active_services.append(service)
        elif service_state == activating_state:
            activating_services.append(service)
        else:
            logger.error('Unable to process service state {0}'.format(service_state))
    # Re-poll services stuck in 'activating' until they settle or the timeout hits
    start_time = time.time()
    while len(activating_services) > 0:
        if time.time() - start_time > service_timeout:
            break
        service = activating_services.pop()
        service_state = service_manager.get_service_status(service, client)
        if service_state in failed_states:
            failed_services.append(service)
        elif service_state in active_states:
            active_services.append(service)
        elif service_state == activating_state:
            activating_services.append(service)
            # Fix: brief pause before re-polling; the original busy-spun for up to
            # service_timeout seconds, hammering the node with status requests
            time.sleep(1)
    return {'active': active_services, 'failed': failed_services, 'activating': activating_services}
def _get_filedescriptors(cls, result_handler, arakoon_clusters, batch_size=10):
    """
    Retrieve tlog/tlx stat information for a Arakoon cluster concurrently
    Note: this will mutate the given arakoon_clusters dict
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :param arakoon_clusters: Information about all Arakoon clusters, sorted by type and given config
    :type arakoon_clusters: dict
    :param batch_size: Amount of workers to collect the Arakoon information.
    Every worker means a connection towards a different node
    :return: Dict with file descriptors contents for every node config
    :rtype: dict
    """
    queue = Queue.Queue()
    clients = {}
    # Prep work
    for cluster_type, clusters in arakoon_clusters.iteritems():
        for cluster in clusters:
            cluster_name = cluster['cluster_name']
            arakoon_config = cluster['config']
            cluster['fd_result'] = {}
            for node_config in arakoon_config.nodes:
                result = {'errors': [], 'result': {'fds': []}}
                # Fix: register the result entry before attempting to build the client.
                # Previously the 'continue' below skipped this assignment, silently
                # dropping the recorded build_client error from the returned results.
                cluster['fd_result'][node_config] = result
                # Build SSHClients outside the threads to avoid GIL
                try:
                    client = clients.get(node_config.ip)
                    if client is None:
                        client = SSHClient(node_config.ip, timeout=5)
                        clients[node_config.ip] = client
                except Exception as ex:
                    result['errors'].append(('build_client', ex))
                    continue  # No client could be built, so no worker task for this node
                queue.put((cluster_name, node_config, result))
    service_manager = ServiceFactory.get_manager()
    # Limit to one session for every node.
    # Every process will fork from this one, creating a new session instead of using the already existing channel
    # There might be an issue if a ssh session would take too long causing all workers to connect to that one node
    # and therefore hitting the MaxSessions again (theory)
    for _ in xrange(min(len(clients.keys()), batch_size)):
        thread = Thread(target=cls._fd_worker, args=(queue, clients, result_handler, service_manager))
        # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly.
        thread.setDaemon(True)
        thread.start()
    # Wait for all results
    queue.join()
    return arakoon_clusters
def _roll_out_dtl_services(vpool, storagerouters):
    """
    Deploy and start the DTL service for the given vPool on all given StorageRouters
    :param vpool: vPool to roll the DTL service out for
    :param storagerouters: StorageRouters to deploy and start a DTL service on
    :return: None
    """
    manager = ServiceFactory.get_manager()
    dtl_service = 'dtl_{0}'.format(vpool.name)
    for storagerouter in storagerouters.values():
        root_client = SSHClient(storagerouter, 'root')
        manager.add_service(name=dtl_service, client=root_client)
        manager.start_service(name=dtl_service, client=root_client)
def change_config(storagedriver, config):
    """
    Change the config of the volumedriver and reload the config.
    Restart will be triggered if no vDisk are running on the volumedriver.
    :param storagedriver: StorageDriver object
    :type storagedriver: StorageDriver
    :param config: Volumedriver config
    :type config: dict
    :return:
    """
    service_manager = ServiceFactory.get_manager()
    config_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(storagedriver.vpool.guid, storagedriver.name)
    current_config = Configuration.get(config_key)
    # Merge the 'volume_manager' section: incoming keys overwrite current values
    if 'volume_manager' in config:
        volume_manager = current_config['volume_manager']
        for key, value in config['volume_manager'].iteritems():
            volume_manager[key] = value
    if 'backend_connection_manager' in config:
        backend_connection_manager = current_config['backend_connection_manager']
        for key, value in config['backend_connection_manager'].iteritems():
            if key == 'proxy':
                # Proxy settings live under numeric sub-keys ('0', '1', ...) of the
                # backend_connection_manager; apply the given proxy settings to every such entry
                for current_config_key, current_config_value in backend_connection_manager.iteritems():
                    if current_config_key.isdigit():
                        for proxy_key, proxy_config in config['backend_connection_manager']['proxy'].iteritems():
                            current_config_value[proxy_key] = proxy_config
            else:
                backend_connection_manager[key] = value
    StoragedriverHelper.LOGGER.info("New config: {0}".format(json.dumps(current_config, indent=4)))
    Configuration.set(config_key, json.dumps(current_config, indent=4), raw=True)
    client = SSHClient(storagedriver.storagerouter, 'root')
    service_name = 'ovs-volumedriver_{0}'.format(storagedriver.vpool.name)
    # Only restart when no vDisks are served by this volumedriver, to avoid impacting running volumes
    if len(storagedriver.vdisks_guids) == 0:
        StoragedriverHelper.LOGGER.info("Restarting service: {0}".format(service_name))
        service_manager.restart_service(service_name, client)
    else:
        StoragedriverHelper.LOGGER.info("Not restarting service: {0}, amount of vdisks: {1}".format(service_name, len(storagedriver.vdisks_guids)))
def restart_framework_and_memcache_services(clients, logger, offline_node_ips=None):
    """
    Restart framework and Memcached services
    :param clients: Clients on which to restart these services
    :type clients: dict
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :param offline_node_ips: IP addresses of offline nodes in the cluster
    :type offline_node_ips: list
    :return: None
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    if offline_node_ips is None:
        offline_node_ips = []
    service_manager = ServiceFactory.get_manager()
    master_ips = [sr.ip for sr in StorageRouterList.get_masters()]
    slave_ips = [sr.ip for sr in StorageRouterList.get_slaves()]
    memcached = 'memcached'
    watcher = 'watcher-framework'
    support_agent = 'support-agent'
    online_nodes = [ip for ip in master_ips + slave_ips if ip not in offline_node_ips]
    online_masters = [ip for ip in master_ips if ip not in offline_node_ips]
    # First stop every watcher, then restart memcached on the masters, then bring the watchers back
    for ip in online_nodes:
        if service_manager.has_service(watcher, clients[ip]):
            ServiceFactory.change_service_state(clients[ip], watcher, 'stop', logger)
    for ip in online_masters:
        ServiceFactory.change_service_state(clients[ip], memcached, 'restart', logger)
    for ip in online_nodes:
        if service_manager.has_service(watcher, clients[ip]):
            ServiceFactory.change_service_state(clients[ip], watcher, 'start', logger)
        if service_manager.has_service(support_agent, clients[ip]):
            ServiceFactory.change_service_state(clients[ip], support_agent, 'restart', logger)
    # Drop the cached volatile store so new connections pick up the restarted memcached
    VolatileFactory.store = None
def get_non_running_ovs_services(client):
    """
    get all non-running ovs services
    :param client: sshclient instance
    :return: list of non running ovs services
    :rtype: list
    """
    service_manager = ServiceFactory.get_manager()
    return [service for service in service_manager.list_services(client)
            if service.startswith('ovs-')
            and service_manager.get_service_status(service, client) != 'active']
def install_plugins(): """ (Re)load plugins """ manager = ServiceFactory.get_manager() if manager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')): # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed print 'Installing plugin into Open vStorage' from ovs.dal.lists.storagerouterlist import StorageRouterList clients = {} masters = StorageRouterList.get_masters() slaves = StorageRouterList.get_slaves() try: for sr in masters + slaves: clients[sr] = SSHClient(sr, username='******') except UnableToConnectException: raise RuntimeError('Not all StorageRouters are reachable') memcached = 'memcached' watcher = 'watcher-framework' for sr in masters + slaves: if manager.has_service(watcher, clients[sr]): print '- Stopping watcher on {0} ({1})'.format( sr.name, sr.ip) manager.stop_service(watcher, clients[sr]) for sr in masters: print '- Restarting memcached on {0} ({1})'.format( sr.name, sr.ip) manager.restart_service(memcached, clients[sr]) for sr in masters + slaves: if manager.has_service(watcher, clients[sr]): print '- Starting watcher on {0} ({1})'.format( sr.name, sr.ip) manager.start_service(watcher, clients[sr]) print '- Execute model migrations' from ovs.dal.helpers import Migration Migration.migrate() from ovs.lib.helpers.toolbox import Toolbox ip = System.get_my_storagerouter().ip functions = Toolbox.fetch_hooks('plugin', 'postinstall') if len(functions) > 0: print '- Execute post installation scripts' for fct in functions: fct(ip=ip) print 'Installing plugin into Open vStorage: Completed'
def _get_filedescriptors(cls, result_handler, arakoon_clusters, batch_size=10):
    """
    Retrieve tlog/tlx stat information for a Arakoon cluster concurrently
    Note: this will mutate the given arakoon_clusters dict
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :param arakoon_clusters: Information about all Arakoon clusters, sorted by type and given config
    :type arakoon_clusters: dict
    :param batch_size: Amount of workers to collect the Arakoon information.
    Every worker means a connection towards a different node
    :return: Dict with file descriptors contents for every node config
    :rtype: dict
    """
    queue = Queue.Queue()
    clients = {}
    # Prep work
    for cluster_type, clusters in arakoon_clusters.iteritems():
        for cluster in clusters:
            cluster_name = cluster['cluster_name']
            arakoon_config = cluster['config']
            cluster['fd_result'] = {}
            for node_config in arakoon_config.nodes:
                result = {'errors': [], 'result': {'fds': []}}
                # Fix: register the result entry before attempting to build the client.
                # Previously the 'continue' below skipped this assignment, silently
                # dropping the recorded build_client error from the returned results.
                cluster['fd_result'][node_config] = result
                # Build SSHClients outside the threads to avoid GIL
                try:
                    client = clients.get(node_config.ip)
                    if client is None:
                        client = SSHClient(node_config.ip, timeout=5)
                        clients[node_config.ip] = client
                except Exception as ex:
                    result['errors'].append(('build_client', ex))
                    continue  # No client could be built, so no worker task for this node
                queue.put((cluster_name, node_config, result))
    service_manager = ServiceFactory.get_manager()
    # Limit to one session for every node.
    # Every process will fork from this one, creating a new session instead of using the already existing channel
    # There might be an issue if a ssh session would take too long causing all workers to connect to that one node
    # and therefore hitting the MaxSessions again (theory)
    for _ in xrange(min(len(clients.keys()), batch_size)):
        thread = Thread(target=cls._fd_worker, args=(queue, clients, result_handler, service_manager))
        # Setting threads as "daemon" allows main program to exit eventually even if these don't finish correctly.
        thread.setDaemon(True)
        thread.start()
    # Wait for all results
    queue.join()
    return arakoon_clusters
def __init__(self, ip): """ Create RabbitMQ object :param ip: ip from the server :type ip: str """ # check if rabbitmq is available on the ip if not RabbitMQ._check_rabbitmq_ip(ip): raise ValueError('RabbitMQ on {0} could not be found.'.format(ip)) self._service_manager = ServiceFactory.get_manager() self.ip = ip if RabbitMQ.INTERNAL: self._storagerouter = StorageRouterList.get_by_ip(ip) self._client = SSHClient(ip, username='******') if not self.check_management_plugin(): self.enable_management_plugin()
def add_services(client, node_type, logger):
    """
    Add the services required by the OVS cluster
    :param client: Client on which to add the services
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param node_type: Type of node ('master' or 'extra')
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: None
    """
    Toolbox.log(logger=logger, messages='Adding services')
    service_manager = ServiceFactory.get_manager()
    worker_queue = System.get_my_machine_id(client=client)
    if node_type == 'master':
        # Masters also listen on the shared 'ovs_masters' queue and run the extra master-only services
        worker_queue += ',ovs_masters'
        services = {'memcached': {'MEMCACHE_NODE_IP': client.ip, 'WORKER_QUEUE': worker_queue},
                    'rabbitmq-server': {'MEMCACHE_NODE_IP': client.ip, 'WORKER_QUEUE': worker_queue},
                    'scheduled-tasks': {},
                    'webapp-api': {},
                    'volumerouter-consumer': {}}
    else:
        services = {}
    # Every node type gets the workers and the framework watcher
    services['workers'] = {'WORKER_QUEUE': worker_queue}
    services['watcher-framework'] = {}
    for service_name, params in services.iteritems():
        if not service_manager.has_service(service_name, client):
            Toolbox.log(logger=logger, messages='Adding service {0}'.format(service_name))
            service_manager.add_service(name=service_name, params=params, client=client)
def configure_proxy(backend_name, proxy_configuration):
    """
    Apply the given settings to every ALBA proxy that serves the given backend.

    The previous config of each touched proxy is appended to /root/old_proxies as a
    backup, and each touched proxy service is restarted to pick up the new config.
    :param backend_name: name of the backend whose proxies must be reconfigured
    :param proxy_configuration: proxy settings to apply; keys must belong to ProxySetup.PARAMS
    :raises ValueError: when unsupported configuration keys are passed
    """
    faulty_keys = [key for key in proxy_configuration.keys() if key not in ProxySetup.PARAMS]
    if len(faulty_keys) > 0:
        raise ValueError('{0} are unsupported keys for proxy configuration.'.format(', '.join(faulty_keys)))
    ExtensionsToolbox.verify_required_params(ProxySetup.PARAMS, proxy_configuration)
    vpools = VPoolList.get_vpools()
    service_manager = ServiceFactory.get_manager()
    with open('/root/old_proxies', 'w') as backup_file:
        for vpool in vpools:
            # Only touch vPools backed by the requested backend
            if vpool.metadata['backend']['backend_info']['name'] != backend_name:
                continue
            for storagedriver in vpool.storagedrivers:
                for proxy in storagedriver.alba_proxies:
                    config_loc = 'ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, proxy.guid)
                    proxy_service = Service(proxy.service_guid)
                    proxy_config = Configuration.get(config_loc)
                    # Snapshot the config before mutating it, for the backup file and the log lines below
                    old_proxy_config = dict(proxy_config)
                    backup_file.write('{} -- {}\n'.format(config_loc, old_proxy_config))
                    proxy_config.update(proxy_configuration)
                    ProxySetup.LOGGER.info("Changed {0} to {1} for proxy {2}".format(old_proxy_config, proxy_config, config_loc))
                    ProxySetup.LOGGER.info("Changed items {0}".format([(key, value) for key, value in proxy_config.iteritems() if key not in old_proxy_config.keys()]))
                    Configuration.set(config_loc, json.dumps(proxy_config, indent=4), raw=True)
                    # Restart the proxy service so it reloads the new configuration
                    client = SSHClient(storagedriver.storage_ip, username='******')
                    service_manager.restart_service(proxy_service.name, client=client)
def check_ovs_processes(result_handler):
    """
    Verify that every local Open vStorage service is in the 'active' state
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    result_handler.info('Checking local ovs services.')
    client = SSHClient(System.get_my_storagerouter())
    service_manager = ServiceFactory.get_manager()
    ovs_services = [name for name in service_manager.list_services(client=client)
                    if name.startswith(OpenvStorageHealthCheck.MODULE)]
    if not ovs_services:
        result_handler.warning('Found no local ovs services.')
    for name in ovs_services:
        if service_manager.get_service_status(name, client) == 'active':
            result_handler.success('Service {0} is running!'.format(name), code=ErrorCodes.process_fwk)
        else:
            result_handler.failure('Service {0} is not running, please check this.'.format(name), code=ErrorCodes.process_fwk)
def _on_remove(cluster_ip, complete_removal):
    """
    Handles the StorageDriver removal part of a node
    :param cluster_ip: IP of the node which is being removed from the cluster
    :type cluster_ip: str
    :param complete_removal: Unused for StorageDriver, used for AlbaController
    :type complete_removal: bool
    :return: None
    """
    _ = complete_removal
    service_name = 'watcher-volumedriver'
    service_manager = ServiceFactory.get_manager()
    try:
        client = SSHClient(endpoint=cluster_ip, username='******')
        if service_manager.has_service(name=service_name, client=client):
            service_manager.stop_service(name=service_name, client=client)
            service_manager.remove_service(name=service_name, client=client)
    except (UnableToConnectException, NotAuthenticatedException):
        # Node is unreachable or refuses our credentials: best effort, nothing to clean up
        pass
def check_alba_processes(result_handler):
    """
    Verify that every local ALBA service is in the 'active' state
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    :rtype: NoneType
    """
    result_handler.info('Checking LOCAL ALBA services: ', add_to_result=False)
    client = SSHClient(System.get_my_storagerouter())
    service_manager = ServiceFactory.get_manager()
    alba_services = [name for name in service_manager.list_services(client=client)
                     if name.startswith(AlbaHealthCheck.MODULE)]
    if not alba_services:
        result_handler.skip('Found no LOCAL ALBA services.')
        return
    for name in alba_services:
        if service_manager.get_service_status(name, client) == 'active':
            result_handler.success('Service {0} is running!'.format(name), code=ErrorCodes.alba_service_running)
        else:
            result_handler.failure('Service {0} is NOT running! '.format(name), code=ErrorCodes.alba_service_down)
def change_service_state(client, name, state, logger=None): """ Starts/stops/restarts a service :param client: SSHClient on which to connect and change service state :param name: Name of the service :param state: State to put the service in :param logger: LogHandler Object """ service_manager = ServiceFactory.get_manager() action = None status = service_manager.get_service_status(name, client=client) if status != 'active' and state in ['start', 'restart']: if logger is not None: logger.debug('{0}: Starting service {1}'.format( client.ip, name)) service_manager.start_service(name, client=client) action = 'Started' elif status == 'active' and state == 'stop': if logger is not None: logger.debug('{0}: Stopping service {1}'.format( client.ip, name)) service_manager.stop_service(name, client=client) action = 'Stopped' elif status == 'active' and state == 'restart': if logger is not None: logger.debug('{0}: Restarting service {1}'.format( client.ip, name)) service_manager.restart_service(name, client=client) action = 'Restarted' if action is None: print ' [{0}] {1} already {2}'.format( client.ip, name, 'running' if status == 'active' else 'halted') else: if logger is not None: logger.debug('{0}: {1} service {2}'.format( client.ip, action, name)) print ' [{0}] {1} {2}'.format(client.ip, name, action.lower())
def __init__(self, vdisk_guid):
    # type: (str) -> None
    """
    Initializes a new MDSCatchUp
    An instance populates some caches. These cached are cleared once the instance is garbage collected.
    When running MDSCatchup in bulk: add them to a list to speed up the process
    :param vdisk_guid: Guid of the vDisk to catch up for
    :type vdisk_guid: str
    """
    self.id = str(uuid.uuid4())  # Unique id for this catch-up run; embedded in the log prefix below
    self.vdisk = VDisk(vdisk_guid)
    self.mds_key = self._CATCH_UP_VDISK_KEY.format(self.vdisk.guid)
    # Config key name suggests: number of tlogs a slave may lag behind (default 100) — TODO confirm semantics
    self.tlog_threshold = Configuration.get('ovs/volumedriver/mds|tlogs_behind', default=100)
    self.volumedriver_service_name = 'ovs-volumedriver_{0}'.format(self.vdisk.vpool.name)
    self.mds_client_timeout = Configuration.get('ovs/vpools/{0}/mds_config|mds_client_connection_timeout'.format(self.vdisk.vpool_guid), default=120)
    self.mds_clients = {}
    self.dry_run = False  # presumably suppresses the actual catch-up work when True — verify against callers
    self.catch_up_threads = []
    self.errors = []
    self._service_manager = ServiceFactory.get_manager()
    self._persistent = PersistentFactory.get_client()
    self._log = 'MDS catchup {0} - vDisk {1} (volume id: {2})'.format(self.id, self.vdisk.guid, self.vdisk.volume_id)
    self._clients = self.build_clients()
    self._volumedriver_contexts = self.get_volumedriver_contexts()
    self._worker_contexts = self.get_worker_contexts()
    self._worker_context = self._worker_contexts[System.get_my_storagerouter()]
    self._relevant_contexts = self._get_all_relevant_contexts()  # All possible contexts (by mixing volumedriver ones with workers)
def change_scheduled_task(task_name, state, disabled=False, cron=None):
    """
    Add, disable or remove a scheduled task in the celery configuration and
    restart the scheduled-tasks service on all master nodes to apply the change.
    :param task_name: name of the scheduled task
    :type task_name: str
    :param state: 'present' to add/disable the task; any other value removes it
    :type state: str
    :param disabled: when True the task is registered as disabled
    :type disabled: bool
    :param cron: cron settings (required when state is 'present' and disabled is False)
    :type cron: dict
    :return: human readable description of the applied change
    :rtype: str
    :raises ValueError: when enabling a task without cron settings
    """
    if not Configuration.exists(celery_key):
        Configuration.set(celery_key, {})
    jobs = Configuration.get(celery_key)
    if state == 'present':
        if disabled:
            jobs[task_name] = None
            output = 'task {0}: disabled'.format(task_name)
        else:
            if cron is None:
                # Fix: previously this crashed with AttributeError on cron.iteritems()
                raise ValueError('Cron settings are required to enable task {0}'.format(task_name))
            jobs[task_name] = cron
            settings = ''
            for key, value in cron.iteritems():
                settings += "{0}: {1} ".format(key, value)
            output = 'task {0}: cron settings {1}'.format(task_name, settings)
    else:
        jobs.pop(task_name, None)
        output = 'task {0}: removed, default settings will be applied.'.format(task_name)
    Configuration.set(celery_key, jobs)
    # Restart the scheduled-tasks service on every master so the new schedule is picked up
    service_name = 'scheduled-tasks'
    service_manager = ServiceFactory.get_manager()
    for storagerouter in StorageRouterList.get_masters():
        client = SSHClient(storagerouter, username='******')
        service_manager.restart_service(service_name, client=client)
    return output
def remove_services(client, node_type, logger):
    """
    Remove all services managed by OVS
    :param client: Client on which to remove the services
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param node_type: Type of node, can be 'master' or 'extra'
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: None
    """
    Toolbox.log(logger=logger, messages='Removing services')
    service_manager = ServiceFactory.get_manager()
    stop_only = ['rabbitmq-server', 'memcached']
    services = ['workers', 'support-agent', 'watcher-framework']
    if node_type == 'master':
        services.extend(['scheduled-tasks', 'webapp-api', 'volumerouter-consumer'])
        # Externally managed rabbitmq/memcached instances are left untouched entirely
        if Toolbox.is_service_internally_managed(service='rabbitmq') is True:
            services.append('rabbitmq-server')
        if Toolbox.is_service_internally_managed(service='memcached') is True:
            services.append('memcached')
    for service in services:
        if not service_manager.has_service(service, client=client):
            continue
        verb = 'Removing' if service not in stop_only else 'Stopping'
        Toolbox.log(logger=logger, messages='{0} service {1}'.format(verb, service))
        service_manager.stop_service(service, client=client)
        # rabbitmq-server and memcached are only stopped, never removed
        if service not in stop_only:
            service_manager.remove_service(service, client=client)
def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed.
    This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...
    * Successfully finishing a piece of migration code, should create an entry in /ovs/framework/migration in case it should not be executed again
    *     Eg: /ovs/framework/migration|stats_monkey_integration: True

    Each section below is one migration; most guard themselves either with a
    /ovs/framework/migration flag or by checking whether the change is already
    present on disk, so re-running this function is intended to be safe.
    :return: None
    """
    MigrationController._logger.info('Preparing out of band migrations...')

    # Function-local imports: these need the full framework to be importable,
    # which is only guaranteed at migration runtime
    from ovs.dal.lists.servicetypelist import ServiceTypeList
    from ovs.dal.lists.storagedriverlist import StorageDriverList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.vpoollist import VPoolList
    from ovs.extensions.db.arakooninstaller import ArakoonInstaller
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient
    from ovs_extensions.generic.toolbox import ExtensionsToolbox
    from ovs.extensions.migration.migration.ovsmigrator import ExtensionMigrator
    from ovs.extensions.packages.packagefactory import PackageFactory
    from ovs_extensions.services.interfaces.systemd import Systemd
    from ovs.extensions.services.servicefactory import ServiceFactory
    from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
    from ovs.lib.helpers.storagedriver.installer import StorageDriverInstaller

    MigrationController._logger.info('Start out of band migrations...')
    service_manager = ServiceFactory.get_manager()

    # Build one SSH client per StorageRouter, keyed on guid, re-used by every section below
    sr_client_map = {}
    for storagerouter in StorageRouterList.get_storagerouters():
        sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter.ip,  # Is triggered during post-update code too during which the ovs-watcher-framework service is still down and thus not refreshing the heartbeat --> use IP i/o StorageRouter
                                                      username='******')

    #########################################################
    # Addition of 'ExecReload' for AlbaProxy SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagedriver in StorageDriverList.get_storagedrivers():
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                service = alba_proxy.service
                service_name = 'ovs-{0}'.format(service.name)
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                # Unit file already contains ExecReload --> migration already applied
                if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    ##################################################################
    # Adjustment of open file descriptors for Arakoon services to 8192
    changed_clients = set()
    for storagerouter in StorageRouterList.get_storagerouters():
        root_client = sr_client_map[storagerouter.guid]
        for service_name in service_manager.list_services(client=root_client):
            if not service_name.startswith('ovs-arakoon-'):
                continue
            # Path and marker differ between systemd and upstart (init) service files
            if ServiceFactory.get_service_type() == 'systemd':
                path = '/lib/systemd/system/{0}.service'.format(service_name)
                check = 'LimitNOFILE=8192'
            else:
                path = '/etc/init/{0}.conf'.format(service_name)
                check = 'limit nofile 8192 8192'
            if not root_client.file_exists(path):
                continue
            if check in root_client.file_read(path):
                continue
            try:
                service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                changed_clients.add(root_client)
                # Regenerating the service requires a restart --> flag it via the version file
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='arakoon',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
            except:
                MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
    for root_client in changed_clients:
        root_client.run(['systemctl', 'daemon-reload'])

    #############################
    # Migrate to multiple proxies
    for storagedriver in StorageDriverList.get_storagedrivers():
        vpool = storagedriver.vpool
        root_client = sr_client_map[storagedriver.storagerouter_guid]
        for alba_proxy in storagedriver.alba_proxies:
            # Rename alba_proxy service in model: 'albaproxy_<vpool>' becomes 'albaproxy_<vpool>_0'
            service = alba_proxy.service
            old_service_name = 'albaproxy_{0}'.format(vpool.name)
            new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
            if old_service_name != service.name:
                continue
            service.name = new_service_name
            service.save()

            if not service_manager.has_service(name=old_service_name, client=root_client):
                continue
            old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
            if not Configuration.exists(key=old_configuration_key):
                continue

            # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='alba',
                                                old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, old_service_name),
                                                new_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, new_service_name))

            # Register new service and remove old service
            service_manager.add_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                        client=root_client,
                                        params=Configuration.get(old_configuration_key),
                                        target_name='ovs-{0}'.format(new_service_name))

            # Update scrub proxy config
            proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
            proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
            if proxy_config is not None:
                fragment_cache = proxy_config.get(StorageDriverConfiguration.CACHE_FRAGMENT, ['none', {}])
                if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                    # Scrubbing must not pollute the read cache --> clone config with cache_on_read disabled
                    fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                    fragment_cache_scrub_info[1]['cache_on_read'] = False
                    proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                    proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                    if proxy_scrub_config is not None and proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] == ['none']:
                        proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] = fragment_cache_scrub_info
                        Configuration.set(key=proxy_scrub_config_key, value=proxy_scrub_config)

        # Update 'backend_connection_manager' section
        changes = False
        storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
        if 'backend_connection_manager' not in storagedriver_config.configuration:
            continue

        current_config = storagedriver_config.configuration['backend_connection_manager']
        if current_config.get('backend_type') != 'MULTI':
            # Convert single-proxy config into a MULTI config: one numbered sub-section per proxy
            changes = True
            backend_connection_manager = {'backend_type': 'MULTI'}
            for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                # 'backend_interface' keys belong at the top level, not inside the numbered sub-sections
                # noinspection PyUnresolvedReferences
                for key, value in backend_connection_manager[str(index)].items():
                    if key.startswith('backend_interface'):
                        backend_connection_manager[key] = value
                        # noinspection PyUnresolvedReferences
                        del backend_connection_manager[str(index)][key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    backend_connection_manager[key] = value
        else:
            # Already MULTI: hoist any 'backend_interface' keys left in the sub-sections
            backend_connection_manager = current_config
            for value in backend_connection_manager.values():
                if isinstance(value, dict):
                    for key, val in value.items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = val
                            changes = True
                            del value[key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    changes = True
                    backend_connection_manager[key] = value

        if changes is True:
            storagedriver_config.clear_backend_connection_manager()
            storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
            storagedriver_config.save(root_client)

            # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='volumedriver',
                                                old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
            if service_manager.__class__ == Systemd:
                root_client.run(['systemctl', 'daemon-reload'])

    ########################################
    # Update metadata_store_bits information
    vpools = VPoolList.get_vpools()
    for vpool in vpools:
        bits = None
        for storagedriver in vpool.storagedrivers:
            key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
            if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                if bits is None:
                    # Extract the value once from the actual service file; fall back to 5 if unparseable
                    entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                        client=sr_client_map[storagedriver.storagerouter_guid],
                                                                        entries=['METADATASTORE_BITS='])
                    if len(entries) == 1:
                        bits = entries[0].split('=')[-1]
                        bits = int(bits) if bits.isdigit() else 5
                if bits is not None:
                    try:
                        content = Configuration.get(key=key)
                        content['METADATASTORE_BITS'] = bits
                        Configuration.set(key=key, value=content)
                    except:
                        MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))
        if bits is not None:
            vpool.metadata_store_bits = bits
            vpool.save()

    #####################################
    # Update the vPool metadata structure
    def _update_metadata_structure(metadata):
        """
        Convert the old flat vPool metadata layout into the new nested layout:
        arakoon/connection info moves under 'backend_info' and per-storagerouter
        caching data moves under a new top-level 'caching_info' key.
        Operates on (and returns) a deep copy; the input dict is not mutated.
        """
        metadata = copy.deepcopy(metadata)
        cache_structure = {'read': False,
                           'write': False,
                           'is_backend': False,
                           'quota': None,
                           'backend_info': {'name': None,  # Will be filled in when is_backend is true
                                            'backend_guid': None,
                                            'alba_backend_guid': None,
                                            'policies': None,
                                            'preset': None,
                                            'arakoon_config': None,
                                            'connection_info': {'client_id': None,
                                                                'client_secret': None,
                                                                'host': None,
                                                                'port': None,
                                                                'local': None}}
                           }
        # Maps new structure keys to the old metadata keys, per cache type
        structure_map = {StorageDriverConfiguration.CACHE_BLOCK: {'read': 'block_cache_on_read',
                                                                  'write': 'block_cache_on_write',
                                                                  'quota': 'quota_bc',
                                                                  'backend_prefix': 'backend_bc_{0}'},
                         StorageDriverConfiguration.CACHE_FRAGMENT: {'read': 'fragment_cache_on_read',
                                                                     'write': 'fragment_cache_on_write',
                                                                     'quota': 'quota_fc',
                                                                     'backend_prefix': 'backend_aa_{0}'}}
        if 'arakoon_config' in metadata['backend']:  # Arakoon config should be placed under the backend info
            metadata['backend']['backend_info']['arakoon_config'] = metadata['backend'].pop('arakoon_config')
        if 'connection_info' in metadata['backend']:  # Connection info should be placed under the backend info
            metadata['backend']['backend_info']['connection_info'] = metadata['backend'].pop('connection_info')
        if 'caching_info' not in metadata:  # Caching info is the new key
            would_be_caching_info = {}
            metadata['caching_info'] = would_be_caching_info
            # Extract all caching data for every storagerouter
            current_caching_info = metadata['backend'].pop('caching_info')  # Pop to mutate metadata
            for storagerouter_guid in current_caching_info.iterkeys():
                current_cache_data = current_caching_info[storagerouter_guid]
                storagerouter_caching_info = {}
                would_be_caching_info[storagerouter_guid] = storagerouter_caching_info
                for cache_type, cache_type_mapping in structure_map.iteritems():
                    new_cache_structure = copy.deepcopy(cache_structure)
                    storagerouter_caching_info[cache_type] = new_cache_structure
                    for new_structure_key, old_structure_key in cache_type_mapping.iteritems():
                        if new_structure_key == 'backend_prefix':
                            # Get possible backend related info
                            metadata_key = old_structure_key.format(storagerouter_guid)
                            if metadata_key not in metadata:
                                continue
                            backend_data = metadata.pop(metadata_key)  # Pop to mutate metadata
                            new_cache_structure['is_backend'] = True
                            # Copy over the old data
                            new_cache_structure['backend_info']['arakoon_config'] = backend_data['arakoon_config']
                            new_cache_structure['backend_info'].update(backend_data['backend_info'])
                            new_cache_structure['backend_info']['connection_info'].update(backend_data['connection_info'])
                        else:
                            new_cache_structure[new_structure_key] = current_cache_data.get(old_structure_key)
        return metadata

    vpools = VPoolList.get_vpools()
    for vpool in vpools:
        try:
            new_metadata = _update_metadata_structure(vpool.metadata)
            vpool.metadata = new_metadata
            vpool.save()
        except KeyError:
            MigrationController._logger.exception('Exceptions occurred when updating the metadata for vPool {0}'.format(vpool.name))

    ##############################################
    # Always use indent=4 during Configuration set
    def _resave_all_config_entries(config_path='/ovs'):
        """
        Recursive function which checks every config management key if it is a directory or not.
        If not a directory, we retrieve the config and just save it again using the new indentation logic
        """
        for item in Configuration.list(config_path):
            new_path = config_path + '/' + item
            print new_path
            if Configuration.dir_exists(new_path) is True:
                _resave_all_config_entries(config_path=new_path)
            else:
                try:
                    # Round-trip through get/set re-serializes the value with the new indentation
                    _config = Configuration.get(new_path)
                    Configuration.set(new_path, _config)
                except:
                    # Non-JSON entries can only be handled raw
                    _config = Configuration.get(new_path, raw=True)
                    Configuration.set(new_path, _config, raw=True)
    if ExtensionMigrator.THIS_VERSION <= 13:  # There is no way of checking whether this new indentation logic has been applied, so we only perform this for version 13 and lower
        MigrationController._logger.info('Re-saving every configuration setting with new indentation rules')
        _resave_all_config_entries()

    ############################
    # Update some default values
    def _update_manifest_cache_size(_proxy_config_key):
        """
        Force the manifest cache size of the proxy config at the given key to 500MiB,
        both at top level and inside 'alba'-backed cache sections.
        Returns True when the stored config was actually modified.
        """
        updated = False
        manifest_cache_size = 500 * 1024 * 1024
        if Configuration.exists(key=_proxy_config_key):
            _proxy_config = Configuration.get(key=_proxy_config_key)
            for cache_type in [StorageDriverConfiguration.CACHE_BLOCK, StorageDriverConfiguration.CACHE_FRAGMENT]:
                if cache_type in _proxy_config and _proxy_config[cache_type][0] == 'alba':
                    if _proxy_config[cache_type][1]['manifest_cache_size'] != manifest_cache_size:
                        updated = True
                        _proxy_config[cache_type][1]['manifest_cache_size'] = manifest_cache_size
            if _proxy_config['manifest_cache_size'] != manifest_cache_size:
                updated = True
                _proxy_config['manifest_cache_size'] = manifest_cache_size
            if updated is True:
                Configuration.set(key=_proxy_config_key, value=_proxy_config)
        return updated

    for storagedriver in StorageDriverList.get_storagedrivers():
        try:
            vpool = storagedriver.vpool
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            _update_manifest_cache_size('/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))  # Generic scrub proxy is deployed every time scrubbing kicks in, so no need to restart these services
            for alba_proxy in storagedriver.alba_proxies:
                if _update_manifest_cache_size('/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)) is True:
                    # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='alba',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, alba_proxy.service.name))

            # Update 'backend_connection_manager' section
            changes = False
            storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
            if 'backend_connection_manager' not in storagedriver_config.configuration:
                continue

            current_config = storagedriver_config.configuration['backend_connection_manager']
            for key, value in current_config.iteritems():
                if key.isdigit() is True:  # Numbered keys are the per-proxy sub-sections of a MULTI config
                    if value.get('alba_connection_asd_connection_pool_capacity') != 10:
                        changes = True
                        value['alba_connection_asd_connection_pool_capacity'] = 10
                    if value.get('alba_connection_timeout') != 30:
                        changes = True
                        value['alba_connection_timeout'] = 30
                    if value.get('alba_connection_rora_manifest_cache_capacity') != 25000:
                        changes = True
                        value['alba_connection_rora_manifest_cache_capacity'] = 25000

            if changes is True:
                storagedriver_config.clear_backend_connection_manager()
                storagedriver_config.configure_backend_connection_manager(**current_config)
                storagedriver_config.save(root_client)

                # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='volumedriver',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
        except Exception:
            MigrationController._logger.exception('Updating default configuration values failed for StorageDriver {0}'.format(storagedriver.storagedriver_id))

    ####################################################
    # Adding proxy fail fast as env variable for proxies
    changed_clients = set()
    for storagerouter in StorageRouterList.get_storagerouters():
        root_client = sr_client_map[storagerouter.guid]
        for service_name in service_manager.list_services(client=root_client):
            if not service_name.startswith('ovs-albaproxy_'):
                continue
            # Path and marker differ between systemd and upstart (init) service files
            if ServiceFactory.get_service_type() == 'systemd':
                path = '/lib/systemd/system/{0}.service'.format(service_name)
                check = 'Environment=ALBA_FAIL_FAST=true'
            else:
                path = '/etc/init/{0}.conf'.format(service_name)
                check = 'env ALBA_FAIL_FAST=true'
            if not root_client.file_exists(path):
                continue
            if check in root_client.file_read(path):
                continue
            try:
                service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                changed_clients.add(root_client)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='alba',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
            except:
                MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
    for root_client in changed_clients:
        root_client.run(['systemctl', 'daemon-reload'])

    ######################################
    # Integration of stats monkey (2.10.2)
    if Configuration.get(key='/ovs/framework/migration|stats_monkey_integration', default=False) is False:
        try:
            # Get content of old key into new key
            old_stats_monkey_key = '/statsmonkey/statsmonkey'
            if Configuration.exists(key=old_stats_monkey_key) is True:
                Configuration.set(key='/ovs/framework/monitoring/stats_monkey', value=Configuration.get(key=old_stats_monkey_key))
                Configuration.delete(key=old_stats_monkey_key)

            # Make sure to disable the stats monkey by default or take over the current schedule if it was configured manually before
            celery_key = '/ovs/framework/scheduling/celery'
            current_value = None
            scheduling_config = Configuration.get(key=celery_key, default={})
            if 'statsmonkey.run_all_stats' in scheduling_config:  # Old celery task name of the stats monkey
                current_value = scheduling_config.pop('statsmonkey.run_all_stats')
            scheduling_config['ovs.stats_monkey.run_all'] = current_value
            scheduling_config['alba.stats_monkey.run_all'] = current_value
            Configuration.set(key=celery_key, value=scheduling_config)

            # Rename support keys: 'enabled' -> 'support_agent', 'enablesupport' -> 'remote_access'
            support_key = '/ovs/framework/support'
            support_config = Configuration.get(key=support_key)
            support_config['support_agent'] = support_config.pop('enabled', True)
            support_config['remote_access'] = support_config.pop('enablesupport', False)
            Configuration.set(key=support_key, value=support_config)

            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|stats_monkey_integration', value=True)
        except Exception:
            MigrationController._logger.exception('Integration of stats monkey failed')

    ######################################################
    # Write away cluster ID to a file for back-up purposes
    try:
        cluster_id = Configuration.get(key='/ovs/framework/cluster_id', default=None)
        with open(Configuration.CONFIG_STORE_LOCATION, 'r') as config_file:
            config = json.load(config_file)
        if cluster_id is not None and config.get('cluster_id', None) is None:
            config['cluster_id'] = cluster_id
            with open(Configuration.CONFIG_STORE_LOCATION, 'w') as config_file:
                json.dump(config, config_file, indent=4)
    except Exception:
        MigrationController._logger.exception('Writing cluster id to a file failed.')

    #########################################################
    # Additional string formatting in Arakoon services (2.11)
    try:
        if Configuration.get(key='/ovs/framework/migration|arakoon_service_update', default=False) is False:
            arakoon_service_names = [ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name) for cluster_name in Configuration.list(key='ovs/arakoon')]
            for storagerouter in StorageRouterList.get_masters():
                for service_name in arakoon_service_names:
                    config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                    if Configuration.exists(key=config_key):
                        config = Configuration.get(key=config_key)
                        config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                        config['ARAKOON_PKG_NAME'] = PackageFactory.PKG_ARAKOON
                        config['ARAKOON_VERSION_CMD'] = PackageFactory.VERSION_CMD_ARAKOON
                        Configuration.set(key=config_key, value=config)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|arakoon_service_update', value=True)
    except Exception:
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    ############################################################
    # Additional string formatting in ALBA proxy services (2.11)
    changed_clients = set()
    try:
        if Configuration.get(key='/ovs/framework/migration|alba_proxy_service_update', default=False) is False:
            alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
            for service in ServiceTypeList.get_by_name('AlbaProxy').services:
                root_client = sr_client_map[service.storagerouter_guid]
                config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(service.storagerouter.machine_id, service.name)
                if Configuration.exists(key=config_key):
                    config = Configuration.get(key=config_key)
                    config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                    config['ALBA_PKG_NAME'] = alba_pkg_name
                    config['ALBA_VERSION_CMD'] = alba_version_cmd
                    Configuration.set(key=config_key, value=config)
                    service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                                       client=root_client,
                                                       target_name='ovs-{0}'.format(service.name))
                    changed_clients.add(root_client)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|alba_proxy_service_update', value=True)
    except Exception:
        # NOTE(review): message mentions Arakoon but this section handles ALBA proxy services -- looks copy-pasted
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    ############################################################
    # Additional string formatting in DTL/VOLDRV services (2.11)
    try:
        if Configuration.get(key='/ovs/framework/migration|voldrv_service_update', default=False) is False:
            sd_pkg_name, sd_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
            for vpool in VPoolList.get_vpools():
                for storagedriver in vpool.storagedrivers:
                    root_client = sr_client_map[storagedriver.storagerouter_guid]
                    for entry in ['dtl', 'volumedriver']:
                        service_name = '{0}_{1}'.format(entry, vpool.name)
                        service_template = StorageDriverInstaller.SERVICE_TEMPLATE_DTL if entry == 'dtl' else StorageDriverInstaller.SERVICE_TEMPLATE_SD
                        config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagedriver.storagerouter.machine_id, service_name)
                        if Configuration.exists(key=config_key):
                            config = Configuration.get(key=config_key)
                            config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                            config['VOLDRV_PKG_NAME'] = sd_pkg_name
                            config['VOLDRV_VERSION_CMD'] = sd_version_cmd
                            Configuration.set(key=config_key, value=config)
                            service_manager.regenerate_service(name=service_template,
                                                               client=root_client,
                                                               target_name='ovs-{0}'.format(service_name))
                            changed_clients.add(root_client)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|voldrv_service_update', value=True)
    except Exception:
        # NOTE(review): message mentions Arakoon but this section handles DTL/VOLDRV services -- looks copy-pasted
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    #######################################################
    # Storing actual package name in version files (2.11.0) (https://github.com/openvstorage/framework/issues/1876)
    if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file', default=False) is False:
        try:
            voldrv_pkg_name, _ = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
            for storagerouter in StorageRouterList.get_storagerouters():
                root_client = sr_client_map.get(storagerouter.guid)
                if root_client is None:
                    continue
                for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                    if not file_name.endswith('.version'):
                        continue
                    file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                    contents = root_client.file_read(filename=file_path)
                    regenerate = False
                    # Replace the generic 'volumedriver-server' name by the actual installed package name
                    if voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER:
                        if 'volumedriver-server' in contents:
                            regenerate = True
                            contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER)
                            root_client.file_write(filename=file_path, contents=contents)
                    elif voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER_EE:
                        if 'volumedriver-server' in contents or PackageFactory.PKG_VOLDRV_SERVER in contents:
                            regenerate = True
                            contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER_EE)
                            contents = contents.replace(PackageFactory.PKG_VOLDRV_SERVER, PackageFactory.PKG_VOLDRV_SERVER_EE)
                            root_client.file_write(filename=file_path, contents=contents)
                    if regenerate is True:
                        service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_DTL if file_name.startswith('dtl') else StorageDriverInstaller.SERVICE_TEMPLATE_SD,
                                                           client=root_client,
                                                           target_name='ovs-{0}'.format(file_name.split('.')[0]))  # Leave out .version
                        changed_clients.add(root_client)
            Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file', value=True)
        except Exception:
            MigrationController._logger.exception('Updating actual package name for version files failed')

    for root_client in changed_clients:
        try:
            root_client.run(['systemctl', 'daemon-reload'])
        except Exception:
            MigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

    #########################################################
    # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for AlbaProxy SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagedriver in StorageDriverList.get_storagedrivers():
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                service = alba_proxy.service
                service_name = 'ovs-{0}'.format(service.name)
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                # Marker already present in the unit file --> migration already applied
                if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    #########################################################
    # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for Arakoon SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-arakoon-'):
                    continue
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                # Marker already present in the unit file --> migration already applied
                if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    MigrationController._logger.info('Finished out of band migrations')
def remove_node(node_ip, silent=None):
    """
    Remove the node with specified IP from the cluster
    :param node_ip: IP of the node to remove
    :type node_ip: str
    :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
    :type silent: str
    :return: None
    """
    # Local imports to avoid circular dependencies at module load time
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.lib.storagedriver import StorageDriverController
    from ovs.lib.vpool import VPoolController

    Toolbox.log(logger=NodeRemovalController._logger, messages='Remove node', boxed=True)
    Toolbox.log(
        logger=NodeRemovalController._logger,
        messages='WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n')
    service_manager = ServiceFactory.get_manager()

    ###############
    # VALIDATIONS #
    ###############
    # Phase 1: validate the request. Any failure here exits the process before
    # anything has been changed on the cluster.
    try:
        node_ip = node_ip.strip()
        # NOTE(review): .strip() is called before this isinstance check, so a
        # non-string argument raises AttributeError before reaching the
        # ValueError below — confirm whether the check is still useful here.
        if not isinstance(node_ip, str):
            raise ValueError('Node IP must be a string')
        if not re.match(SSHClient.IP_REGEX, node_ip):
            raise ValueError('Invalid IP {0} specified'.format(node_ip))

        storage_router_all = sorted(StorageRouterList.get_storagerouters(), key=lambda k: k.name)
        storage_router_masters = StorageRouterList.get_masters()
        storage_router_all_ips = set([storage_router.ip for storage_router in storage_router_all])
        storage_router_master_ips = set([storage_router.ip for storage_router in storage_router_masters])
        storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)
        offline_reasons = {}  # IP -> human-readable reason the node could not be reached
        if node_ip not in storage_router_all_ips:
            raise ValueError(
                'Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}'
                .format('\n - '.join(storage_router_all_ips), node_ip))

        # The cluster must keep at least one node and at least one master
        if len(storage_router_all_ips) == 1:
            raise RuntimeError("Removing the only node is not possible")
        if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
            raise RuntimeError("Removing the only master node is not possible")
        # Removal must be driven from another node than the one being removed
        if System.get_my_storagerouter() == storage_router_to_remove:
            raise RuntimeError('The node to be removed cannot be identical to the node on which the removal is initiated')

        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages='Creating SSH connections to remaining master nodes')
        master_ip = None
        ip_client_map = {}  # IP -> connected SSHClient for every reachable node
        storage_routers_offline = []
        storage_router_to_remove_online = True
        for storage_router in storage_router_all:
            try:
                client = SSHClient(storage_router, username='******', timeout=10)
            except (UnableToConnectException, NotAuthenticatedException, TimeOutException) as ex:
                # Record why the node is unreachable; it will be treated as offline
                if isinstance(ex, UnableToConnectException):
                    msg = 'Unable to connect'
                elif isinstance(ex, NotAuthenticatedException):
                    msg = 'Could not authenticate'
                elif isinstance(ex, TimeOutException):
                    msg = 'Connection timed out'
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages=' * Node with IP {0:<15}- {1}'.format(storage_router.ip, msg))
                offline_reasons[storage_router.ip] = msg
                storage_routers_offline.append(storage_router)
                if storage_router == storage_router_to_remove:
                    storage_router_to_remove_online = False
                continue

            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=' * Node with IP {0:<15}- Successfully connected'.format(storage_router.ip))
            ip_client_map[storage_router.ip] = client
            # Remember any reachable master (other than the node being removed)
            # to drive the demote step later on
            if storage_router != storage_router_to_remove and storage_router.node_type == 'MASTER':
                master_ip = storage_router.ip

        if len(ip_client_map) == 0 or master_ip is None:
            raise RuntimeError('Could not connect to any master node in the cluster')

        storage_router_to_remove.invalidate_dynamics('vdisks_guids')
        if len(storage_router_to_remove.vdisks_guids) > 0:  # vDisks are supposed to be moved away manually before removing a node
            raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

        # Refuse removal when it would take down the last internally managed
        # memcached / rabbitmq endpoint of the cluster
        internal_memcached = Toolbox.is_service_internally_managed(service='memcached')
        internal_rabbit_mq = Toolbox.is_service_internally_managed(service='rabbitmq')
        memcached_endpoints = Configuration.get(key='/ovs/framework/memcache|endpoints')
        rabbit_mq_endpoints = Configuration.get(key='/ovs/framework/messagequeue|endpoints')
        copy_memcached_endpoints = list(memcached_endpoints)
        copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
        for endpoint in memcached_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_memcached_endpoints.remove(endpoint)
        for endpoint in rabbit_mq_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_rabbit_mq_endpoints.remove(endpoint)
        if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
            raise RuntimeError(
                'Removal of provided nodes will result in a complete removal of the memcached service')
        if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
            raise RuntimeError(
                'Removal of provided nodes will result in a complete removal of the messagequeue service')

        # Give plugins a chance to veto the removal
        Toolbox.run_hooks(component='noderemoval',
                          sub_component='validate_removal',
                          logger=NodeRemovalController._logger,
                          cluster_ip=storage_router_to_remove.ip)
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages='Removal has been aborted during the validation step. No changes have been applied.',
            boxed=True,
            loglevel='warning')
        sys.exit(1)
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages=[str(exception)],
                    boxed=True,
                    loglevel='exception')
        sys.exit(1)

    #################
    # CONFIRMATIONS #
    #################
    # Phase 2: interactive confirmations (skipped entirely with '--force-yes')
    try:
        interactive = silent != '--force-yes'
        remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
        if interactive is True:
            if len(storage_routers_offline) > 0:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages='Certain nodes appear to be offline. These will not fully removed and will cause issues if they are not really offline.')
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages='Offline nodes: {0}'.format(''.join(
                        ('\n * {0:<15}- {1}.'.format(ip, message)
                         for ip, message in offline_reasons.iteritems()))))
                valid_node_info = Interactive.ask_yesno(
                    message='Continue the removal with these being presumably offline?',
                    default_value=False)
                if valid_node_info is False:
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages='Please validate the state of the nodes before removing.',
                        title=True)
                    sys.exit(1)

            proceed = Interactive.ask_yesno(
                message='Are you sure you want to remove node {0}?'.format(storage_router_to_remove.name),
                default_value=False)
            if proceed is False:
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='Abort removal',
                            title=True)
                sys.exit(1)

            # In interactive mode default to removing the ASD manager, unless the
            # node is online and the operator answers 'no' below
            remove_asd_manager = True
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username='******')
                if service_manager.has_service(name='asd-manager', client=client):
                    remove_asd_manager = Interactive.ask_yesno(
                        message='Do you also want to remove the ASD manager and related ASDs?',
                        default_value=False)

        if remove_asd_manager is True or storage_router_to_remove_online is False:
            # Plugins may ask an extra confirmation before ASDs are removed
            for fct in Toolbox.fetch_hooks('noderemoval', 'validate_asd_removal'):
                validation_output = fct(storage_router_to_remove.ip)
                if validation_output['confirm'] is True:
                    if Interactive.ask_yesno(message=validation_output['question'],
                                             default_value=False) is False:
                        remove_asd_manager = False
                        break
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages='Removal has been aborted during the confirmation step. No changes have been applied.',
            boxed=True,
            loglevel='warning')
        sys.exit(1)
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages=[str(exception)],
                    boxed=True,
                    loglevel='exception')
        sys.exit(1)

    ###########
    # REMOVAL #
    ###########
    # Phase 3: actual removal — from here on the cluster state is being changed
    try:
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Starting removal of node {0} - {1}'.format(
                        storage_router_to_remove.name, storage_router_to_remove.ip))
        if storage_router_to_remove_online is False:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=' Marking all Storage Drivers served by Storage Router {0} as offline'.format(storage_router_to_remove.ip))
            StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

        # Remove vPools
        # NOTE(review): the .format() argument below is unused — the message
        # contains no placeholder; harmless, but probably meant to name the node
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages=' Removing vPools from node'.format(storage_router_to_remove.ip))
        storage_routers_offline_guids = [
            sr.guid for sr in storage_routers_offline
            if sr.guid != storage_router_to_remove.guid
        ]
        for storage_driver in storage_router_to_remove.storagedrivers:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=' Removing vPool {0} from node'.format(storage_driver.vpool.name))
            VPoolController.shrink_vpool(
                storagedriver_guid=storage_driver.guid,
                offline_storage_router_guids=storage_routers_offline_guids)

        # Demote if MASTER
        if storage_router_to_remove.node_type == 'MASTER':
            NodeTypeController.demote_node(
                cluster_ip=storage_router_to_remove.ip,
                master_ip=master_ip,
                ip_client_map=ip_client_map,
                unique_id=storage_router_to_remove.machine_id,
                unconfigure_memcached=internal_memcached,
                unconfigure_rabbitmq=internal_rabbit_mq,
                offline_nodes=storage_routers_offline)

        # Stop / remove services
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Stopping and removing services')
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username='******')
            NodeRemovalController.remove_services(
                client=client,
                node_type=storage_router_to_remove.node_type.lower(),
                logger=NodeRemovalController._logger)
            service = 'watcher-config'
            if service_manager.has_service(service, client=client):
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='Removing service {0}'.format(service))
                service_manager.stop_service(service, client=client)
                service_manager.remove_service(service, client=client)

        Toolbox.run_hooks(component='noderemoval',
                          sub_component='remove',
                          logger=NodeRemovalController._logger,
                          cluster_ip=storage_router_to_remove.ip,
                          complete_removal=remove_asd_manager)

        # Clean up model: delete all DAL junctions before the StorageRouter itself
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Removing node from model')
        for service in storage_router_to_remove.services:
            service.delete()
        for disk in storage_router_to_remove.disks:
            for partition in disk.partitions:
                partition.delete()
            disk.delete()
        for j_domain in storage_router_to_remove.domains:
            j_domain.delete()
        Configuration.delete('/ovs/framework/hosts/{0}'.format(storage_router_to_remove.machine_id))

        NodeTypeController.restart_framework_and_memcache_services(
            clients=ip_client_map,
            offline_node_ips=[node.ip for node in storage_routers_offline],
            logger=NodeRemovalController._logger)

        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username='******')
            client.file_delete(filenames=[CACC_LOCATION])
            client.file_delete(filenames=[CONFIG_STORE_LOCATION])
        storage_router_to_remove.delete()
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Successfully removed node\n')
    # NOTE: KeyboardInterrupt is caught after Exception; this works because
    # KeyboardInterrupt derives from BaseException, not Exception
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=['An unexpected error occurred:', str(exception)],
            boxed=True,
            loglevel='exception')
        sys.exit(1)
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages='This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
            boxed=True,
            loglevel='error')
        sys.exit(1)

    # Optionally remove the ASD manager on the (still online) removed node
    if remove_asd_manager is True and storage_router_to_remove_online is True:
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='\nRemoving ASD Manager')
        with remote(storage_router_to_remove.ip, [os]) as rem:
            rem.os.system('asd-manager remove --force-yes')
    Toolbox.log(logger=NodeRemovalController._logger,
                messages='Remove nodes finished',
                title=True)
def _get_service_manager(cls):
    """
    Resolve and return the service manager for the current system.
    :return: The service manager instance provided by the ServiceFactory
    """
    manager = ServiceFactory.get_manager()
    return manager
class VPoolController(object):
    """
    Contains all BLL related to VPools
    """
    _logger = Logger('lib')
    # Shared service manager used for service lifecycle operations on nodes
    _service_manager = ServiceFactory.get_manager()

    @classmethod
    @ovs_task(name='ovs.storagerouter.add_vpool')
    def add_vpool(cls, parameters):
        """
        Add a vPool to the machine this task is running on
        :param parameters: Parameters for vPool creation
        :type parameters: dict
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
        # VALIDATIONS
        if not isinstance(parameters, dict):
            raise ValueError('Parameters passed to create a vPool should be of type dict')

        # Check StorageRouter existence
        storagerouter = StorageRouterList.get_by_ip(ip=parameters.get('storagerouter_ip'))
        if storagerouter is None:
            raise RuntimeError('Could not find StorageRouter')

        # Validate requested vPool configurations
        vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
        vp_installer.validate(storagerouter=storagerouter)

        # Validate requested StorageDriver configurations
        cls._logger.info('vPool {0}: Validating StorageDriver configurations'.format(vp_installer.name))
        sd_installer = StorageDriverInstaller(
            vp_installer=vp_installer,
            configurations={'storage_ip': parameters.get('storage_ip'),
                            'caching_info': parameters.get('caching_info'),
                            'backend_info': {'main': parameters.get('backend_info'),
                                             StorageDriverConfiguration.CACHE_BLOCK: parameters.get('backend_info_bc'),
                                             StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('backend_info_fc')},
                            'connection_info': {'main': parameters.get('connection_info'),
                                                StorageDriverConfiguration.CACHE_BLOCK: parameters.get('connection_info_bc'),
                                                StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('connection_info_fc')},
                            'sd_configuration': parameters.get('config_params')})

        # Mutex guarding concurrent partition modeling on the same StorageRouter
        partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(storagerouter.guid))
        try:
            # VPOOL CREATION
            # Create the vPool as soon as possible in the process to be displayed in the GUI (INSTALLING/EXTENDING state)
            if vp_installer.is_new is True:
                vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
                vp_installer.configure_mds(config=parameters.get('mds_config_params', {}))
            else:
                vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

            # ADDITIONAL VALIDATIONS
            # Check StorageRouter connectivity
            cls._logger.info('vPool {0}: Validating StorageRouter connectivity'.format(vp_installer.name))
            linked_storagerouters = [storagerouter]
            if vp_installer.is_new is False:
                linked_storagerouters += [sd.storagerouter for sd in vp_installer.vpool.storagedrivers]

            sr_client_map = SSHClient.get_clients(endpoints=linked_storagerouters,
                                                  user_names=['ovs', 'root'])
            offline_nodes = sr_client_map.pop('offline')
            if storagerouter in offline_nodes:
                raise RuntimeError(
                    'Node on which the vPool is being {0} is not reachable'.format(
                        'created' if vp_installer.is_new is True else 'extended'))

            sr_installer = StorageRouterInstaller(root_client=sr_client_map[storagerouter]['root'],
                                                  sd_installer=sd_installer,
                                                  vp_installer=vp_installer,
                                                  storagerouter=storagerouter)

            # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in locked context
            partitions_mutex.acquire(wait=60)
            sr_installer.partition_info = StorageRouterController.get_partition_info(storagerouter_guid=storagerouter.guid)
            sr_installer.validate_vpool_extendable()
            sr_installer.validate_global_write_buffer(requested_size=parameters.get('writecache_size', 0))
            sr_installer.validate_local_cache_size(requested_proxies=parameters.get('parallelism', {}).get('proxies', 2))

            # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
            sd_installer.create()
            sd_installer.create_partitions()
            partitions_mutex.release()

            vp_installer.refresh_metadata()
        except Exception:
            # Any validation/modeling failure reverts the vPool to RUNNING
            cls._logger.exception(
                'Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'
                .format(vp_installer.name, storagerouter.name))
            # NOTE(review): release() here may be called even if the mutex was
            # never acquired (failure before acquire) — assumed to be a no-op in
            # that case; confirm against volatile_mutex semantics
            partitions_mutex.release()
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise

        # Arakoon setup: retry the voldrv Arakoon checkup for up to ~300 seconds
        counter = 0
        while counter < 300:
            try:
                if StorageDriverController.manual_voldrv_arakoon_checkup() is True:
                    break
            except Exception:
                cls._logger.exception('Arakoon checkup for voldrv cluster failed')
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise
            counter += 1
            time.sleep(1)
            if counter == 300:
                # Timed out waiting for the Arakoon checkup to start
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
                raise RuntimeError('Arakoon checkup for the StorageDriver cluster could not be started')

        # Cluster registry
        try:
            vp_installer.configure_cluster_registry(allow_raise=True)
        except Exception:
            # A brand-new vPool can be fully reverted; an extended one is marked FAILURE
            if vp_installer.is_new is True:
                vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            else:
                vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
            raise

        try:
            sd_installer.setup_proxy_configs()
            sd_installer.configure_storagedriver_service()
            DiskController.sync_with_reality(storagerouter.guid)
            MDSServiceController.prepare_mds_service(storagerouter=storagerouter,
                                                     vpool=vp_installer.vpool)

            # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
            vp_installer.vpool.invalidate_dynamics('configuration')
            if vp_installer.mds_safety is not None and vp_installer.vpool.configuration['mds_config']['mds_safety'] != vp_installer.mds_safety:
                Configuration.set(
                    key='/ovs/vpools/{0}/mds_config|mds_safety'.format(vp_installer.vpool.guid),
                    value=vp_installer.mds_safety)

            sd_installer.start_services()  # Create and start watcher volumedriver, DTL, proxies and StorageDriver services

            # Post creation/extension checkups
            mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(
                vpool=vp_installer.vpool, offline_nodes=offline_nodes)
            for sr, clients in sr_client_map.iteritems():
                for current_storagedriver in [sd for sd in sr.storagedrivers
                                              if sd.vpool_guid == vp_installer.vpool.guid]:
                    storagedriver_config = StorageDriverConfiguration(
                        vpool_guid=vp_installer.vpool.guid,
                        storagedriver_id=current_storagedriver.storagedriver_id)
                    if storagedriver_config.config_missing is False:
                        # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                        # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                        storagedriver_config.configure_filesystem(
                            fs_metadata_backend_mds_nodes=mds_config_set[sr.guid])
                        storagedriver_config.save(client=clients['ovs'])

            # Everything's reconfigured, refresh new cluster configuration
            for current_storagedriver in vp_installer.vpool.storagedrivers:
                if current_storagedriver.storagerouter not in sr_client_map:
                    continue
                vp_installer.vpool.storagedriver_client.update_cluster_node_configs(
                    str(current_storagedriver.storagedriver_id), req_timeout_secs=10)
        except Exception:
            cls._logger.exception('vPool {0}: Creation failed'.format(vp_installer.name))
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise

        # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
        # Scheduled tasks below, so don't really care whether they succeed or not
        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                        ensure_single_timeout=600)
        except:
            pass
        for vdisk in vp_installer.vpool.vdisks:
            try:
                MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
            except:
                pass
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info('Add vPool {0} ended successfully'.format(vp_installer.name))

    # @classmethod
    # @ovs_task(name='ovs.vpool.extend_vpool')
    # def extend_vpool(cls, ...):
    # TODO: Make sure extend and create cannot be executed at the same time for the same vPool

    @classmethod
    @ovs_task(name='ovs.vpool.shrink_vpool')
    def shrink_vpool(cls, storagedriver_guid, offline_storage_router_guids=list()):
        # NOTE(review): mutable default argument `list()` — benign here because
        # the list is never mutated in this method, but `None` + fallback would
        # be the safer idiom
        """
        Removes a StorageDriver (if its the last StorageDriver for a vPool, the vPool is removed as well)
        :param storagedriver_guid: Guid of the StorageDriver to remove
        :type storagedriver_guid: str
        :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                             WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS
        :type offline_storage_router_guids: list
        :return: None
        :rtype: NoneType
        """
        # TODO: Add logging
        # TODO: Unit test individual pieces of code
        # Validations
        storagedriver = StorageDriver(storagedriver_guid)
        storagerouter = storagedriver.storagerouter
        cls._logger.info('StorageDriver {0} - Deleting StorageDriver {1}'.format(
            storagedriver.guid, storagedriver.name))

        vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
        vp_installer.validate(storagedriver=storagedriver)

        sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                              storagedriver=storagedriver)

        # NOTE(review): the second .format() argument below is unused — the
        # message only contains {0}
        cls._logger.info('StorageDriver {0} - Checking availability of related StorageRouters'.format(
            storagedriver.guid, storagedriver.name))
        sr_client_map = SSHClient.get_clients(
            endpoints=[sd.storagerouter for sd in vp_installer.vpool.storagedrivers],
            user_names=['root'])
        sr_installer = StorageRouterInstaller(
            root_client=sr_client_map.get(storagerouter, {}).get('root'),
            storagerouter=storagerouter,
            vp_installer=vp_installer,
            sd_installer=sd_installer)

        # The caller must pass exactly the set of offline StorageRouter guids
        offline_srs = sr_client_map.pop('offline')
        if sorted([sr.guid for sr in offline_srs]) != sorted(offline_storage_router_guids):
            raise RuntimeError('Not all StorageRouters are reachable')

        if storagerouter not in offline_srs:
            # Refuse removal while any process holds the vPool mount point open
            mtpt_pids = sr_installer.root_client.run(
                "lsof -t +D '/mnt/{0}' || true".format(vp_installer.name.replace(r"'", r"'\''")),
                allow_insecure=True).splitlines()
            if len(mtpt_pids) > 0:
                raise RuntimeError(
                    'vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'
                    .format(', '.join(mtpt_pids)))

        # Retrieve reachable StorageDrivers
        reachable_storagedrivers = []
        for sd in vp_installer.vpool.storagedrivers:
            if sd.storagerouter not in sr_client_map:
                # StorageRouter is offline
                continue

            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(
                vp_installer.vpool.guid, sd.storagedriver_id)
            if Configuration.exists(sd_key) is True:
                path = Configuration.get_configuration_path(sd_key)
                with remote(sd.storagerouter.ip, [LocalStorageRouterClient]) as rem:
                    try:
                        lsrc = rem.LocalStorageRouterClient(path)
                        lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                        cls._logger.info(
                            'StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'
                            .format(storagedriver.guid, sd.name, sd.storagerouter.ip))
                        reachable_storagedrivers.append(sd)
                    except Exception as exception:
                        # Connection failures just mean the node is unreachable;
                        # anything else is unexpected and re-raised
                        if not is_connection_failure(exception):
                            raise

        if len(reachable_storagedrivers) == 0:
            raise RuntimeError('Could not find any responsive node in the cluster')

        # Start removal: last StorageDriver means the whole vPool is deleted
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
        else:
            vp_installer.update_status(status=VPool.STATUSES.DELETING)

        # Clean up stale vDisks
        cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(storagedriver.guid))
        VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

        # Reconfigure the MDSes
        cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(storagedriver.guid))
        for vdisk_guid in storagerouter.vdisks_guids:
            try:
                MDSServiceController.ensure_safety(
                    vdisk_guid=vdisk_guid,
                    excluded_storagerouter_guids=[storagerouter.guid] + offline_storage_router_guids)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'
                    .format(storagedriver.guid, vdisk_guid))

        # Validate that all MDSes on current StorageRouter have been moved away
        # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
        vdisks = []
        for mds in vp_installer.mds_services:
            for junction in mds.vdisks:
                vdisk = junction.vdisk
                if vdisk in vdisks:
                    continue
                vdisks.append(vdisk)
                cls._logger.critical(
                    'StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'
                    .format(storagedriver.guid, vdisk.guid, vdisk.name))
        if len(vdisks) > 0:
            # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
            raise RuntimeError('Not all MDS Services have been successfully migrated away')

        # Start with actual removal
        # NOTE(review): `errors_found &= x` with errors_found starting as False
        # can never become True through these calls (False & x == False); only
        # the explicit `errors_found = True` assignments below take effect.
        # Looks like `|=` (with inverted success flags) may have been intended —
        # confirm against the helpers' return-value contract before changing.
        errors_found = False
        if storagerouter not in offline_srs:
            errors_found &= sd_installer.stop_services()

        errors_found &= vp_installer.configure_cluster_registry(
            exclude=[storagedriver], apply_on=reachable_storagedrivers)
        errors_found &= vp_installer.update_node_distance_map()
        errors_found &= vp_installer.remove_mds_services()
        errors_found &= sd_installer.clean_config_management()
        errors_found &= sd_installer.clean_model()

        if storagerouter not in offline_srs:
            errors_found &= sd_installer.clean_directories(
                mountpoints=StorageRouterController.get_mountpoints(client=sr_installer.root_client))
            try:
                DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - Synchronizing disks with reality failed'
                    .format(storagedriver.guid))
                errors_found = True

        if vp_installer.storagedriver_amount > 1:
            # Update the vPool metadata and run DTL checkup
            vp_installer.vpool.metadata['caching_info'].pop(sr_installer.storagerouter.guid, None)
            vp_installer.vpool.save()
            try:
                VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid,
                                            ensure_single_timeout=600)
            except Exception:
                cls._logger.exception(
                    'StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'
                    .format(storagedriver.guid, vp_installer.name, vp_installer.vpool.guid))
        else:
            cls._logger.info('StorageDriver {0} - Removing vPool from model'.format(
                storagedriver.guid))
            # Clean up model
            try:
                vp_installer.vpool.delete()
            except Exception:
                errors_found = True
                cls._logger.exception(
                    'StorageDriver {0} - Cleaning up vPool from the model failed'
                    .format(storagedriver.guid))
            Configuration.delete('/ovs/vpools/{0}'.format(vp_installer.vpool.guid))

        cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(storagedriver.guid))
        try:
            MDSServiceController.mds_checkup()
        except Exception:
            cls._logger.exception('StorageDriver {0} - MDS checkup failed'.format(
                storagedriver.guid))

        # Update vPool status
        if errors_found is True:
            if vp_installer.storagedriver_amount > 1:
                vp_installer.update_status(status=VPool.STATUSES.FAILURE)
            raise RuntimeError(
                '1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information')

        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        cls._logger.info('StorageDriver {0} - Deleted StorageDriver {1}'.format(
            storagedriver.guid, storagedriver.name))

        # If no vPools remain, tear down the (internal) voldrv Arakoon cluster
        if len(VPoolList.get_vpools()) == 0:
            cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
            if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                    cluster_name=cluster_name)['internal'] is True:
                cls._logger.debug('StorageDriver {0} - Removing Arakoon cluster {1}'.format(
                    storagedriver.guid, cluster_name))
                try:
                    installer = ArakoonInstaller(cluster_name=cluster_name)
                    installer.load()
                    installer.delete_cluster()
                except Exception:
                    cls._logger.exception(
                        'StorageDriver {0} - Delete voldrv Arakoon cluster failed'
                        .format(storagedriver.guid))
                service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
                service_name = ArakoonInstaller.get_service_name_for_cluster(
                    cluster_name=cluster_name)
                for service in list(service_type.services):
                    if service.name == service_name:
                        service.delete()

        # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
        if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
            try:
                if cls._service_manager.has_service(ServiceFactory.SERVICE_WATCHER_VOLDRV,
                                                    client=sr_installer.root_client):
                    cls._service_manager.stop_service(ServiceFactory.SERVICE_WATCHER_VOLDRV,
                                                      client=sr_installer.root_client)
                    cls._service_manager.remove_service(ServiceFactory.SERVICE_WATCHER_VOLDRV,
                                                        client=sr_installer.root_client)
            except Exception:
                cls._logger.exception('StorageDriver {0} - {1} service deletion failed'.format(
                    storagedriver.guid, ServiceFactory.SERVICE_WATCHER_VOLDRV))

    @staticmethod
    @ovs_task(name='ovs.vpool.up_and_running')
    @log('VOLUMEDRIVER_TASK')
    def up_and_running(storagedriver_id):
        """
        Volumedriver informs us that the service is completely started. Post-start events can be executed
        :param storagedriver_id: ID of the storagedriver
        :type storagedriver_id: str
        :return: None
        :rtype: NoneType
        """
        storagedriver = StorageDriverList.get_by_storagedriver_id(storagedriver_id)
        if storagedriver is None:
            raise RuntimeError('A Storage Driver with id {0} could not be found.'.format(
                storagedriver_id))
        # Track how many times this StorageDriver has (re)started
        storagedriver.startup_counter += 1
        storagedriver.save()

    # noinspection PyTypeChecker
    @staticmethod
    @ovs_task(name='ovs.storagerouter.create_hprm_config_files')
    def create_hprm_config_files(vpool_guid, local_storagerouter_guid, parameters):
        """
        Create the required configuration files to be able to make use of HPRM (aka PRACC)
        This configuration will be zipped and made available for download
        :param vpool_guid: The guid of the VPool for which a HPRM manager needs to be deployed
        :type vpool_guid: str
        :param local_storagerouter_guid: The guid of the StorageRouter the API was requested on
        :type local_storagerouter_guid: str
        :param parameters: Additional information required for the HPRM configuration files
        :type parameters: dict
        :return: Name of the zipfile containing the configuration files
        :rtype: str
        """
        # Validations
        required_params = {'port': (int, {'min': 1, 'max': 65535}),
                           'identifier': (str, ExtensionsToolbox.regex_vpool)}
        ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                                 required_params=required_params)

        vpool = VPool(vpool_guid)
        identifier = parameters['identifier']
        config_path = None
        local_storagerouter = StorageRouter(local_storagerouter_guid)
        # NOTE(review): config_path is overwritten on each iteration, so the
        # proxy config of the LAST StorageDriver wins — confirm this is intended
        for sd in vpool.storagedrivers:
            if len(sd.alba_proxies) == 0:
                raise ValueError(
                    'No ALBA proxies configured for vPool {0} on StorageRouter {1}'
                    .format(vpool.name, sd.storagerouter.name))
            config_path = '/ovs/vpools/{0}/proxies/{1}/config/{{0}}'.format(
                vpool.guid, sd.alba_proxies[0].guid)

        if config_path is None:
            raise ValueError('vPool {0} has not been extended any StorageRouter'.format(
                vpool.name))
        proxy_cfg = Configuration.get(key=config_path.format('main'))

        cache_info = {}   # cache type -> proxy cache configuration fragment
        arakoons = {}     # cache type -> Arakoon INI contents to ship in the archive
        cache_types = VPool.CACHES.values()
        if not any(ctype in parameters for ctype in cache_types):
            raise ValueError('At least one cache type should be passed: {0}'.format(
                ', '.join(cache_types)))
        for ctype in cache_types:
            if ctype not in parameters:
                continue
            required_dict = {'read': (bool, None), 'write': (bool, None)}
            required_params.update({ctype: (dict, required_dict)})
            ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                                     required_params=required_params)

            read = parameters[ctype]['read']
            write = parameters[ctype]['write']
            if read is False and write is False:
                # Caching fully disabled for this cache type
                cache_info[ctype] = ['none']
                continue

            path = parameters[ctype].get('path')
            if path is not None:
                # Local (filesystem) cache backend
                path = path.strip()
                if not path or path.endswith('/.') or '..' in path or '/./' in path:
                    raise ValueError('Invalid path specified')
                required_dict.update({'path': (str, None),
                                      'size': (int, {'min': 1, 'max': 10 * 1024})})
                ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                                         required_params=required_params)
                while '//' in path:
                    path = path.replace('//', '/')
                cache_info[ctype] = ['local', {'path': path,
                                               'max_size': parameters[ctype]['size'] * 1024 ** 3,  # GiB -> bytes
                                               'cache_on_read': read,
                                               'cache_on_write': write}]
            else:
                # ALBA Backend cache (local or remote accelerated Backend)
                required_dict.update(
                    {'backend_info': (dict, {'preset': (str, ExtensionsToolbox.regex_preset),
                                             'alba_backend_guid': (str, ExtensionsToolbox.regex_guid),
                                             'alba_backend_name': (str, ExtensionsToolbox.regex_backend)}),
                     'connection_info': (dict, {'host': (str, ExtensionsToolbox.regex_ip, False),
                                                'port': (int, {'min': 1, 'max': 65535}, False),
                                                'client_id': (str, ExtensionsToolbox.regex_guid, False),
                                                'client_secret': (str, None, False)})})
                ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                                         required_params=required_params)

                connection_info = parameters[ctype]['connection_info']
                if connection_info['host']:  # Remote Backend for accelerated Backend
                    alba_backend_guid = parameters[ctype]['backend_info']['alba_backend_guid']
                    ovs_client = OVSClient.get_instance(connection_info=connection_info)
                    arakoon_config = VPoolShared.retrieve_alba_arakoon_config(
                        alba_backend_guid=alba_backend_guid, ovs_client=ovs_client)
                    arakoons[ctype] = ArakoonClusterConfig.convert_config_to(
                        arakoon_config, return_type='INI')
                else:  # Local Backend for accelerated Backend
                    alba_backend_name = parameters[ctype]['backend_info']['alba_backend_name']
                    if Configuration.exists(key='/ovs/arakoon/{0}-abm/config'.format(alba_backend_name),
                                            raw=True) is False:
                        raise ValueError(
                            'Arakoon cluster for ALBA Backend {0} could not be retrieved'
                            .format(alba_backend_name))
                    arakoons[ctype] = Configuration.get(
                        key='/ovs/arakoon/{0}-abm/config'.format(alba_backend_name),
                        raw=True)
                cache_info[ctype] = [
                    'alba',
                    {'albamgr_cfg_url': '/etc/hprm/{0}/{1}_cache_arakoon.ini'.format(identifier, ctype),
                     'bucket_strategy': ['1-to-1', {'prefix': vpool.guid,
                                                    'preset': parameters[ctype]['backend_info']['preset']}],
                     'manifest_cache_size': proxy_cfg['manifest_cache_size'],
                     'cache_on_read': read,
                     'cache_on_write': write}]

        tgz_name = 'hprm_config_files_{0}_{1}.tgz'.format(identifier, vpool.name)
        # Base HPRM proxy configuration; cache sections are merged in below
        config = {'ips': ['127.0.0.1'],
                  'port': parameters['port'],
                  'pracc': {'uds_path': '/var/run/hprm/{0}/uds_path'.format(identifier),
                            'max_clients': 1000,
                            'max_read_buf_size': 64 * 1024,  # Buffer size for incoming requests (in bytes)
                            'thread_pool_size': 64},  # Amount of threads
                  'transport': 'tcp',
                  'log_level': 'info',
                  'read_preference': proxy_cfg['read_preference'],
                  'albamgr_cfg_url': '/etc/hprm/{0}/arakoon.ini'.format(identifier),
                  'manifest_cache_size': proxy_cfg['manifest_cache_size']}
        file_contents_map = {}
        for ctype in cache_types:
            if ctype in cache_info:
                config['{0}_cache'.format(ctype)] = cache_info[ctype]
            if ctype in arakoons:
                file_contents_map['/opt/OpenvStorage/config/{0}/{1}_cache_arakoon.ini'.format(
                    identifier, ctype)] = arakoons[ctype]

        file_contents_map.update({
            '/opt/OpenvStorage/config/{0}/config.json'.format(identifier):
                json.dumps(config, indent=4),
            '/opt/OpenvStorage/config/{0}/arakoon.ini'.format(identifier):
                Configuration.get(key=config_path.format('abm'), raw=True)})

        # Write the files locally, tar them into the downloads folder and clean up
        local_client = SSHClient(endpoint=local_storagerouter)
        local_client.dir_create(directories='/opt/OpenvStorage/config/{0}'.format(identifier))
        local_client.dir_create(directories='/opt/OpenvStorage/webapps/frontend/downloads')
        for file_name, contents in file_contents_map.iteritems():
            local_client.file_write(contents=contents, filename=file_name)
        local_client.run(command=[
            'tar', '--transform', 's#^config/{0}#{0}#'.format(identifier),
            '-czf', '/opt/OpenvStorage/webapps/frontend/downloads/{0}'.format(tgz_name),
            'config/{0}'.format(identifier)])
        local_client.dir_delete(directories='/opt/OpenvStorage/config/{0}'.format(identifier))
        return tgz_name

    @staticmethod
    def retrieve_alba_arakoon_config(alba_backend_guid, ovs_client):
        """
        Retrieve the ALBA Arakoon configuration
        WARNING: YOU DO NOT BELONG HERE, PLEASE MOVE TO YOUR OWN PLUGIN
        :param alba_backend_guid: Guid of the ALBA Backend
        :type alba_backend_guid: str
        :param ovs_client: OVS client object
        :type ovs_client: OVSClient
        :return: Arakoon configuration information
        :rtype: dict
        """
        # The remote API call returns a task id; wait (max 300s) for its result
        task_id = ovs_client.get('/alba/backends/{0}/get_config_metadata'.format(
            alba_backend_guid))
        successful, arakoon_config = ovs_client.wait_for_task(task_id, timeout=300)
        if successful is False:
            raise RuntimeError('Could not load metadata from environment {0}'.format(
                ovs_client.ip))
        return arakoon_config
def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed.
    This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...
    :return: None
    :rtype: NoneType
    """
    MigrationController._logger.info('Preparing out of band migrations...')

    # Imports are local: this migration code must only pull in these modules when actually run
    from ovs.dal.lists.storagedriverlist import StorageDriverList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.vpoollist import VPoolList
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient
    from ovs_extensions.generic.toolbox import ExtensionsToolbox
    from ovs_extensions.services.interfaces.systemd import Systemd
    from ovs.extensions.services.servicefactory import ServiceFactory
    from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
    from ovs.lib.generic import GenericController

    MigrationController._logger.info('Start out of band migrations...')
    service_manager = ServiceFactory.get_manager()

    # Build one root SSH client per StorageRouter up front and re-use it in every migration step below
    sr_client_map = {}
    for storagerouter in StorageRouterList.get_storagerouters():
        sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter,
                                                      username='******')

    #########################################################
    # Addition of 'ExecReload' for AlbaProxy SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagedriver in StorageDriverList.get_storagedrivers():
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                service = alba_proxy.service
                service_name = 'ovs-{0}'.format(service.name)
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                # Skip units that already carry an ExecReload= directive
                if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    # Bare except appears deliberate (best-effort migration): one broken service must not abort the run
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        # Only reload the systemd daemon on nodes where a unit file was actually rewritten
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    ##################################################################
    # Adjustment of open file descriptors for Arakoon services to 8192
    changed_clients = set()
    for storagerouter in StorageRouterList.get_storagerouters():
        root_client = sr_client_map[storagerouter.guid]
        for service_name in service_manager.list_services(client=root_client):
            if not service_name.startswith('ovs-arakoon-'):
                continue
            # Unit file location and expected limit line differ between systemd and upstart
            if ServiceFactory.get_service_type() == 'systemd':
                path = '/lib/systemd/system/{0}.service'.format(service_name)
                check = 'LimitNOFILE=8192'
            else:
                path = '/etc/init/{0}.conf'.format(service_name)
                check = 'limit nofile 8192 8192'
            if not root_client.file_exists(path):
                continue
            if check in root_client.file_read(path):
                continue
            try:
                service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                changed_clients.add(root_client)
                ExtensionsToolbox.edit_version_file(client=root_client, package_name='arakoon', old_service_name=service_name)
            except:
                # Best-effort: log and continue with the remaining services
                MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
    for root_client in changed_clients:
        root_client.run(['systemctl', 'daemon-reload'])

    #############################
    # Migrate to multiple proxies
    for storagedriver in StorageDriverList.get_storagedrivers():
        vpool = storagedriver.vpool
        root_client = sr_client_map[storagedriver.storagerouter_guid]
        for alba_proxy in storagedriver.alba_proxies:
            # Rename alba_proxy service in model: 'albaproxy_<vpool>' becomes 'albaproxy_<vpool>_0'
            service = alba_proxy.service
            old_service_name = 'albaproxy_{0}'.format(vpool.name)
            new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
            if old_service_name != service.name:
                continue
            service.name = new_service_name
            service.save()

            if not service_manager.has_service(name=old_service_name, client=root_client):
                continue
            old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
            if not Configuration.exists(key=old_configuration_key):
                continue

            # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='alba',
                                                old_service_name=old_service_name,
                                                new_service_name=new_service_name)

            # Register new service and remove old service
            service_manager.add_service(name='ovs-albaproxy',
                                        client=root_client,
                                        params=Configuration.get(old_configuration_key),
                                        target_name='ovs-{0}'.format(new_service_name))

            # Update scrub proxy config
            proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
            proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
            if proxy_config is not None:
                fragment_cache = proxy_config.get('fragment_cache', ['none', {}])
                if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                    # Scrub proxies re-use the fragment cache settings, but must never cache on read
                    fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                    fragment_cache_scrub_info[1]['cache_on_read'] = False
                    proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                    proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                    if proxy_scrub_config is not None and proxy_scrub_config['fragment_cache'] == ['none']:
                        proxy_scrub_config['fragment_cache'] = fragment_cache_scrub_info
                        Configuration.set(proxy_scrub_config_key,
                                          json.dumps(proxy_scrub_config, indent=4),
                                          raw=True)

        # Update 'backend_connection_manager' section
        changes = False
        storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.guid, storagedriver.storagedriver_id)
        storagedriver_config.load()
        if 'backend_connection_manager' not in storagedriver_config.configuration:
            continue

        current_config = storagedriver_config.configuration['backend_connection_manager']
        if current_config.get('backend_type') != 'MULTI':
            # Convert a single-proxy config into the MULTI layout: one numbered sub-section per proxy
            changes = True
            backend_connection_manager = {'backend_type': 'MULTI'}
            for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                # Hoist 'backend_interface*' keys to the top level; on Python 2 .items() returns a
                # list, so deleting from the dict while looping is safe here
                # noinspection PyUnresolvedReferences
                for key, value in backend_connection_manager[str(index)].items():
                    if key.startswith('backend_interface'):
                        backend_connection_manager[key] = value
                        # noinspection PyUnresolvedReferences
                        del backend_connection_manager[str(index)][key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    backend_connection_manager[key] = value
        else:
            # Already MULTI: only hoist stray 'backend_interface*' keys out of the numbered sub-sections
            backend_connection_manager = current_config
            for value in backend_connection_manager.values():
                if isinstance(value, dict):
                    for key, val in value.items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = val
                            changes = True
                            del value[key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    changes = True
                    backend_connection_manager[key] = value

        if changes is True:
            storagedriver_config.clear_backend_connection_manager()
            storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
            storagedriver_config.save(root_client)

            # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='volumedriver',
                                                old_service_name='volumedriver_{0}'.format(vpool.name))
            if service_manager.ImplementationClass == Systemd:
                root_client.run(['systemctl', 'daemon-reload'])

    ########################################
    # Update metadata_store_bits information
    for vpool in VPoolList.get_vpools():
        bits = None
        for storagedriver in vpool.storagedrivers:
            key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
            if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                if bits is None:
                    # Extract the value once from the first service file that has it; fall back to 5
                    entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                        client=sr_client_map[storagedriver.storagerouter_guid],
                                                                        entries=['METADATASTORE_BITS='])
                    if len(entries) == 1:
                        bits = entries[0].split('=')[-1]
                        bits = int(bits) if bits.isdigit() else 5
                if bits is not None:
                    try:
                        content = Configuration.get(key=key)
                        content['METADATASTORE_BITS'] = bits
                        Configuration.set(key=key, value=content)
                    except:
                        # Best-effort: log and continue with the remaining StorageDrivers
                        MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))

        if bits is not None:
            vpool.metadata_store_bits = bits
            vpool.save()

    MigrationController._logger.info('Finished out of band migrations')
    GenericController.refresh_package_information()
class StorageRouterController(object):
    """
    Contains all BLL related to StorageRouter
    """
    _logger = Logger('lib')
    _log_level = LOG_LEVEL_MAPPING[_logger.getEffectiveLevel()]  # Map the framework log level onto the storagerouterclient level
    _os_manager = OSFactory.get_manager()
    _service_manager = ServiceFactory.get_manager()

    # Executed at import time: route storagerouterclient's own logging through the framework's log path
    # noinspection PyCallByClass,PyTypeChecker
    storagerouterclient.Logger.setupLogging(
        Logger.load_path('storagerouterclient'), _log_level)
    # noinspection PyArgumentList
    storagerouterclient.Logger.enableLogging()

    @staticmethod
    @ovs_task(name='ovs.storagerouter.ping')
    def ping(storagerouter_guid, timestamp):
        """
        Update a StorageRouter's celery heartbeat
        :param storagerouter_guid: Guid of the StorageRouter to update
        :type storagerouter_guid: str
        :param timestamp: Timestamp to compare to
        :type timestamp: float
        :return: None
        :rtype: NoneType
        """
        # Mutex per StorageRouter: concurrent pings must not race on the heartbeats dict
        with volatile_mutex(
                'storagerouter_heartbeat_{0}'.format(storagerouter_guid)):
            storagerouter = StorageRouter(storagerouter_guid)
            # Only move the heartbeat forward; stale (out-of-order) pings are ignored
            if timestamp > storagerouter.heartbeats.get('celery', 0):
                storagerouter.heartbeats['celery'] = timestamp
                storagerouter.save()

    @staticmethod
    @ovs_task(name='ovs.storagerouter.get_metadata')
    def get_metadata(storagerouter_guid):
        """
        Gets physical information about the specified StorageRouter
        :param storagerouter_guid: StorageRouter guid to retrieve the metadata for
        :type storagerouter_guid: str
        :return: Metadata information about the StorageRouter (partition info, IP addresses and cluster-wide scrub availability)
        :rtype: dict
        """
        return {'partitions': StorageRouterController.get_partition_info(storagerouter_guid),
                'ipaddresses': StorageRouterController.get_ip_addresses(storagerouter_guid),
                'scrub_available': StorageRouterController.check_scrub_partition_present()}

    @staticmethod
    def get_ip_addresses(storagerouter_guid):
        """
        Retrieves the IP addresses of a StorageRouter
        :param storagerouter_guid: Guid of the StorageRouter
        :type storagerouter_guid: str
        :return: List of IP addresses
        :rtype: list
        """
        client = SSHClient(endpoint=StorageRouter(storagerouter_guid))
        return StorageRouterController._os_manager.get_ip_addresses(client=client)
@staticmethod def get_partition_info(storagerouter_guid): """ Retrieves information about the partitions of a Storagerouter :param storagerouter_guid: Guid of the Storagerouter :type storagerouter_guid: str :return: dict with information about the partitions :rtype: dict """ storagerouter = StorageRouter(storagerouter_guid) client = SSHClient(endpoint=storagerouter) services_mds = ServiceTypeList.get_by_name( ServiceType.SERVICE_TYPES.MD_SERVER).services services_arakoon = [ service for service in ServiceTypeList.get_by_name( ServiceType.SERVICE_TYPES.ARAKOON).services if service.name != 'arakoon-ovsdb' and service.is_internal is True ] partitions = dict((role, []) for role in DiskPartition.ROLES) for disk in storagerouter.disks: for disk_partition in disk.partitions: claimed_space_by_fwk = 0 used_space_by_system = 0 available_space_by_system = 0 for storagedriver_partition in disk_partition.storagedrivers: claimed_space_by_fwk += storagedriver_partition.size if storagedriver_partition.size is not None else 0 if client.dir_exists(storagedriver_partition.path): try: used_space_by_system += int( client.run([ 'du', '-B', '1', '-d', '0', storagedriver_partition.path ], timeout=5).split('\t')[0]) except Exception as ex: StorageRouterController._logger.warning( 'Failed to get directory usage for {0}. {1}'. format(storagedriver_partition.path, ex)) if disk_partition.mountpoint is not None: for alias in disk_partition.aliases: StorageRouterController._logger.info( 'Verifying disk partition usage by checking path {0}' .format(alias)) disk_partition_device = client.file_read_link( path=alias) try: available_space_by_system = int( client.run([ 'df', '-B', '1', '--output=avail', disk_partition_device ], timeout=5).splitlines()[-1]) break except Exception as ex: StorageRouterController._logger.warning( 'Failed to get partition usage for {0}. {1}'. 
format(disk_partition.mountpoint, ex)) for role in disk_partition.roles: size = 0 if disk_partition.size is None else disk_partition.size if available_space_by_system > 0: # Take available space reported by df then add back used by roles so that the only used space reported is space not managed by us available = available_space_by_system + used_space_by_system - claimed_space_by_fwk else: available = size - claimed_space_by_fwk # Subtract size for roles which have already been claimed by other vpools (but not necessarily already been fully used) in_use = any(junction for junction in disk_partition.storagedrivers if junction.role == role) if role == DiskPartition.ROLES.DB: for service in services_arakoon: if service.storagerouter_guid == storagerouter_guid: in_use = True break for service in services_mds: if service.storagerouter_guid == storagerouter_guid: in_use = True break partitions[role].append({ 'ssd': disk.is_ssd, 'guid': disk_partition.guid, 'size': size, 'in_use': in_use, 'usable': True, # Sizes smaller than 1GiB and smaller than 5% of largest WRITE partition will be un-usable 'available': available if available > 0 else 0, 'mountpoint': disk_partition. 
folder, # Equals to mount point unless mount point is root ('/'), then we pre-pend mount point with '/mnt/storage' 'storagerouter_guid': storagerouter_guid }) # Strip out WRITE caches which are smaller than 5% of largest write cache size and smaller than 1GiB writecache_sizes = [] for partition_info in partitions[DiskPartition.ROLES.WRITE]: writecache_sizes.append(partition_info['available']) largest_write_cache = max( writecache_sizes) if len(writecache_sizes) > 0 else 0 for index, size in enumerate(writecache_sizes): if size < largest_write_cache * 5 / 100 or size < 1024**3: partitions[DiskPartition.ROLES.WRITE][index]['usable'] = False return partitions @staticmethod @ovs_task(name='ovs.storagerouter.get_version_info') def get_version_info(storagerouter_guid): """ Returns version information regarding a given StorageRouter :param storagerouter_guid: StorageRouter guid to get version information for :type storagerouter_guid: str :return: Version information :rtype: dict """ package_manager = PackageFactory.get_manager() client = SSHClient(StorageRouter(storagerouter_guid)) return { 'storagerouter_guid': storagerouter_guid, 'versions': dict((pkg_name, str(version)) for pkg_name, version in package_manager.get_installed_versions(client).iteritems()) } @staticmethod @ovs_task(name='ovs.storagerouter.get_support_info') def get_support_info(): """ Returns support information for the entire cluster :return: Support information :rtype: dict """ celery_scheduling = Configuration.get( key='/ovs/framework/scheduling/celery', default={}) stats_monkey_enabled = any( celery_scheduling.get(key) is not None for key in ['ovs.stats_monkey.run_all', 'alba.stats_monkey.run_all']) return { 'cluster_id': Configuration.get(key='/ovs/framework/cluster_id'), 'stats_monkey': stats_monkey_enabled, 'support_agent': Configuration.get(key='/ovs/framework/support|support_agent'), 'remote_access': Configuration.get(key='ovs/framework/support|remote_access'), 'stats_monkey_config': 
Configuration.get(key='ovs/framework/monitoring/stats_monkey', default={}) } @staticmethod @ovs_task(name='ovs.storagerouter.get_support_metadata') def get_support_metadata(): """ Returns support metadata for a given StorageRouter. This should be a routed task! :return: Metadata of the StorageRouter :rtype: dict """ return SupportAgent().get_heartbeat_data() @staticmethod @ovs_task(name='ovs.storagerouter.get_logfiles') def get_logfiles(local_storagerouter_guid): """ Collects logs, moves them to a web-accessible location and returns log tgz's filename :param local_storagerouter_guid: StorageRouter guid to retrieve log files on :type local_storagerouter_guid: str :return: Name of tgz containing the logs :rtype: str """ this_storagerouter = System.get_my_storagerouter() this_client = SSHClient(this_storagerouter, username='******') logfile = this_client.run(['ovs', 'collect', 'logs']).strip() logfilename = logfile.split('/')[-1] storagerouter = StorageRouter(local_storagerouter_guid) webpath = '/opt/OpenvStorage/webapps/frontend/downloads' client = SSHClient(storagerouter, username='******') client.dir_create(webpath) client.file_upload('{0}/{1}'.format(webpath, logfilename), logfile) client.run(['chmod', '666', '{0}/{1}'.format(webpath, logfilename)]) return logfilename @staticmethod @ovs_task(name='ovs.storagerouter.get_proxy_config') def get_proxy_config(vpool_guid, storagerouter_guid): """ Gets the ALBA proxy for a given StorageRouter and vPool :param storagerouter_guid: Guid of the StorageRouter on which the ALBA proxy is configured :type storagerouter_guid: str :param vpool_guid: Guid of the vPool for which the proxy is configured :type vpool_guid: str :return: The ALBA proxy configuration :rtype: dict """ vpool = VPool(vpool_guid) storagerouter = StorageRouter(storagerouter_guid) for sd in vpool.storagedrivers: if sd.storagerouter_guid == storagerouter.guid: if len(sd.alba_proxies) == 0: raise ValueError( 'No ALBA proxies configured for vPool {0} on 
StorageRouter {1}' .format(vpool.name, storagerouter.name)) return Configuration.get( '/ovs/vpools/{0}/proxies/{1}/config/main'.format( vpool.guid, sd.alba_proxies[0].guid)) raise ValueError( 'vPool {0} has not been extended to StorageRouter {1}'.format( vpool.name, storagerouter.name)) @staticmethod @ovs_task(name='ovs.storagerouter.configure_support') def configure_support(support_info): """ Configures support on all StorageRouters :param support_info: Information about which components should be configured {'stats_monkey': True, # Enable/disable the stats monkey scheduled task 'support_agent': True, # Responsible for enabling the ovs-support-agent service, which collects heart beat data 'remote_access': False, # Cannot be True when support agent is False. Is responsible for opening an OpenVPN tunnel to allow for remote access 'stats_monkey_config': {}} # Dict with information on how to configure the stats monkey (Only required when enabling the stats monkey :type support_info: dict :return: None :rtype: NoneType """ ExtensionsToolbox.verify_required_params(actual_params=support_info, required_params={ 'stats_monkey': (bool, None, False), 'remote_access': (bool, None, False), 'support_agent': (bool, None, False), 'stats_monkey_config': (dict, None, False) }) # All settings are optional, so if nothing is specified, no need to change anything if len(support_info) == 0: StorageRouterController._logger.warning( 'Configure support called without any specific settings. 
Doing nothing' ) return # Collect information support_agent_key = '/ovs/framework/support|support_agent' support_agent_new = support_info.get('support_agent') support_agent_old = Configuration.get(key=support_agent_key) support_agent_change = support_agent_new is not None and support_agent_old != support_agent_new remote_access_key = '/ovs/framework/support|remote_access' remote_access_new = support_info.get('remote_access') remote_access_old = Configuration.get(key=remote_access_key) remote_access_change = remote_access_new is not None and remote_access_old != remote_access_new stats_monkey_celery_key = '/ovs/framework/scheduling/celery' stats_monkey_config_key = '/ovs/framework/monitoring/stats_monkey' stats_monkey_new_config = support_info.get('stats_monkey_config') stats_monkey_old_config = Configuration.get( key=stats_monkey_config_key, default={}) stats_monkey_celery_config = Configuration.get( key=stats_monkey_celery_key, default={}) stats_monkey_new = support_info.get('stats_monkey') stats_monkey_old = stats_monkey_celery_config.get( 'ovs.stats_monkey.run_all' ) is not None or stats_monkey_celery_config.get( 'alba.stats_monkey.run_all') is not None stats_monkey_change = stats_monkey_new is not None and ( stats_monkey_old != stats_monkey_new or stats_monkey_new_config != stats_monkey_old_config) # Make sure support agent is enabled when trying to enable remote access if remote_access_new is True: if support_agent_new is False or (support_agent_new is None and support_agent_old is False): raise RuntimeError( 'Remote access cannot be enabled without the heart beat enabled' ) # Collect root_client information root_clients = {} for storagerouter in StorageRouterList.get_storagerouters(): try: root_clients[storagerouter] = SSHClient(endpoint=storagerouter, username='******') except UnableToConnectException: raise RuntimeError('Not all StorageRouters are reachable') if stats_monkey_new is True: ExtensionsToolbox.verify_required_params( 
actual_params=stats_monkey_new_config, required_params={ 'host': (str, ExtensionsToolbox.regex_ip), 'port': (int, { 'min': 1, 'max': 65535 }), 'database': (str, None), 'interval': (int, { 'min': 1, 'max': 86400 }), 'transport': (str, ['influxdb', 'redis', 'graphite']), 'environment': (str, None) }) if stats_monkey_new_config['transport'] in ['influxdb', 'reddis']: ExtensionsToolbox.verify_required_params( actual_params=stats_monkey_new_config, required_params={'password': (str, None)}) if stats_monkey_new_config['transport'] == 'influxdb': ExtensionsToolbox.verify_required_params( actual_params=stats_monkey_new_config, required_params={'username': (str, None)}) # Configure remote access if remote_access_change is True: Configuration.set(key=remote_access_key, value=remote_access_new) cid = Configuration.get('/ovs/framework/cluster_id').replace( r"'", r"'\''") for storagerouter, root_client in root_clients.iteritems(): if remote_access_new is False: StorageRouterController._logger.info( 'Un-configuring remote access on StorageRouter {0}'. format(root_client.ip)) nid = storagerouter.machine_id.replace(r"'", r"'\''") service_name = 'openvpn@ovs_{0}-{1}'.format(cid, nid) if StorageRouterController._service_manager.has_service( name=service_name, client=root_client): StorageRouterController._service_manager.stop_service( name=service_name, client=root_client) root_client.file_delete(filenames=['/etc/openvpn/ovs_*']) # Configure support agent if support_agent_change is True: service_name = 'support-agent' Configuration.set(key=support_agent_key, value=support_agent_new) for root_client in root_clients.itervalues(): if support_agent_new is True: StorageRouterController._logger.info( 'Configuring support agent on StorageRouter {0}'. 
format(root_client.ip)) if StorageRouterController._service_manager.has_service( name=service_name, client=root_client) is False: StorageRouterController._service_manager.add_service( name=service_name, client=root_client) StorageRouterController._service_manager.restart_service( name=service_name, client=root_client) else: StorageRouterController._logger.info( 'Un-configuring support agent on StorageRouter {0}'. format(root_client.ip)) if StorageRouterController._service_manager.has_service( name=service_name, client=root_client): StorageRouterController._service_manager.stop_service( name=service_name, client=root_client) StorageRouterController._service_manager.remove_service( name=service_name, client=root_client) # Configure stats monkey if stats_monkey_change is True: # 2 keys matter here: # - /ovs/framework/scheduling/celery --> used to check whether the stats monkey is disabled or not # - /ovs/framework/monitoring/stats_monkey --> contains the actual configuration parameters when enabling the stats monkey, such as host, port, username, ... 
service_name = 'scheduled-tasks' if stats_monkey_new is True: # Enable the scheduled task by removing the key StorageRouterController._logger.info( 'Configuring stats monkey') interval = stats_monkey_new_config['interval'] # The scheduled task cannot be configured to run more than once a minute, so for intervals < 60, the stats monkey task handles this itself StorageRouterController._logger.debug( 'Requested interval to run at: {0}'.format(interval)) Configuration.set(key=stats_monkey_config_key, value=stats_monkey_new_config) if interval > 0: days, hours, minutes, _ = ExtensionsToolbox.convert_to_days_hours_minutes_seconds( seconds=interval) if days == 1: # Max interval is 24 * 60 * 60, so once every day at 3 AM schedule = {'hour': '3'} elif hours > 0: schedule = {'hour': '*/{0}'.format(hours)} else: schedule = {'minute': '*/{0}'.format(minutes)} stats_monkey_celery_config[ 'ovs.stats_monkey.run_all'] = schedule stats_monkey_celery_config[ 'alba.stats_monkey.run_all'] = schedule StorageRouterController._logger.debug( 'Configured schedule is: {0}'.format(schedule)) else: stats_monkey_celery_config.pop('ovs.stats_monkey.run_all', None) stats_monkey_celery_config.pop('alba.stats_monkey.run_all', None) else: # Disable the scheduled task by setting the values for the celery tasks to None StorageRouterController._logger.info( 'Un-configuring stats monkey') stats_monkey_celery_config['ovs.stats_monkey.run_all'] = None stats_monkey_celery_config['alba.stats_monkey.run_all'] = None Configuration.set(key=stats_monkey_celery_key, value=stats_monkey_celery_config) for storagerouter in StorageRouterList.get_masters(): root_client = root_clients[storagerouter] StorageRouterController._logger.debug( 'Restarting ovs-scheduled-tasks service on node with IP {0}' .format(root_client.ip)) StorageRouterController._service_manager.restart_service( name=service_name, client=root_client) @staticmethod @ovs_task(name='ovs.storagerouter.mountpoint_exists') def mountpoint_exists(name, 
storagerouter_guid): """ Checks whether a given mount point for a vPool exists :param name: Name of the mount point to check :type name: str :param storagerouter_guid: Guid of the StorageRouter on which to check for mount point existence :type storagerouter_guid: str :return: True if mount point not in use else False :rtype: bool """ client = SSHClient(StorageRouter(storagerouter_guid)) return client.dir_exists(directory='/mnt/{0}'.format(name)) @staticmethod @ovs_task(name='ovs.storagerouter.refresh_hardware') def refresh_hardware(storagerouter_guid): """ Refreshes all hardware related information :param storagerouter_guid: Guid of the StorageRouter to refresh the hardware on :type storagerouter_guid: str :return: None :rtype: NoneType """ StorageRouterController.set_rdma_capability(storagerouter_guid) DiskController.sync_with_reality(storagerouter_guid) @staticmethod def set_rdma_capability(storagerouter_guid): """ Check if the StorageRouter has been reconfigured to be able to support RDMA :param storagerouter_guid: Guid of the StorageRouter to check and set :type storagerouter_guid: str :return: None :rtype: NoneType """ storagerouter = StorageRouter(storagerouter_guid) client = SSHClient(storagerouter, username='******') rdma_capable = False with remote(client.ip, [os], username='******') as rem: for root, dirs, files in rem.os.walk('/sys/class/infiniband'): for directory in dirs: ports_dir = '/'.join([root, directory, 'ports']) if not rem.os.path.exists(ports_dir): continue for sub_root, sub_dirs, _ in rem.os.walk(ports_dir): if sub_root != ports_dir: continue for sub_directory in sub_dirs: state_file = '/'.join( [sub_root, sub_directory, 'state']) if rem.os.path.exists(state_file): if 'ACTIVE' in client.run(['cat', state_file]): rdma_capable = True storagerouter.rdma_capable = rdma_capable storagerouter.save() @staticmethod @ovs_task(name='ovs.storagerouter.configure_disk', ensure_single_info={ 'mode': 'CHAINED', 'global_timeout': 1800 }) def 
configure_disk(storagerouter_guid, disk_guid, partition_guid, offset, size, roles):
    """
    Configures a partition: optionally creates the partition, puts a filesystem on it,
    mounts it (with an fstab entry) and finally assigns the requested roles to it
    :param storagerouter_guid: Guid of the StorageRouter to configure a disk on
    :type storagerouter_guid: str
    :param disk_guid: Guid of the disk to configure
    :type disk_guid: str
    :param partition_guid: Guid of the partition on the disk (None to create a new partition)
    :type partition_guid: str
    :param offset: Offset for the partition
    :type offset: int
    :param size: Size of the partition
    :type size: int
    :param roles: Roles assigned to the partition
    :type roles: list
    :raises RuntimeError: on invalid roles, disk/partition mismatches or when in-use roles would be removed
    :raises RoleDuplicationException: when a DB/DTL role is already present on another disk
    :return: None
    :rtype: NoneType
    """
    # Validations
    storagerouter = StorageRouter(storagerouter_guid)
    # BACKEND roles are managed elsewhere and may not be assigned through this call
    for role in roles:
        if role not in DiskPartition.ROLES or role == DiskPartition.ROLES.BACKEND:
            raise RuntimeError('Invalid role specified: {0}'.format(role))
    disk = Disk(disk_guid)
    if disk.storagerouter_guid != storagerouter_guid:
        raise RuntimeError(
            'The given Disk is not on the given StorageRouter')
    for partition in disk.partitions:
        if DiskPartition.ROLES.BACKEND in partition.roles:
            raise RuntimeError('The given Disk is in use by a Backend')

    if len({DiskPartition.ROLES.DB, DiskPartition.ROLES.DTL}.intersection(
            set(roles))) > 0:
        roles_on_sr = StorageRouterController._get_roles_on_storagerouter(
            storagerouter.ip)
        for role in [DiskPartition.ROLES.DB, DiskPartition.ROLES.DTL]:
            # DB and DTL roles may live on at most one disk per StorageRouter:
            # reject the request if the role already exists on a different disk
            if role in roles_on_sr and role in roles and roles_on_sr[role][
                    0] != disk.name:
                raise RoleDuplicationException(
                    'Disk {0} cannot have the {1} role due to presence on disk {2}'
                    .format(disk.name, role, roles_on_sr[role][0]))

    # Create partition
    if partition_guid is None:
        StorageRouterController._logger.debug(
            'Creating new partition - Offset: {0} bytes - Size: {1} bytes - Roles: {2}'
            .format(offset, size, roles))
        with remote(storagerouter.ip, [DiskTools], username='******') as rem:
            if len(disk.aliases) == 0:
                raise ValueError(
                    'Disk {0} does not have any aliases'.format(disk.name))
            rem.DiskTools.create_partition(disk_alias=disk.aliases[0],
                                           disk_size=disk.size,
                                           partition_start=offset,
                                           partition_size=size)
        # Re-scan the hardware so the DAL picks up the partition that was just created
        DiskController.sync_with_reality(storagerouter_guid)
        disk = Disk(disk_guid)
        end_point = offset + size
        partition = None
        # Find the new partition by interval overlap with the requested [offset, offset+size) range
        for part in disk.partitions:
            if offset < part.offset + part.size and end_point > part.offset:
                partition = part
                break
        if partition is None:
            raise RuntimeError(
                'No new partition detected on disk {0} after having created 1'
                .format(disk.name))
        StorageRouterController._logger.debug('Partition created')
    else:
        StorageRouterController._logger.debug('Using existing partition')
        partition = DiskPartition(partition_guid)
        if partition.disk_guid != disk_guid:
            raise RuntimeError(
                'The given DiskPartition is not on the given Disk')
        if partition.filesystem in [
                'swap', 'linux_raid_member', 'LVM2_member'
        ]:
            raise RuntimeError(
                "It is not allowed to assign roles on partitions of type: ['swap', 'linux_raid_member', 'LVM2_member']"
            )
        metadata = StorageRouterController.get_metadata(storagerouter_guid)
        partition_info = metadata['partitions']
        # Roles that would be removed by this call must not be in use anywhere
        removed_roles = set(partition.roles) - set(roles)
        used_roles = []
        for role in removed_roles:
            for info in partition_info[role]:
                if info['in_use'] and info['guid'] == partition.guid:
                    used_roles.append(role)
        if len(used_roles) > 0:
            raise RuntimeError(
                'Roles in use cannot be removed. Used roles: {0}'.format(
                    ', '.join(used_roles)))

    # Add filesystem
    # Note: a freshly created partition (partition_guid is None) is always formatted
    if partition.filesystem is None or partition_guid is None:
        StorageRouterController._logger.debug('Creating filesystem')
        if len(partition.aliases) == 0:
            raise ValueError(
                'Partition with offset {0} does not have any aliases'.
                format(partition.offset))
        with remote(storagerouter.ip, [DiskTools], username='******') as rem:
            rem.DiskTools.make_fs(partition_alias=partition.aliases[0])
        DiskController.sync_with_reality(storagerouter_guid)
        partition = DiskPartition(partition.guid)
        # make_fs is expected to produce ext4 or xfs; anything else means it failed
        if partition.filesystem not in ['ext4', 'xfs']:
            raise RuntimeError('Unexpected filesystem')
        StorageRouterController._logger.debug('Filesystem created')

    # Mount the partition and add to FSTab
    if partition.mountpoint is None:
        StorageRouterController._logger.debug('Configuring mount point')
        with remote(storagerouter.ip, [DiskTools], username='******') as rem:
            counter = 1
            # Probe /mnt/ssdN or /mnt/hddN until a free mount point is found
            mountpoint = '/mnt/{0}{1}'.format(
                'ssd' if disk.is_ssd else 'hdd', counter)
            while True:
                if not rem.DiskTools.mountpoint_exists(mountpoint):
                    break
                counter += 1
                mountpoint = '/mnt/{0}{1}'.format(
                    'ssd' if disk.is_ssd else 'hdd', counter)
            StorageRouterController._logger.debug(
                'Found mount point: {0}'.format(mountpoint))
            rem.DiskTools.add_fstab(partition_aliases=partition.aliases,
                                    mountpoint=mountpoint,
                                    filesystem=partition.filesystem)
            rem.DiskTools.mount(mountpoint)
        DiskController.sync_with_reality(storagerouter_guid)
        partition = DiskPartition(partition.guid)
        if partition.mountpoint != mountpoint:
            raise RuntimeError('Unexpected mount point')
        StorageRouterController._logger.debug('Mount point configured')
    partition.roles = roles
    partition.save()
    StorageRouterController._logger.debug('Partition configured')

@staticmethod
def check_scrub_partition_present():
    """
    Checks whether at least 1 scrub partition is present on any StorageRouter
    :return: True if at least 1 SCRUB role present in the cluster else False
    :rtype: bool
    """
    for storage_router in StorageRouterList.get_storagerouters():
        for disk in storage_router.disks:
            for partition in disk.partitions:
                if DiskPartition.ROLES.SCRUB in partition.roles:
                    return True
    return False

@staticmethod
def get_mountpoints(client):
    """
    Retrieve the mount points
    :param client: SSHClient to retrieve the mount points on
    :return: List of mount points
    :rtype: list[str]
    """
    mountpoints = []
    for mountpoint in client.run(['mount', '-v']).strip().splitlines():
        # 'mount -v' lines look like 'DEVICE on MOUNTPOINT type FS (...)';
        # the mount point is the 3rd whitespace-separated token
        mp = mountpoint.split(' ')[2] if len(
            mountpoint.split(' ')) > 2 else None
        # Skip system/virtual mount points and the ALBA ASD mounts; keep only user-usable ones
        if mp and not mp.startswith('/dev') and not mp.startswith(
                '/proc') and not mp.startswith(
                    '/sys') and not mp.startswith(
                        '/run') and not mp.startswith(
                            '/mnt/alba-asd') and mp != '/':
            mountpoints.append(mp)
    return mountpoints

@staticmethod
def _retrieve_alba_arakoon_config(alba_backend_guid, ovs_client):
    """
    Retrieve the ALBA Arakoon configuration
    :param alba_backend_guid: Guid of the ALBA Backend
    :type alba_backend_guid: str
    :param ovs_client: OVS client object
    :type ovs_client: OVSClient
    :raises RuntimeError: when the remote task does not complete successfully
    :return: Arakoon configuration information
    :rtype: dict
    """
    # The remote call is asynchronous: it returns a task id which must be awaited
    task_id = ovs_client.get(
        '/alba/backends/{0}/get_config_metadata'.format(alba_backend_guid))
    successful, arakoon_config = ovs_client.wait_for_task(task_id,
                                                          timeout=300)
    if successful is False:
        raise RuntimeError(
            'Could not load metadata from environment {0}'.format(
                ovs_client.ip))
    return arakoon_config

@staticmethod
def _revert_vpool_status(vpool,
                         status=VPool.STATUSES.RUNNING,
                         storagedriver=None,
                         client=None,
                         dirs_created=None):
    """
    Remove the vPool being created or revert the vPool being extended
    :param vpool: vPool to revert
    :param status: Status to set on the vPool (defaults to RUNNING)
    :param storagedriver: StorageDriver whose partitions/proxies must be cleaned up, if any
    :param client: SSHClient used to remove created directories
    :param dirs_created: Directories that were created and must be removed again
    :return: None
    :rtype: NoneType
    """
    vpool.status = status
    vpool.save()
    if status == VPool.STATUSES.RUNNING:
        # NOTE(review): dirs_created defaults to None, so len(dirs_created) raises
        # TypeError when this branch is reached without an explicit list — callers
        # presumably always pass one; TODO confirm and/or default to an empty list
        if len(dirs_created) > 0:
            try:
                client.dir_delete(directories=dirs_created)
            except Exception:
                # Best-effort cleanup: log and continue reverting
                StorageRouterController._logger.warning(
                    'Failed to clean up following directories: {0}'.format(
                        ', '.join(dirs_created)))

    if storagedriver is not None:
        for sdp in storagedriver.partitions:
            sdp.delete()
        for proxy in storagedriver.alba_proxies:
            proxy.delete()
        storagedriver.delete()
    if len(vpool.storagedrivers) == 0:
        # Last StorageDriver gone: remove the vPool and its configuration subtree
        vpool.delete()
        if Configuration.dir_exists(
                key='/ovs/vpools/{0}'.format(vpool.guid)):
            Configuration.delete(
                key='/ovs/vpools/{0}'.format(vpool.guid))

@staticmethod
def _get_roles_on_storagerouter(ip):
    """
    Builds a mapping of the roles present on the StorageRouter to the disks carrying them
    :param ip: string with ip of the storagerouter
    :return: Dict mapping each role to a list of disk names on which that role is present
    :rtype: dict
    """
    sr = StorageRouterList.get_by_ip(ip)
    roles_on_sr = {}
    for sr_disk in sr.disks:
        for partition in sr_disk.partitions:
            for part_role in partition.roles:
                if part_role not in roles_on_sr:
                    roles_on_sr[part_role] = [sr_disk.name]
                else:
                    roles_on_sr[part_role].append(sr_disk.name)
    return roles_on_sr
def validate_cluster(cluster_name='ovsdb'):
    """
    Validate if the chosen cluster is
    * deployed on all required nodes
    * running on all required nodes
    * working correctly on all required nodes
    :param cluster_name: name of an existing arakoon cluster (DEFAULT=ovsdb)
    :type cluster_name: str
    :raises AssertionError: when any of the validation steps fails
    :return: None
    """
    ArakoonValidation.LOGGER.info("Starting validating arakoon cluster")
    master_storagerouters = [
        storagerouter.ip
        for storagerouter in StorageRouterList.get_masters()
    ]
    # An Arakoon cluster needs at least 2 master nodes to be meaningful to validate
    assert len(master_storagerouters
               ) >= 2, 'Environment has only `{0}` node(s)'.format(
                   len(master_storagerouters))

    master_storagerouters.sort()
    arakoon_service_name = "ovs-arakoon-{0}".format(cluster_name)
    service_manager = ServiceFactory.get_manager()
    for storagerouter_ip in master_storagerouters:
        client = SSHClient(storagerouter_ip, username='******')

        # check if service file is available
        ArakoonValidation.LOGGER.info(
            "Validating if cluster service `{0}` is available on node `{1}`"
            .format(cluster_name, storagerouter_ip))
        assert service_manager.has_service(arakoon_service_name, client), "Service file of `{0}` does not exists on storagerouter `{1}`"\
            .format(cluster_name, storagerouter_ip)
        # check if service is running on system
        ArakoonValidation.LOGGER.info(
            "Validating if cluster service `{0}` is running on node `{1}`".
            format(cluster_name, storagerouter_ip))
        assert service_manager.get_service_status(arakoon_service_name, client) == 'active', \
            "Service of `{0}` is not running on storagerouter `{1}`".format(cluster_name, storagerouter_ip)

    # perform nop, get and set on cluster with a unique throwaway key
    key = 'integration-tests-{0}'.format(str(uuid.uuid4()))
    value = str(time.time())
    ArakoonValidation.LOGGER.info(
        "Validating if cluster `{0}` works".format(cluster_name))
    # determine if there is a healthy cluster
    configuration = Configuration.get(
        '/ovs/arakoon/{0}/config'.format(cluster_name), raw=True)
    client = PyrakoonStore(cluster_name, configuration)
    client.nop()
    # perform set, get & compare
    client.set(key, value)
    get_value = client.get(key)
    assert get_value == value, "Value mismatch on cluster `{0}`, get value `{1}`, " \
        "expected value `{2}` on key `{3}`".format(cluster_name, get_value, value, key)
    # perform delete
    client.delete(key)
    try:
        assert not client.get(key), "Key `{0}` still exists on cluster `{1}` after deleting it"\
            .format(key, cluster_name)
    except KeyNotFoundException:
        # Expected outcome: the key is gone, so the lookup raises instead of
        # returning a value. (Replaced a meaningless `assert True` no-op.)
        pass
    ArakoonValidation.LOGGER.info("Finished validating arakoon cluster")