def add_services(client, node_type, logger):
    """
    Register the services an OVS node needs with the ServiceManager

    A 'master' node gets the master-only services on top of the services every node runs.
    Services which already exist on the client are skipped.
    :param client: Client on which to add the services
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param node_type: Type of node ('master' or 'extra')
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :return: None
    """
    Toolbox.log(logger=logger, messages='Adding services')

    wanted = {}
    queue = System.get_my_machine_id(client=client)
    if node_type == 'master':
        # Masters additionally listen on the shared 'ovs_masters' queue
        queue = queue + ',ovs_masters'
        wanted.update({'memcached': {'MEMCACHE_NODE_IP': client.ip,
                                     'WORKER_QUEUE': queue},
                       'rabbitmq-server': {'MEMCACHE_NODE_IP': client.ip,
                                           'WORKER_QUEUE': queue},
                       'scheduled-tasks': {},
                       'webapp-api': {},
                       'volumerouter-consumer': {}})
    wanted.update({'workers': {'WORKER_QUEUE': queue},
                   'watcher-framework': {}})

    for name, parameters in wanted.iteritems():
        if ServiceManager.has_service(name, client):
            continue
        Toolbox.log(logger=logger, messages='Adding service {0}'.format(name))
        ServiceManager.add_service(name=name, params=parameters, client=client)
def _configure_amqp_to_volumedriver():
    """
    Configure the event publisher of every StorageDriver found in the configuration

    The AMQP URIs are built from the message queue settings (protocol, user, password and endpoints)
    stored under '/ovs/framework/messagequeue'. Nothing is configured when '/ovs/vpools' does not exist.
    :return: None
    """
    Toolbox.log(logger=NodeTypeController._logger, messages='Update existing vPools')

    login = Configuration.get('/ovs/framework/messagequeue|user')
    password = Configuration.get('/ovs/framework/messagequeue|password')
    protocol = Configuration.get('/ovs/framework/messagequeue|protocol')
    uris = [{'amqp_uri': '{0}://{1}:{2}@{3}'.format(protocol, login, password, endpoint)}
            for endpoint in Configuration.get('/ovs/framework/messagequeue|endpoints')]

    if not Configuration.dir_exists('/ovs/vpools'):
        return

    for vpool_guid in Configuration.list('/ovs/vpools'):
        hosts_key = '/ovs/vpools/{0}/hosts'.format(vpool_guid)
        for storagedriver_id in Configuration.list(hosts_key):
            config = StorageDriverConfiguration(vpool_guid, storagedriver_id)
            # The routing key is read again for every StorageDriver, exactly like the URIs are re-used
            routing_key = Configuration.get('/ovs/framework/messagequeue|queues.storagedriver')
            config.configure_event_publisher(events_amqp_routing_key=routing_key,
                                             events_amqp_uris=uris)
            config.save()
def check_rabbitmq_and_enable_ha_mode(client, logger):
    """
    Make sure RabbitMQ is up on the client and apply the 'ha-all' policy

    The service is restarted when the service manager reports that RabbitMQ is not running
    or is not running as the same process.
    :param client: Client on which to check RabbitMQ
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :raises RuntimeError: When the 'rabbitmq-server' service has not been added on the client
    :return: None
    """
    manager = ServiceFactory.get_manager()
    if not manager.has_service('rabbitmq-server', client):
        message = 'Service rabbitmq-server has not been added on node {0}'.format(client.ip)
        raise RuntimeError(message)

    running, same_process = manager.is_rabbitmq_running(client=client)
    needs_restart = running is False or same_process is False
    if needs_restart:
        Toolbox.change_service_state(client, 'rabbitmq-server', 'restart', logger)

    # Give RabbitMQ a moment before the policy gets applied
    time.sleep(5)
    policy_command = ['rabbitmqctl', 'set_policy', 'ha-all', '^(volumerouter|ovs_.*)$', '{"ha-mode":"all"}']
    client.run(policy_command)
def configure_avahi(client, node_name, node_type, logger):
    """
    Write the Avahi service file announcing this node and restart the avahi-daemon

    The cluster name stored in the configuration is validated first; the second element of the
    validation result is either the error message or the name written into the service file.
    :param client: Client on which to configure avahi
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param node_name: Name of the node to set in Avahi
    :type node_name: str
    :param node_type: Type of the node ('master' or 'extra')
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :raises RuntimeError: When the Avahi cluster name validation fails
    :return: None
    """
    cluster_name = Configuration.get('/ovs/framework/cluster_name')
    outcome = NodeTypeController.validate_avahi_cluster_name(ip=client.ip,
                                                             cluster_name=cluster_name,
                                                             node_name=node_name)
    if outcome[0] is False:
        raise RuntimeError(outcome[1])

    Toolbox.log(logger=logger, messages='Announcing service')
    service_template = """<?xml version="1.0" standalone='no'?> <!--*-nxml-*--> <!DOCTYPE service-group SYSTEM "avahi-service.dtd"> <!-- $Id$ --> <service-group> <name replace-wildcards="yes">{0}</name> <service> <type>_ovs_{1}_node._tcp</type> <port>443</port> </service> </service-group>"""
    client.file_write(NodeTypeController.avahi_filename,
                      service_template.format(outcome[1], node_type))
    # NOTE(review): the restart is logged through NodeTypeController._logger rather than the 'logger' argument - confirm this is intended
    ServiceFactory.change_service_state(client, 'avahi-daemon', 'restart', NodeTypeController._logger)
def configure_avahi(client, node_name, node_type, logger):
    """
    Write the Avahi service file announcing this node and restart the avahi-daemon

    The announced name is composed of the cluster name from the configuration, the node name
    and the IP of the client with its dots replaced by underscores.
    :param client: Client on which to configure avahi
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param node_name: Name of the node to set in Avahi
    :type node_name: str
    :param node_type: Type of the node ('master' or 'extra')
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :return: None
    """
    cluster_name = Configuration.get('/ovs/framework/cluster_name')
    Toolbox.log(logger=logger, messages='Announcing service')

    service_template = """<?xml version="1.0" standalone='no'?> <!--*-nxml-*--> <!DOCTYPE service-group SYSTEM "avahi-service.dtd"> <!-- $Id$ --> <service-group> <name replace-wildcards="yes">ovs_cluster_{0}_{1}_{3}</name> <service> <type>_ovs_{2}_node._tcp</type> <port>443</port> </service> </service-group>"""
    target_file = NodeTypeController.avahi_filename
    ip_token = client.ip.replace('.', '_')
    client.file_write(target_file,
                      service_template.format(cluster_name, node_name, node_type, ip_token))
    # NOTE(review): the restart is logged through NodeTypeController._logger rather than the 'logger' argument - confirm this is intended
    Toolbox.change_service_state(client, 'avahi-daemon', 'restart', NodeTypeController._logger)
def apply(license_string):
    """
    Applies a license. It will apply as many licenses as possible, however, it won't fail on invalid
    licenses as it will simply skip them.

    A component is skipped when it has expired, when there is not exactly one 'validate' and one 'apply'
    hook registered for it, or when one of these hooks does not return True.
    Afterwards the hashes of all known licenses are written to every StorageRouter.
    Any error is logged and swallowed.
    :param license_string: License string, decoded by LicenseController._decode
    :return: None
    """
    try:
        storagerouters = StorageRouterList.get_storagerouters()
        clients = {}
        try:
            for storagerouter in storagerouters:
                clients[storagerouter] = SSHClient(storagerouter.ip)
        except UnableToConnectException:
            raise RuntimeError('Not all StorageRouters are reachable')

        licenses = LicenseController._decode(license_string)
        for component in licenses:
            cdata = licenses[component]
            name = cdata['name']
            # Deliberately not called 'data': rebinding the decoded license dict inside this loop
            # made the lookup of every component after the first one use the wrong dict
            component_data = cdata['data']
            token = cdata['token']
            valid_until = float(cdata['valid_until']) if 'valid_until' in cdata else None
            if valid_until is not None and valid_until <= time.time():
                continue  # Expired
            signature = cdata['signature'] if 'signature' in cdata else None

            validate_functions = Toolbox.fetch_hooks('license', '{0}.validate'.format(component))
            apply_functions = Toolbox.fetch_hooks('license', '{0}.apply'.format(component))
            if len(validate_functions) != 1 or len(apply_functions) != 1:
                continue
            valid, _ = validate_functions[0](component=component, data=component_data, signature=signature)
            if valid is not True:
                continue
            success = apply_functions[0](component=component, data=component_data, signature=signature)
            if success is not True:
                continue

            # Update the existing license of this component or create a new one
            license_object = LicenseList.get_by_component(component)
            if license_object is None:
                license_object = License()
            license_object.component = component
            license_object.name = name
            license_object.token = token
            license_object.data = component_data
            license_object.valid_until = valid_until
            license_object.signature = signature
            license_object.save()

        license_contents = [lic.hash for lic in LicenseList.get_licenses()]
        file_contents = '{0}\n'.format('\n'.join(license_contents))
        for storagerouter in storagerouters:
            clients[storagerouter].file_write('/opt/OpenvStorage/config/licenses', file_contents)
    except Exception as ex:
        logger.exception('Error applying license: {0}'.format(ex))
        return None
def get_metadata(storagerouter):
    """
    Fetch the metadata of a Storage Router through the API and verify its structure

    Example return value:
        {'ipaddresses': ['10.100.174.254', '172.22.1.100', '192.168.122.1'],
         'partitions': {'BACKEND': [{'available': 1000202043392,
                                     'guid': '9ec473ad-5c3f-4fdb-a4ef-c99bb4449025',
                                     'in_use': False,
                                     'mountpoint': u'/mnt/alba-asd/hiu8WiD7sCfVF2IKRa5U1VZLOBS3H75W',
                                     'size': 1000202043392,
                                     'ssd': False,
                                     'storagerouter_guid': u'f5155bc2-b238-4a94-b6ce-b5600e65607a'}],
                        'DB': [{'available': 425200713728,
                                'guid': 'c0064548-c0be-474d-a66b-da65639831f8',
                                'in_use': False,
                                'mountpoint': '/mnt/storage',
                                'size': 425200713728,
                                'ssd': False,
                                'storagerouter_guid': u'f5155bc2-b238-4a94-b6ce-b5600e65607a'}],
                        'SCRUB': [{'available': 340160570983,
                                   'guid': 'c0064548-c0be-474d-a66b-da65639831f8',
                                   'in_use': False,
                                   'mountpoint': '/mnt/storage',
                                   'size': 425200713728,
                                   'ssd': False,
                                   'storagerouter_guid': u'f5155bc2-b238-4a94-b6ce-b5600e65607a'}],
                        'WRITE': [{'available': 60016295936,
                                   'guid': '0d167ced-5a5f-47aa-b890-45b923b686c4',
                                   'in_use': False,
                                   'mountpoint': u'/mnt/ssd2',
                                   'size': 60016295936,
                                   'ssd': True,
                                   'storagerouter_guid': u'f5155bc2-b238-4a94-b6ce-b5600e65607a'}]},
         'scrub_available': True,
         'writecache_size': 60016295936}

    :param storagerouter: Storage Router to retrieve metadata for
    :return: Metadata
    """
    succeeded, metadata = GeneralStorageRouter.api.execute_post_action(component='storagerouters',
                                                                       guid=storagerouter.guid,
                                                                       action='get_metadata',
                                                                       data={},
                                                                       wait=True,
                                                                       timeout=300)
    assert succeeded is True,\
        'Retrieving metadata failed for Storage Router {0}'.format(storagerouter.name)

    # The returned structure must contain exactly these keys
    Toolbox.verify_required_params(required_params={'ipaddresses': (list, Toolbox.regex_ip),
                                                    'partitions': (dict, None),
                                                    'scrub_available': (bool, None),
                                                    'writecache_size': (int, {'min': 0})},
                                   actual_params=metadata,
                                   exact_match=True)
    return metadata
def refresh_package_information():
    """
    Collect the package information of all StorageRouters and store it on each of them

    The 'get_package_info_multi' hooks run in a thread per StorageRouter, the 'get_package_info_single'
    hooks run in a single thread each. All of them fill the same dict, keyed by StorageRouter IP.
    StorageRouters for which errors were reported get an empty package information.
    :raises Exception: When at least 1 error has been reported, after the StorageRouters have been saved
    :return: None
    """
    GenericController._logger.info('Updating package information')

    workers = []

    def _spawn(target, arguments):
        # Start the hook in its own thread and remember the thread so it can be joined
        worker = Thread(target=target, args=arguments)
        worker.start()
        workers.append(worker)

    information = {}
    storagerouters = StorageRouterList.get_storagerouters()
    for storagerouter in storagerouters:
        information[storagerouter.ip] = {}
        for hook in Toolbox.fetch_hooks('update', 'get_package_info_multi'):
            try:
                # The clients are handed to threads, so they may not be cached
                client = SSHClient(endpoint=storagerouter, username='******', cached=False)
            except UnableToConnectException:
                information[storagerouter.ip]['errors'] = ['StorageRouter {0} is inaccessible'.format(storagerouter.name)]
                break
            _spawn(hook, (client, information))

    for hook in Toolbox.fetch_hooks('update', 'get_package_info_single'):
        _spawn(hook, (information,))

    for worker in workers:
        worker.join()

    # Split the collected data in errors and usable information
    failures = []
    usable = copy.deepcopy(information)
    for ip, info in information.iteritems():
        reported = info.get('errors', [])
        if len(reported) > 0:
            failures.extend(['{0}: {1}'.format(ip, error) for error in reported])
            usable.pop(ip)

    for storagerouter in storagerouters:
        package_info = usable.get(storagerouter.ip, {})
        package_info.pop('errors', None)
        storagerouter.package_information = package_info
        storagerouter.save()

    if len(failures) > 0:
        unique_failures = [str(failure) for failure in set(failures)]
        raise Exception(' - {0}'.format('\n - '.join(unique_failures)))
def retrieve_storagerouter_info_via_host(ip, password):
    """
    Ask a remote host for the StorageRouters known in its model

    Errors are logged and not raised; whatever was collected before the error occurred is returned.
    :param ip: Passed as 'ip_info' to the remote connection
    :param password: Password used for the remote connection
    :return: IP and lower-cased node type per StorageRouter name, e.g. {'name': {'ip': ..., 'type': ...}}
    :rtype: dict
    """
    found = {}
    try:
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        with remote(ip_info=ip,
                    modules=[StorageRouterList],
                    username='******',
                    password=password,
                    strict_host_key_checking=False) as rem:
            for storagerouter in rem.StorageRouterList.get_storagerouters():
                entry = {'ip': storagerouter.ip,
                         'type': storagerouter.node_type.lower()}
                found[storagerouter.name] = entry
    except Exception as ex:
        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Error loading storagerouters: {0}'.format(ex),
                    loglevel='exception',
                    silent=True)
    return found
def configure_memcached(client, logger):
    """
    Configure Memcached by editing /etc/memcached.conf in place with sed

    - Lines starting with '-l' become '-l 0.0.0.0'
    - Lines starting with '-m' become '-m 1024'
    - Lines starting with '-v' (so -v, -vv, ...) are commented out, after which only a plain '-v' gets enabled again
    :param client: Client on which to configure Memcached
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :return: None
    """
    Toolbox.log(logger=logger, messages='Setting up Memcached')
    client.run(['sed', '-i', 's/^-l.*/-l 0.0.0.0/g', '/etc/memcached.conf'])
    client.run(['sed', '-i', 's/^-m.*/-m 1024/g', '/etc/memcached.conf'])
    # Put all -v, -vv, ... back in comment
    # The expression must be a raw string: in a regular literal '\1' is the octal escape for chr(1), so sed
    # received a control character instead of the back-reference and '-vv' was turned into '-v' by the next step
    client.run(['sed', '-i', '-E', r's/^-v(.*)/# -v\1/g', '/etc/memcached.conf'])
    # Uncomment only -v
    client.run(['sed', '-i', 's/^# -v[^v]*$/-v/g', '/etc/memcached.conf'])
def restart_framework_and_memcache_services(clients, logger, offline_node_ips=None):
    """
    Restart the framework and Memcached services on all nodes which are not offline

    Order of operations: stop 'watcher-framework' on every node, restart 'memcached' on the masters,
    start 'watcher-framework' and restart 'support-agent' on every node. The watcher and support agent
    are only touched on nodes where the service exists. Finally the VolatileFactory store is reset.
    :param clients: Clients on which to restart these services, keyed by IP
    :type clients: dict
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :param offline_node_ips: IP addresses of offline nodes in the cluster
    :type offline_node_ips: list
    :return: None
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList

    master_ips = [storagerouter.ip for storagerouter in StorageRouterList.get_masters()]
    slave_ips = [storagerouter.ip for storagerouter in StorageRouterList.get_slaves()]
    if offline_node_ips is None:
        offline_node_ips = []

    online_masters = [ip for ip in master_ips if ip not in offline_node_ips]
    online_nodes = online_masters + [ip for ip in slave_ips if ip not in offline_node_ips]

    for ip in online_nodes:
        if ServiceManager.has_service('watcher-framework', clients[ip]):
            Toolbox.change_service_state(clients[ip], 'watcher-framework', 'stop', logger)

    for ip in online_masters:
        Toolbox.change_service_state(clients[ip], 'memcached', 'restart', logger)

    for ip in online_nodes:
        if ServiceManager.has_service('watcher-framework', clients[ip]):
            Toolbox.change_service_state(clients[ip], 'watcher-framework', 'start', logger)
        if ServiceManager.has_service('support-agent', clients[ip]):
            Toolbox.change_service_state(clients[ip], 'support-agent', 'restart', logger)

    VolatileFactory.store = None
def validate_alba_backend_removal(alba_backend_info):
    """
    Assert that nothing is left behind of a removed ALBA backend

    Verifies the model, the AlbaManager and NamespaceManager services in the model, the configuration
    under '/ovs/alba/backends' and '/ovs/arakoon' and the services deployed on every Storage Router.
    alba_backend_info must be a dictionary containing exactly:
        - guid
        - name
        - maintenance_service_names
    :param alba_backend_info: Information about the backend
    :type alba_backend_info: dict
    :return: None
    """
    Toolbox.verify_required_params(actual_params=alba_backend_info,
                                   required_params={'name': (str, None),
                                                    'guid': (str, Toolbox.regex_guid),
                                                    'maintenance_service_names': (list, None)},
                                   exact_match=True)
    guid = alba_backend_info['guid']
    name = alba_backend_info['name']

    # Model
    backend = GeneralBackend.get_by_name(name)
    assert backend is None,\
        'Still found a backend in the model with name {0}'.format(name)

    # Services in the model
    abm_service_name = '{0}-abm'.format(name)
    for service in GeneralService.get_services_by_name(ServiceType.SERVICE_TYPES.ALBA_MGR):
        assert service.name != abm_service_name,\
            'An AlbaManager service has been found with name {0}'.format(name)
    nsm_service_prefix = '{0}-nsm_'.format(name)
    for service in GeneralService.get_services_by_name(ServiceType.SERVICE_TYPES.NS_MGR):
        assert service.name.startswith(nsm_service_prefix) is False,\
            'An NamespaceManager service has been found with name {0}'.format(name)

    # ALBA backend configuration structure
    alba_backend_key = '/ovs/alba/backends'
    configured_guids = list(Configuration.list(alba_backend_key))
    assert guid not in configured_guids,\
        'Configuration still contains an entry in {0} with guid {1}'.format(alba_backend_key, guid)

    # Arakoon configuration structure
    leftover_clusters = [key for key in Configuration.list('/ovs/arakoon') if key.startswith(name)]
    assert len(leftover_clusters) == 0,\
        'Configuration still contains configurations for clusters: {0}'.format(', '.join(leftover_clusters))

    # Services deployed on the Storage Routers
    removed_services = ['ovs-arakoon-{0}-abm'.format(name),
                        'ovs-arakoon-{0}-nsm_0'.format(name)] + alba_backend_info['maintenance_service_names']
    for storagerouter in GeneralStorageRouter.get_storage_routers():
        root_client = SSHClient(endpoint=storagerouter, username='******')
        for service_name in removed_services:
            assert GeneralService.has_service(name=service_name, client=root_client) is False,\
                'Service {0} still deployed on Storage Router {1}'.format(service_name, storagerouter.name)
def validate(license_string):
    """
    Validates a license with the various components

    Every component of the decoded license ends up in the result, mapped to either
        - False: expired, no single validate/apply hook pair registered, or rejected by the validate hook
        - a dict with the keys 'valid_until', 'metadata' and 'name'
    :param license_string: License string, decoded by LicenseController._decode
    :return: Validation result per component
    :rtype: dict
    """
    try:
        result = {}
        licenses = LicenseController._decode(license_string)
        for component in licenses:
            cdata = licenses[component]
            name = cdata['name']
            # Deliberately not called 'data': rebinding the decoded license dict inside this loop
            # made the lookup of every component after the first one use the wrong dict
            component_data = cdata['data']
            _ = cdata['token']  # Value is not used here, the lookup only fails when the token is missing
            valid_until = float(cdata['valid_until']) if 'valid_until' in cdata else None
            if valid_until is not None and valid_until <= time.time():
                result[component] = False  # Expired
                continue
            signature = cdata['signature'] if 'signature' in cdata else None

            validate_functions = Toolbox.fetch_hooks('license', '{0}.validate'.format(component))
            apply_functions = Toolbox.fetch_hooks('license', '{0}.apply'.format(component))
            if len(validate_functions) != 1 or len(apply_functions) != 1:
                logger.debug('No validate nor apply functions found for {0}'.format(component))
                result[component] = False
                continue

            try:
                valid, metadata = validate_functions[0](component=component, data=component_data, signature=signature)
            except Exception as ex:
                logger.debug('Error validating license for {0}: {1}'.format(component, ex))
                valid = False
                metadata = None
            if valid is False:
                logger.debug('Invalid license for {0}: {1}'.format(component, license_string))
                result[component] = False
            else:
                result[component] = {'valid_until': valid_until,
                                     'metadata': metadata,
                                     'name': name}
        return result
    except Exception as ex:
        # NOTE(review): the reviewed code opened this 'try' without a visible handler. The error is
        # logged and re-raised so nothing is swallowed - confirm the intended handling
        logger.exception('Error validating license: {0}'.format(ex))
        raise
def apply(license_string):
    """
    Applies a license. It will apply as many licenses as possible, however, it won't fail on invalid
    licenses as it will simply skip them.

    A component is skipped when it has expired, when there is not exactly one 'validate' and one 'apply'
    hook registered for it, or when one of these hooks does not return True.
    Afterwards the hashes of all known licenses are written to every StorageRouter.
    Any error is logged and swallowed.
    :param license_string: License string, decoded by LicenseController._decode
    :return: None
    """
    try:
        storagerouters = StorageRouterList.get_storagerouters()
        clients = {}
        try:
            for storagerouter in storagerouters:
                clients[storagerouter] = SSHClient(storagerouter.ip)
        except UnableToConnectException:
            raise RuntimeError('Not all StorageRouters are reachable')

        licenses = LicenseController._decode(license_string)
        for component in licenses:
            cdata = licenses[component]
            name = cdata['name']
            # Deliberately not called 'data': rebinding the decoded license dict inside this loop
            # made the lookup of every component after the first one use the wrong dict
            component_data = cdata['data']
            token = cdata['token']
            valid_until = float(cdata['valid_until']) if 'valid_until' in cdata else None
            if valid_until is not None and valid_until <= time.time():
                continue  # Expired
            signature = cdata['signature'] if 'signature' in cdata else None

            validate_functions = Toolbox.fetch_hooks('license', '{0}.validate'.format(component))
            apply_functions = Toolbox.fetch_hooks('license', '{0}.apply'.format(component))
            if len(validate_functions) != 1 or len(apply_functions) != 1:
                continue
            valid, _ = validate_functions[0](component=component, data=component_data, signature=signature)
            if valid is not True:
                continue
            success = apply_functions[0](component=component, data=component_data, signature=signature)
            if success is not True:
                continue

            # Update the existing license of this component or create a new one
            license_object = LicenseList.get_by_component(component)
            if license_object is None:
                license_object = License()
            license_object.component = component
            license_object.name = name
            license_object.token = token
            license_object.data = component_data
            license_object.valid_until = valid_until
            license_object.signature = signature
            license_object.save()

        license_contents = [lic.hash for lic in LicenseList.get_licenses()]
        file_contents = '{0}\n'.format('\n'.join(license_contents))
        for storagerouter in storagerouters:
            clients[storagerouter].file_write('/opt/OpenvStorage/config/licenses', file_contents)
    except Exception as ex:
        LicenseController._logger.exception('Error applying license: {0}'.format(ex))
        return None
def refresh_package_information():
    """
    Collect the package information of all StorageRouters and store it on each of them

    Hooks registered as 'get_package_info_multi' are started in a thread for every StorageRouter, hooks
    registered as 'get_package_info_single' are started once. They all write into one dict keyed by
    StorageRouter IP. A StorageRouter with reported errors gets an empty package information.
    :raises Exception: When at least 1 error has been reported, after the StorageRouters have been saved
    :return: None
    """
    GenericController._logger.info('Updating package information')

    running = []
    collected = {}
    all_storagerouters = StorageRouterList.get_storagerouters()
    for storagerouter in all_storagerouters:
        node_ip = storagerouter.ip
        collected[node_ip] = {}
        for hook in Toolbox.fetch_hooks('update', 'get_package_info_multi'):
            try:
                # These clients are used in threads, hence cached=False
                client = SSHClient(endpoint=storagerouter, username='******', cached=False)
            except UnableToConnectException:
                collected[node_ip]['errors'] = ['StorageRouter {0} is inaccessible'.format(storagerouter.name)]
                break
            multi_thread = Thread(target=hook, args=(client, collected))
            multi_thread.start()
            running.append(multi_thread)

    for hook in Toolbox.fetch_hooks('update', 'get_package_info_single'):
        single_thread = Thread(target=hook, args=(collected,))
        single_thread.start()
        running.append(single_thread)

    for thread in running:
        thread.join()

    # Nodes which reported errors are dropped from the information that gets stored
    problems = []
    storable = copy.deepcopy(collected)
    for ip, info in collected.iteritems():
        if len(info.get('errors', [])) == 0:
            continue
        problems.extend(['{0}: {1}'.format(ip, error) for error in info['errors']])
        storable.pop(ip)

    for storagerouter in all_storagerouters:
        package_info = storable.get(storagerouter.ip, {})
        if 'errors' in package_info:
            del package_info['errors']
        storagerouter.package_information = package_info
        storagerouter.save()

    if len(problems) > 0:
        problems = [str(problem) for problem in set(problems)]
        raise Exception(' - {0}'.format('\n - '.join(problems)))
def remove(license_guid):
    """
    Removes a license

    The 'remove' hook of the license's component decides whether the license gets deleted. After a
    deletion, the hashes of the remaining licenses are written to every StorageRouter.
    :param license_guid: Guid of the license to remove
    :raises RuntimeError: When a StorageRouter cannot be reached
    :return: Result of the remove hook, None when the license cannot be removed
    """
    storagerouters = StorageRouterList.get_storagerouters()
    clients = {}
    try:
        for storagerouter in storagerouters:
            clients[storagerouter] = SSHClient(storagerouter.ip)
    except UnableToConnectException:
        raise RuntimeError('Not all StorageRouters are reachable')

    lic = License(license_guid)
    if lic.can_remove is not True:
        return None

    hook = Toolbox.fetch_hooks('license', '{0}.remove'.format(lic.component))[0]
    result = hook(component=lic.component,
                  data=lic.data,
                  valid_until=lic.valid_until,
                  signature=lic.signature)
    if result is True:
        lic.delete()
        remaining_hashes = [remaining.hash for remaining in LicenseList.get_licenses()]
        for storagerouter in storagerouters:
            clients[storagerouter].file_write('/opt/OpenvStorage/config/licenses',
                                              '{0}\n'.format('\n'.join(remaining_hashes)))
    return result
def _get_package_information(self):
    """
    Collect the package versions which the 'get_package_info_cluster' update hooks report for this client

    Every hook runs in its own thread and receives the client and a dict keyed by the client's IP.
    When a package is reported more than once with different versions, the lowest version is kept.
    :return: Package names in sorted order, mapped to the outcome of _stringify_looseversion for the kept version
    :rtype: collections.OrderedDict
    """
    collected = {self._client.ip: {}}
    # Alba is always installed with OpenvStorage. The current split however offloads retrieving Alba information to the AlbaNode, which is
    # not present for non openvstorage-hc installs. Therefore the Alba information is explicitly requested like this (otherwise the update will get jeopardized).
    workers = []
    for hook in Toolbox.fetch_hooks(component='update', sub_component='get_package_info_cluster'):
        worker = Thread(target=hook, args=(self._client, collected))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    lowest = {}
    for versions in collected[self._client.ip].itervalues():
        for package, version in versions.iteritems():
            if package not in lowest:
                lowest[package] = version
            elif version != lowest[package]:
                lowest[package] = min(version, lowest[package])

    return OrderedDict((package, self._stringify_looseversion(lowest[package]))
                       for package in sorted(lowest))
def _can_remove(self):
    """
    Whether this license can be removed: exactly 1 'remove' license hook is registered for its component
    :return: True when exactly 1 hook is found, False otherwise
    :rtype: bool
    """
    hook_name = '{0}.remove'.format(self.component)
    remove_hooks = Toolbox.fetch_hooks('license', hook_name)
    return len(remove_hooks) == 1
def remove(license_guid):
    """
    Removes a license

    The license is only deleted when the 'remove' hook of its component returns True. In that case the
    licenses file on every StorageRouter is rewritten with the hashes of the licenses that are left.
    :param license_guid: Guid of the license to remove
    :raises RuntimeError: When a StorageRouter cannot be reached
    :return: Result of the remove hook, None when the license cannot be removed
    """
    clients = {}
    all_storagerouters = StorageRouterList.get_storagerouters()
    try:
        for storagerouter in all_storagerouters:
            clients[storagerouter] = SSHClient(storagerouter.ip)
    except UnableToConnectException:
        raise RuntimeError('Not all StorageRouters are reachable')

    lic = License(license_guid)
    if lic.can_remove is True:
        remove_hooks = Toolbox.fetch_hooks('license', '{0}.remove'.format(lic.component))
        outcome = remove_hooks[0](component=lic.component,
                                  data=lic.data,
                                  valid_until=lic.valid_until,
                                  signature=lic.signature)
        if outcome is not True:
            return outcome

        lic.delete()
        hashes = []
        for leftover in LicenseList.get_licenses():
            hashes.append(leftover.hash)
        for storagerouter in all_storagerouters:
            ssh_client = clients[storagerouter]
            ssh_client.file_write('/opt/OpenvStorage/config/licenses', '{0}\n'.format('\n'.join(hashes)))
        return outcome
    return None
def check_rabbitmq_and_enable_ha_mode(client, logger):
    """
    Make sure RabbitMQ is up on the client and apply the 'ha-all' policy

    The service is restarted when the ServiceManager reports that RabbitMQ is not running
    or is not running as the same process.
    :param client: Client on which to check RabbitMQ
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :raises RuntimeError: When the 'rabbitmq-server' service has not been added on the client
    :return: None
    """
    service_present = ServiceManager.has_service('rabbitmq-server', client)
    if not service_present:
        raise RuntimeError('Service rabbitmq-server has not been added on node {0}'.format(client.ip))

    status = ServiceManager.is_rabbitmq_running(client=client)
    rabbitmq_running, same_process = status
    if rabbitmq_running is False or same_process is False:
        Toolbox.change_service_state(client, 'rabbitmq-server', 'restart', logger)

    # Give RabbitMQ a moment before the policy gets applied
    time.sleep(5)
    client.run(['rabbitmqctl',
                'set_policy',
                'ha-all',
                '^(volumerouter|ovs_.*)$',
                '{"ha-mode":"all"}'])
def avahi_installed(client, logger):
    """
    Check whether 'which avahi-daemon' yields any output on the client and log the outcome
    :param client: Client on which to check for Avahi
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: True if Avahi is installed, False otherwise
    :rtype: bool
    """
    which_output = client.run(['which', 'avahi-daemon'], allow_nonzero=True)
    # An empty output means the binary could not be located
    missing = which_output == ''
    if missing:
        Toolbox.log(logger=logger, messages='Avahi not installed')
    else:
        Toolbox.log(logger=logger, messages='Avahi installed')
    return not missing
def avahi_installed(client, logger):
    """
    Check whether 'which avahi-daemon' yields any output on the client and log the outcome
    :param client: Client on which to check for Avahi
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :return: True if Avahi is installed, False otherwise
    :rtype: bool
    """
    location = client.run(['which', 'avahi-daemon'], allow_nonzero=True)
    if location == '':
        # No output: the binary could not be located
        message, found = 'Avahi not installed', False
    else:
        message, found = 'Avahi installed', True
    Toolbox.log(logger=logger, messages=message)
    return found
def configure_memcached(client, logger):
    """
    Configure Memcached by editing /etc/memcached.conf in place with sed

    - Lines starting with '-l' become '-l 0.0.0.0'
    - Lines starting with '-m' become '-m 1024'
    - Lines starting with '-v' (so -v, -vv, ...) are commented out, after which only a plain '-v' gets enabled again
    :param client: Client on which to configure Memcached
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: None
    """
    Toolbox.log(logger=logger, messages='Setting up Memcached')
    client.run(['sed', '-i', 's/^-l.*/-l 0.0.0.0/g', '/etc/memcached.conf'])
    client.run(['sed', '-i', 's/^-m.*/-m 1024/g', '/etc/memcached.conf'])
    # Put all -v, -vv, ... back in comment
    # The expression must be a raw string: in a regular literal '\1' is the octal escape for chr(1), so sed
    # received a control character instead of the back-reference and '-vv' was turned into '-v' by the next step
    client.run(['sed', '-i', '-E', r's/^-v(.*)/# -v\1/g', '/etc/memcached.conf'])
    # Uncomment only -v
    client.run(['sed', '-i', 's/^# -v[^v]*$/-v/g', '/etc/memcached.conf'])
def run_backend_domain_hooks(backend_guid):
    """
    Call every hook registered for 'backend' / 'domains-update' with the guid of the Backend
    :param backend_guid: Guid of the Backend to update
    :type backend_guid: str
    :return: None
    """
    domain_hooks = Toolbox.fetch_hooks('backend', 'domains-update')
    for hook in domain_hooks:
        hook(backend_guid=backend_guid)
def run_backend_domain_hooks(backend_guid):
    """
    Notify the 'domains-update' backend hooks that the Domains of a Backend have been updated
    :param backend_guid: Guid of the Backend to update
    :type backend_guid: str
    :return: None
    """
    registered = Toolbox.fetch_hooks('backend', 'domains-update')
    for callback in registered:
        # Every hook receives the guid as keyword argument
        callback(backend_guid=backend_guid)
def _configure_amqp_to_volumedriver():
    """
    Configure the event publisher of every StorageDriver found in the configuration

    The AMQP URIs are built from the message queue settings (protocol, user, password and endpoints)
    stored under '/ovs/framework/messagequeue'. Every StorageDriver configuration is loaded, updated
    and saved again. Nothing is configured when '/ovs/vpools' does not exist.
    :return: None
    """
    Toolbox.log(logger=NodeTypeController._logger, messages='Update existing vPools')

    login = Configuration.get('/ovs/framework/messagequeue|user')
    password = Configuration.get('/ovs/framework/messagequeue|password')
    protocol = Configuration.get('/ovs/framework/messagequeue|protocol')
    uris = [{'amqp_uri': '{0}://{1}:{2}@{3}'.format(protocol, login, password, endpoint)}
            for endpoint in Configuration.get('/ovs/framework/messagequeue|endpoints')]

    if not Configuration.dir_exists('/ovs/vpools'):
        return

    for vpool_guid in Configuration.list('/ovs/vpools'):
        for storagedriver_id in Configuration.list('/ovs/vpools/{0}/hosts'.format(vpool_guid)):
            config = StorageDriverConfiguration('storagedriver', vpool_guid, storagedriver_id)
            config.load()
            routing_key = Configuration.get('/ovs/framework/messagequeue|queues.storagedriver')
            config.configure_event_publisher(events_amqp_routing_key=routing_key,
                                             events_amqp_uris=uris)
            config.save()
def add_services(client, node_type, logger):
    """
    Register the services an OVS node needs with the service manager

    A 'master' node gets the master-only services on top of the services every node runs.
    Services which already exist on the client are skipped.
    :param client: Client on which to add the services
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param node_type: Type of node ('master' or 'extra')
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: None
    """
    Toolbox.log(logger=logger, messages='Adding services')
    manager = ServiceFactory.get_manager()

    required = {}
    queue = System.get_my_machine_id(client=client)
    if node_type == 'master':
        # Masters additionally listen on the shared 'ovs_masters' queue
        queue = queue + ',ovs_masters'
        required.update({'memcached': {'MEMCACHE_NODE_IP': client.ip,
                                       'WORKER_QUEUE': queue},
                         'rabbitmq-server': {'MEMCACHE_NODE_IP': client.ip,
                                             'WORKER_QUEUE': queue},
                         'scheduled-tasks': {},
                         'webapp-api': {},
                         'volumerouter-consumer': {}})
    required.update({'workers': {'WORKER_QUEUE': queue},
                     'watcher-framework': {}})

    for name, parameters in required.iteritems():
        if manager.has_service(name, client):
            continue
        Toolbox.log(logger=logger, messages='Adding service {0}'.format(name))
        manager.add_service(name=name, params=parameters, client=client)
def install_plugins():
    """
    (Re)load plugins

    When the 'ovs-watcher-framework' service exists locally, the watcher and memcached services are
    stopped and started again on every StorageRouter, the model is migrated and the 'postinstall'
    plugin hooks are executed with the IP of the local StorageRouter.
    :raises RuntimeError: When a StorageRouter is unreachable or a service does not reach the requested state within 30 polls
    :return: None
    """
    if not ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
        # Without the watcher nothing has to be restarted: the plugin will be loaded once 'ovs setup' is executed
        return

    def _switch_and_wait(action, service_name, client, wanted_status, failure):
        # Trigger the state change and poll once per second, 30 times at most, until the status matches
        action(service_name, client=client)
        for _ in range(30):
            if ServiceManager.get_service_status(service_name, client=client) is wanted_status:
                break
            time.sleep(1)
        else:
            raise RuntimeError(failure.format(service_name))

    # The watcher is present, so 'ovs setup' was executed and everything must be restarted to load the plugin
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    clients = []
    try:
        for storagerouter in StorageRouterList.get_storagerouters():
            clients.append(SSHClient(storagerouter, username='******'))
    except UnableToConnectException:
        raise RuntimeError('Not all StorageRouters are reachable')

    for client in clients:
        for service_name in ['watcher-framework', 'memcached']:
            _switch_and_wait(ServiceManager.stop_service, service_name, client, False, 'Could not stop service: {0}')

    for client in clients:
        for service_name in ['memcached', 'watcher-framework']:
            _switch_and_wait(ServiceManager.start_service, service_name, client, True, 'Could not start service: {0}')

    from ovs.dal.helpers import Migration
    Migration.migrate()

    from ovs.lib.helpers.toolbox import Toolbox
    ip = System.get_my_storagerouter().ip
    for hook in Toolbox.fetch_hooks('plugin', 'postinstall'):
        hook(ip=ip)
def remove_services(client, node_type, logger):
    """
    Stop and remove the services managed by OVS

    'rabbitmq-server' and 'memcached' are only handled on master nodes when they are internally
    managed, and they are stopped without being removed. Services that do not exist on the client
    are skipped.
    :param client: Client on which to remove the services
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param node_type: Type of node, can be 'master' or 'extra'
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :return: None
    """
    Toolbox.log(logger=logger, messages="Removing services")

    keep_installed = ["rabbitmq-server", "memcached"]
    candidates = ["workers", "support-agent", "watcher-framework"]
    if node_type == "master":
        candidates.extend(["scheduled-tasks", "webapp-api", "volumerouter-consumer"])
        if Toolbox.is_service_internally_managed(service="rabbitmq") is True:
            candidates.append("rabbitmq-server")
        if Toolbox.is_service_internally_managed(service="memcached") is True:
            candidates.append("memcached")

    for service in candidates:
        if not ServiceManager.has_service(service, client=client):
            continue
        only_stop = service in keep_installed
        verb = "Stopping" if only_stop else "Removing"
        Toolbox.log(logger=logger, messages="{0} service {1}".format(verb, service))
        ServiceManager.stop_service(service, client=client)
        if not only_stop:
            ServiceManager.remove_service(service, client=client)
def change_services_state(services, ssh_clients, action):
    """
    Stop/start/restart services on SSH clients

    For the 'start' action the services are handled in reverse order. A failure is logged as warning;
    only for the 'stop' action it aborts the operation, for the other actions the remaining services
    and clients are still handled.
    :param services: Names of the services to change the state for (not modified)
    :param ssh_clients: Clients on which to change the state of the services
    :param action: State to apply: 'stop', 'start' or anything else for a restart
    :return: False as soon as stopping a service fails, True otherwise
    :rtype: bool
    """
    ordered_services = list(services)
    if action == 'start':
        # Start services again in reverse order of stopping
        ordered_services = ordered_services[::-1]

    if action == 'stop':
        description = 'stopping'
    elif action == 'start':
        description = 'starting'
    else:
        description = 'restarting'

    for service_name in ordered_services:
        for ssh_client in ssh_clients:
            try:
                if not ServiceManager.has_service(service_name, client=ssh_client):
                    continue
                Toolbox.change_service_state(client=ssh_client,
                                             name=service_name,
                                             state=action,
                                             logger=UpdateController._logger)
            except Exception as exc:
                UpdateController._logger.warning('{0}: Something went wrong {1} service {2}: {3}'.format(ssh_client.ip, description, service_name, exc))
                if action == 'stop':
                    return False
    return True
def validate(license_string):
    """
    Validates a license with the various components
    The license string is decoded into a mapping of component -> component data. Every component ends up
    in the result either as False (expired, rejected, or no unique validate/apply hook pair) or as a dict
    holding 'valid_until', 'metadata' and 'name'
    :param license_string: Encoded license, as accepted by LicenseController._decode
    :return: Validation result per component
    """
    # NOTE(review): the handler belonging to this outer 'try' is not part of the visible fragment
    try:
        result = {}
        data = LicenseController._decode(license_string)
        for component in data:
            cdata = data[component]
            name = cdata['name']
            # NOTE(review): 'data' is rebound to this component's payload here, so from the second
            # iteration on 'data[component]' above indexes the previous payload instead of the decoded license
            data = cdata['data']
            # 'token' is only read to enforce its presence (KeyError when missing)
            _ = cdata['token']
            valid_until = float(cdata['valid_until']) if 'valid_until' in cdata else None
            if valid_until is not None and valid_until <= time.time():
                # Expired license
                result[component] = False
                continue
            signature = cdata['signature'] if 'signature' in cdata else None
            validate_functions = Toolbox.fetch_hooks('license', '{0}.validate'.format(component))
            apply_functions = Toolbox.fetch_hooks('license', '{0}.apply'.format(component))
            # Exactly one validate hook and one apply hook must be registered for the component
            if len(validate_functions) == 1 and len(apply_functions) == 1:
                try:
                    valid, metadata = validate_functions[0](component=component, data=data, signature=signature)
                except Exception, ex:
                    # A crashing hook is treated as an invalid license
                    LicenseController._logger.debug('Error validating license for {0}: {1}'.format(component, ex))
                    valid = False
                    metadata = None
                if valid is False:
                    LicenseController._logger.debug('Invalid license for {0}: {1}'.format(component, license_string))
                    result[component] = False
                else:
                    result[component] = {'valid_until': valid_until, 'metadata': metadata, 'name': name}
            else:
                LicenseController._logger.debug('No validate nor apply functions found for {0}'.format(component))
                result[component] = False
        return result
def _change_services_state(services, ssh_clients, action):
    """
    Stop/start services on SSH clients
    If stopping a service fails, the operation is aborted immediately. For every other action the error is
    logged as a warning and the remaining services / clients are still processed
    :param services: Names of the services, in the order in which they get stopped (any iterable)
    :param ssh_clients: Clients on which every service must change state
    :param action: State to apply: 'stop' or 'start'; any other value is passed on unchanged and is
                   reported as a restart in the log messages
    :return: False as soon as stopping a service failed, True otherwise
    :rtype: bool
    """
    # Work on a copy: reversing the caller's list in-place would flip its order on every 'start' call
    services = list(services)
    if action == 'start':
        services.reverse()  # Start services again in reverse order of stopping

    if action == 'stop':
        description, completed = 'stopping', 'Stopped'
    elif action == 'start':
        description, completed = 'starting', 'Started'
    else:
        description, completed = 'restarting', 'Restarted'

    for service_name in services:
        for ssh_client in ssh_clients:
            try:
                if ServiceManager.has_service(service_name, client=ssh_client):
                    UpdateController._log_message('{0} service {1}'.format(description.capitalize(), service_name),
                                                  ssh_client.ip)
                    Toolbox.change_service_state(client=ssh_client,
                                                 name=service_name,
                                                 state=action,
                                                 logger=UpdateController._logger)
                    UpdateController._log_message('{0} service {1}'.format(completed, service_name),
                                                  ssh_client.ip)
            except Exception as exc:
                UpdateController._log_message('Something went wrong {0} service {1}: {2}'.format(description, service_name, exc),
                                              ssh_client.ip,
                                              severity='warning')
                if action == 'stop':
                    return False
    return True
def _change_services_state(services, ssh_clients, action):
    """
    Stop/start services on SSH clients
    If stopping a service fails, the operation is aborted immediately. For every other action the error is
    logged as a warning and the remaining services / clients are still processed
    :param services: Names of the services, in the order in which they get stopped (any iterable)
    :param ssh_clients: Clients on which every service must change state
    :param action: State to apply: 'stop' or 'start'; any other value is passed on unchanged and is
                   reported as a restart in the log messages
    :return: False as soon as stopping a service failed, True otherwise
    :rtype: bool
    """
    # Never reverse the list which was passed in: the caller might reuse it for a next stop/start cycle
    services = list(services)
    if action == 'start':
        services.reverse()  # Start services again in reverse order of stopping

    if action == 'stop':
        description, completed = 'stopping', 'Stopped'
    elif action == 'start':
        description, completed = 'starting', 'Started'
    else:
        description, completed = 'restarting', 'Restarted'

    for service_name in services:
        for ssh_client in ssh_clients:
            try:
                if ServiceManager.has_service(service_name, client=ssh_client):
                    UpdateController._log_message('{0} service {1}'.format(description.capitalize(), service_name), ssh_client.ip)
                    Toolbox.change_service_state(client=ssh_client, name=service_name, state=action, logger=UpdateController._logger)
                    UpdateController._log_message('{0} service {1}'.format(completed, service_name), ssh_client.ip)
            except Exception as exc:
                UpdateController._log_message('Something went wrong {0} service {1}: {2}'.format(description, service_name, exc), ssh_client.ip, severity='warning')
                if action == 'stop':
                    return False
    return True
def merge_package_information():
    """
    Collect the package information of all StorageRouters and merge in whatever the
    'update' / 'merge_package_info' hooks report
    For an IP which is already known, the hook output is merged into the existing entry (dict update);
    for a new IP the hook output is stored as-is
    :return: Package information, keyed by IP
    :rtype: dict
    """
    merged = {}
    for storagerouter in StorageRouterList.get_storagerouters():
        merged[storagerouter.ip] = storagerouter.package_information

    for hook in Toolbox.fetch_hooks('update', 'merge_package_info'):
        hook_output = hook()
        for ip in hook_output:
            if ip not in merged:
                merged[ip] = hook_output[ip]
                continue
            merged[ip].update(hook_output[ip])
    return merged
def retrieve_storagerouter_info_via_host(ip, password):
    """
    Load the StorageRouters from the model through a remote host
    Any failure is logged silently and whatever was collected up to that point is returned
    :param ip: IP of the host through which the model is queried
    :param password: Password used to set up the remote connection
    :return: IP and lower-cased node type per StorageRouter name
    :rtype: dict
    """
    info_per_name = {}
    try:
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        with remote(ip_info=ip,
                    modules=[StorageRouterList],
                    username='******',
                    password=password,
                    strict_host_key_checking=False) as rem:
            for storagerouter in rem.StorageRouterList.get_storagerouters():
                info_per_name[storagerouter.name] = {'ip': storagerouter.ip,
                                                     'type': storagerouter.node_type.lower()}
    except Exception as ex:
        message = 'Error loading storagerouters: {0}'.format(ex)
        Toolbox.log(logger=NodeTypeController._logger,
                    messages=message,
                    loglevel='exception',
                    silent=True)
    return info_per_name
def install_plugins():
    """
    (Re)load plugins
    Nothing happens when the watcher service is not known on the local node: in that case 'ovs setup' was
    not executed yet and the plugin gets loaded once it is. Otherwise the watchers are stopped, Memcached is
    restarted on the masters, the watchers are started again, the model is migrated and the post install
    hooks are executed
    :return: None
    """
    manager = ServiceFactory.get_manager()
    if not manager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
        return

    print('Installing plugin into Open vStorage')
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    clients = {}
    masters = StorageRouterList.get_masters()
    slaves = StorageRouterList.get_slaves()
    try:
        for sr in masters + slaves:
            clients[sr] = SSHClient(sr, username='******')
    except UnableToConnectException:
        raise RuntimeError('Not all StorageRouters are reachable')

    memcached = 'memcached'
    watcher = 'watcher-framework'
    all_storagerouters = masters + slaves

    for sr in all_storagerouters:
        if not manager.has_service(watcher, clients[sr]):
            continue
        print('- Stopping watcher on {0} ({1})'.format(sr.name, sr.ip))
        manager.stop_service(watcher, clients[sr])

    for sr in masters:
        print('- Restarting memcached on {0} ({1})'.format(sr.name, sr.ip))
        manager.restart_service(memcached, clients[sr])

    for sr in all_storagerouters:
        if not manager.has_service(watcher, clients[sr]):
            continue
        print('- Starting watcher on {0} ({1})'.format(sr.name, sr.ip))
        manager.start_service(watcher, clients[sr])

    print('- Execute model migrations')
    from ovs.dal.helpers import Migration
    Migration.migrate()

    from ovs.lib.helpers.toolbox import Toolbox
    ip = System.get_my_storagerouter().ip
    hooks = Toolbox.fetch_hooks('plugin', 'postinstall')
    if len(hooks) > 0:
        print('- Execute post installation scripts')
    for hook in hooks:
        hook(ip=ip)
    print('Installing plugin into Open vStorage: Completed')
def install_plugins():
    """
    (Re)load plugins
    Nothing happens when the watcher service is not known on the local node: in that case 'ovs setup' was
    not executed yet and the plugin gets loaded once it is. Otherwise the watcher and Memcached are stopped
    and started again on every StorageRouter, the model is migrated and the post install hooks are executed
    :return: None
    """
    def _switch(change_state, service_name, client, expected_status, failure):
        # Request the state change and poll the status once per second, giving up after 30 attempts
        change_state(service_name, client=client)
        for _ in range(30):
            if ServiceManager.get_service_status(service_name, client=client) is expected_status:
                return
            time.sleep(1)
        raise RuntimeError(failure.format(service_name))

    if not ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
        return

    from ovs.dal.lists.storagerouterlist import StorageRouterList
    clients = []
    try:
        for storagerouter in StorageRouterList.get_storagerouters():
            clients.append(SSHClient(storagerouter, username='******'))
    except UnableToConnectException:
        raise RuntimeError('Not all StorageRouters are reachable')

    for client in clients:
        for service_name in ['watcher-framework', 'memcached']:
            _switch(ServiceManager.stop_service, service_name, client, False, 'Could not stop service: {0}')

    # Services are started again in the reverse order
    for client in clients:
        for service_name in ['memcached', 'watcher-framework']:
            _switch(ServiceManager.start_service, service_name, client, True, 'Could not start service: {0}')

    from ovs.dal.helpers import Migration
    Migration.migrate()

    from ovs.lib.helpers.toolbox import Toolbox
    ip = System.get_my_storagerouter().ip
    for hook in Toolbox.fetch_hooks('plugin', 'postinstall'):
        hook(ip=ip)
def restart_framework_and_memcache_services(clients, logger, offline_node_ips=None):
    """
    Restart the framework services and Memcached
    The watcher is stopped on every online node, Memcached is restarted on the online masters and
    afterwards the watcher is started again and the support agent is restarted where these services exist.
    The volatile store is reset at the end
    :param clients: Clients on which to restart these services, keyed by IP
    :type clients: dict
    :param logger: Logger object used for logging
    :type logger: ovs.log.log_handler.LogHandler
    :param offline_node_ips: IP addresses of offline nodes in the cluster
    :type offline_node_ips: list
    :return: None
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList

    service_manager = ServiceFactory.get_manager()
    master_ips = [sr.ip for sr in StorageRouterList.get_masters()]
    slave_ips = [sr.ip for sr in StorageRouterList.get_slaves()]
    if offline_node_ips is None:
        offline_node_ips = []

    online_master_ips = [ip for ip in master_ips if ip not in offline_node_ips]
    online_ips = [ip for ip in master_ips + slave_ips if ip not in offline_node_ips]

    memcached = 'memcached'
    watcher = 'watcher-framework'
    support_agent = 'support-agent'

    for ip in online_ips:
        if service_manager.has_service(watcher, clients[ip]):
            Toolbox.change_service_state(clients[ip], watcher, 'stop', logger)

    for ip in online_master_ips:
        Toolbox.change_service_state(clients[ip], memcached, 'restart', logger)

    for ip in online_ips:
        if service_manager.has_service(watcher, clients[ip]):
            Toolbox.change_service_state(clients[ip], watcher, 'start', logger)
        if service_manager.has_service(support_agent, clients[ip]):
            Toolbox.change_service_state(clients[ip], support_agent, 'restart', logger)

    VolatileFactory.store = None
def get_update_information_all():
    """
    Retrieve the update information for all StorageRouters
    This contains information about
        - downtime of model, GUI, vPools, proxies, ...
        - services that will be restarted
        - packages that will be updated
        - prerequisites that have not been met

    Every 'update' / 'information' hook fills the same dict; components which end up without any
    package to update are dropped from the result
    :return: Information about the update
    :rtype: dict
    """
    information = {}
    for function in Toolbox.fetch_hooks('update', 'information'):
        function(information=information)

    # Iterate over a snapshot of the keys: entries get removed while looping, and copying the keys is
    # sufficient for that (no need to deep-copy everything the hooks collected)
    for component in list(information):
        if len(information[component]['packages']) == 0:
            information.pop(component)
    return information
def install_plugins():
    """
    (Re)load plugins
    When the watcher service is known on the local node, 'ovs setup' was executed and everything needs a
    restart to load the plugin: watchers are stopped, Memcached is restarted on the masters, watchers are
    started again, the model is migrated and the post install hooks are executed.
    In the other case nothing is done, the plugin will be loaded once 'ovs setup' is executed
    :return: None
    """
    if not ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
        return

    print('Installing plugin into Open vStorage')
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    clients = {}
    masters = StorageRouterList.get_masters()
    slaves = StorageRouterList.get_slaves()
    try:
        for storagerouter in masters + slaves:
            clients[storagerouter] = SSHClient(storagerouter, username='******')
    except UnableToConnectException:
        raise RuntimeError('Not all StorageRouters are reachable')

    memcached = 'memcached'
    watcher = 'watcher-framework'
    every_node = masters + slaves

    for storagerouter in every_node:
        if not ServiceManager.has_service(watcher, clients[storagerouter]):
            continue
        print('- Stopping watcher on {0} ({1})'.format(storagerouter.name, storagerouter.ip))
        ServiceManager.stop_service(watcher, clients[storagerouter])

    for storagerouter in masters:
        print('- Restarting memcached on {0} ({1})'.format(storagerouter.name, storagerouter.ip))
        ServiceManager.restart_service(memcached, clients[storagerouter])

    for storagerouter in every_node:
        if not ServiceManager.has_service(watcher, clients[storagerouter]):
            continue
        print('- Starting watcher on {0} ({1})'.format(storagerouter.name, storagerouter.ip))
        ServiceManager.start_service(watcher, clients[storagerouter])

    print('- Execute model migrations')
    from ovs.dal.helpers import Migration
    Migration.migrate()

    from ovs.lib.helpers.toolbox import Toolbox
    ip = System.get_my_storagerouter().ip
    post_install_hooks = Toolbox.fetch_hooks('plugin', 'postinstall')
    if len(post_install_hooks) > 0:
        print('- Execute post installation scripts')
    for post_install in post_install_hooks:
        post_install(ip=ip)
    print('Installing plugin into Open vStorage: Completed')
def remove_services(client, node_type, logger):
    """
    Stop all services managed by OVS on a node and unregister them
    RabbitMQ and Memcached are stop-only: they are stopped, but their service definition is kept
    :param client: Client on which to remove the services
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param node_type: Type of node, can be 'master' or 'extra'
    :type node_type: str
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: None
    """
    Toolbox.log(logger=logger, messages='Removing services')

    service_manager = ServiceFactory.get_manager()
    stop_only = ('rabbitmq-server', 'memcached')
    to_process = ['workers', 'support-agent', 'watcher-framework']
    if node_type == 'master':
        to_process.extend(['scheduled-tasks', 'webapp-api', 'volumerouter-consumer'])
        # Only touch RabbitMQ / Memcached when they are managed by OVS itself
        for managed_name, service_name in (('rabbitmq', 'rabbitmq-server'), ('memcached', 'memcached')):
            if Toolbox.is_service_internally_managed(service=managed_name) is True:
                to_process.append(service_name)

    for service_name in to_process:
        if not service_manager.has_service(service_name, client=client):
            continue
        must_remove = service_name not in stop_only
        verb = 'Removing' if must_remove else 'Stopping'
        Toolbox.log(logger=logger, messages='{0} service {1}'.format(verb, service_name))
        service_manager.stop_service(service_name, client=client)
        if must_remove:
            service_manager.remove_service(service_name, client=client)
def dtl_checkup(vpool_guid=None, vdisk_guid=None, storagerouters_to_exclude=None):
    """
    Check DTL for all volumes
    Without arguments every vDisk in the model is verified; with a vPool guid only the vDisks of that
    vPool, with a vDisk guid only that vDisk. For every vDisk the current DTL configuration is compared
    with the Storage Routers which can currently act as DTL target, and the vDisk is reconfigured
    (manual DTL configuration) when required
    :param vpool_guid: vPool to check the DTL configuration of all its disks
    :type vpool_guid: String
    :param vdisk_guid: Virtual Disk to check its DTL configuration
    :type vdisk_guid: String
    :param storagerouters_to_exclude: Storage Routers to exclude from possible targets
    :type storagerouters_to_exclude: List
    :raises ValueError: When both a vPool and a vDisk guid are passed, or when the selected DTL target has
                        no Storage Driver for the vPool
    :raises Exception: When the DTL configuration of at least 1 vDisk could not be retrieved
    :return: None
    """
    if vpool_guid is not None and vdisk_guid is not None:
        raise ValueError('vpool and vdisk are mutually exclusive')
    if storagerouters_to_exclude is None:
        storagerouters_to_exclude = []

    from ovs.lib.vpool import VPoolController

    logger.info('DTL checkup started')
    required_params = {'dtl_mode': (str, StorageDriverClient.VPOOL_DTL_MODE_MAP.keys()),
                       'dtl_enabled': (bool, None)}
    vdisk = VDisk(vdisk_guid) if vdisk_guid else None
    vpool = VPool(vpool_guid) if vpool_guid else None
    errors_found = False
    # SSH clients are reused over all vDisks, vPool configurations are fetched and validated once per vPool
    root_client_map = {}
    vpool_dtl_config_cache = {}
    vdisks = VDiskList.get_vdisks() if vdisk is None and vpool is None else vpool.vdisks if vpool is not None else [vdisk]
    for vdisk in vdisks:
        logger.info(' Verifying vDisk {0} with guid {1}'.format(vdisk.name, vdisk.guid))
        vdisk.invalidate_dynamics(['storagedriver_client', 'storagerouter_guid'])
        if vdisk.storagedriver_client is None:
            continue

        vpool = vdisk.vpool
        if vpool.guid not in vpool_dtl_config_cache:
            vpool_config = VPoolController.get_configuration(vpool.guid)  # Config on vPool is permanent for DTL settings
            vpool_dtl_config_cache[vpool.guid] = vpool_config
            Toolbox.verify_required_params(required_params, vpool_config)

        volume_id = str(vdisk.volume_id)
        vpool_config = vpool_dtl_config_cache[vpool.guid]
        dtl_vpool_enabled = vpool_config['dtl_enabled']
        try:
            current_dtl_config = vdisk.storagedriver_client.get_dtl_config(volume_id)
            current_dtl_config_mode = vdisk.storagedriver_client.get_dtl_config_mode(volume_id)
        except RuntimeError as rte:
            # Can occur when a volume has not been stolen yet from a dead node
            # The failure is remembered and reported after all other vDisks have been processed
            logger.error('Retrieving DTL configuration from storage driver failed with error: {0}'.format(rte))
            errors_found = True
            continue

        # No DTL target configured ('null' host) and DTL disabled on the vPool: enforce 'no DTL' manually
        if dtl_vpool_enabled is False and (current_dtl_config is None or current_dtl_config.host == 'null'):
            logger.info(' DTL is globally disabled for vPool {0} with guid {1}'.format(vpool.name, vpool.guid))
            vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
            continue
        # DTL was manually disabled for this vDisk: leave it untouched
        elif current_dtl_config_mode == DTLConfigMode.MANUAL and (current_dtl_config is None or current_dtl_config.host == 'null'):
            logger.info(' DTL is disabled for virtual disk {0} with guid {1}'.format(vdisk.name, vdisk.guid))
            continue

        storage_router = StorageRouter(vdisk.storagerouter_guid)
        available_storagerouters = []
        # 1. Check available storage routers in the backup failure domain
        if storage_router.secondary_failure_domain is not None:
            for storagerouter in storage_router.secondary_failure_domain.primary_storagerouters:
                if vpool.guid not in storagerouter.vpools_guids:
                    continue
                if storagerouter not in root_client_map:
                    try:
                        root_client = SSHClient(storagerouter, username='******')
                    except UnableToConnectException:
                        logger.warning(' Storage Router with IP {0} of vDisk {1} is not reachable'.format(storagerouter.ip, vdisk.name))
                        continue
                    root_client_map[storagerouter] = root_client
                else:
                    root_client = root_client_map[storagerouter]
                # Only Storage Routers on which the DTL service of this vPool reports status True qualify
                if ServiceManager.get_service_status('dtl_{0}'.format(vpool.name), client=root_client) is True:
                    available_storagerouters.append(storagerouter)
        # 2. Check available storage routers in the same failure domain as current storage router
        if len(available_storagerouters) == 0:
            for storagerouter in storage_router.primary_failure_domain.primary_storagerouters:
                # The Storage Router serving the vDisk itself is never a target
                if vpool.guid not in storagerouter.vpools_guids or storagerouter == storage_router:
                    continue
                if storagerouter not in root_client_map:
                    try:
                        root_client = SSHClient(storagerouter, username='******')
                    except UnableToConnectException:
                        logger.warning(' Storage Router with IP {0} of vDisk {1} is not reachable'.format(storagerouter.ip, vdisk.name))
                        continue
                    root_client_map[storagerouter] = root_client
                else:
                    root_client = root_client_map[storagerouter]
                if ServiceManager.get_service_status('dtl_{0}'.format(vpool.name), client=root_client) is True:
                    available_storagerouters.append(storagerouter)

        # Remove storage routers to exclude
        for sr_guid in storagerouters_to_exclude:
            sr_to_exclude = StorageRouter(sr_guid)
            if sr_to_exclude in available_storagerouters:
                available_storagerouters.remove(sr_to_exclude)

        if len(available_storagerouters) == 0:
            logger.info(' No Storage Routers could be found as valid DTL target')
            vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
            continue

        # Check whether reconfiguration is required
        reconfigure_required = False
        if current_dtl_config is None:
            logger.info(' No DTL configuration found, but there are Storage Routers available')
            reconfigure_required = True
        elif current_dtl_config_mode == DTLConfigMode.AUTOMATIC:
            logger.info(' DTL configuration set to AUTOMATIC, switching to manual')
            reconfigure_required = True
        else:
            dtl_host = current_dtl_config.host
            dtl_port = current_dtl_config.port
            storage_drivers = [sd for sd in vpool.storagedrivers if sd.storagerouter.ip == dtl_host]

            logger.info(' DTL host: {0}'.format(dtl_host or '-'))
            logger.info(' DTL port: {0}'.format(dtl_port or '-'))
            if dtl_host not in [sr.ip for sr in available_storagerouters]:
                logger.info(' Host not in available Storage Routers')
                reconfigure_required = True
            # The configured port is compared with ports[2] of the Storage Driver on the configured host
            elif dtl_port != storage_drivers[0].ports[2]:
                logger.info(' Configured port does not match expected port ({0} vs {1})'.format(dtl_port, storage_drivers[0].ports[2]))
                reconfigure_required = True

        # Perform the reconfiguration
        if reconfigure_required is True:
            logger.info(' Reconfigure required')
            # The new target is picked at random among the available Storage Routers
            index = random.randint(0, len(available_storagerouters) - 1)
            dtl_target = available_storagerouters[index]
            storage_drivers = [sd for sd in vpool.storagedrivers if sd.storagerouter == dtl_target]
            if len(storage_drivers) == 0:
                raise ValueError('Could not retrieve related storagedriver')

            port = storage_drivers[0].ports[2]
            vpool_dtl_mode = vpool_config.get('dtl_mode', StorageDriverClient.FRAMEWORK_DTL_ASYNC)
            logger.info(' DTL config that will be set --> Host: {0}, Port: {1}, Mode: {2}'.format(dtl_target.ip, port, vpool_dtl_mode))
            dtl_config = DTLConfig(str(dtl_target.ip), port, StorageDriverClient.VDISK_DTL_MODE_MAP[vpool_dtl_mode])
            vdisk.storagedriver_client.set_manual_dtl_config(volume_id, dtl_config)
    if errors_found is True:
        logger.error('DTL checkup ended with errors')
        raise Exception('DTL checkup failed with errors. Please check /var/log/ovs/lib.log for more information')
    logger.info('DTL checkup ended')
def set_config_params(vdisk_guid, new_config_params):
    """
    Sets configuration parameters for a given vdisk.
    The new parameters are validated, compared with the currently applied configuration and only the
    differences are pushed to the storagedriver: first the SCO size, then the DTL settings, then the rest.
    Failures are logged and collected; one exception is raised at the end when anything failed
    :param vdisk_guid: Guid of the virtual disk to set the configuration parameters for
    :param new_config_params: New configuration parameters
    :raises Exception: When a DTL mode other than 'no_sync' is requested without a target, or when at
                       least 1 property could not be updated
    :return: None
    """
    required_params = {'dtl_mode': (str, StorageDriverClient.VDISK_DTL_MODE_MAP.keys()),
                       'sco_size': (int, StorageDriverClient.TLOG_MULTIPLIER_MAP.keys()),
                       'dedupe_mode': (str, StorageDriverClient.VDISK_DEDUPE_MAP.keys()),
                       'write_buffer': (int, {'min': 128, 'max': 10 * 1024}),
                       'cache_strategy': (str, StorageDriverClient.VDISK_CACHE_MAP.keys()),
                       'readcache_limit': (int, {'min': 1, 'max': 10 * 1024}, False)}

    # The DTL target is only validated (as an IP) when one is passed
    if new_config_params.get('dtl_target') is not None:
        required_params.update({'dtl_target': (str, Toolbox.regex_ip)})

    Toolbox.verify_required_params(required_params, new_config_params)

    if new_config_params['dtl_mode'] != 'no_sync' and new_config_params.get('dtl_target') is None:
        raise Exception('If DTL mode is Asynchronous or Synchronous, a target IP should always be specified')

    errors = False
    vdisk = VDisk(vdisk_guid)
    volume_id = str(vdisk.volume_id)
    old_config_params = VDiskController.get_config_params(vdisk.guid)

    # 1st update SCO size, because this impacts TLOG multiplier which on its turn impacts write buffer
    new_sco_size = new_config_params['sco_size']
    old_sco_size = old_config_params['sco_size']
    if new_sco_size != old_sco_size:
        write_buffer = float(new_config_params['write_buffer'])
        tlog_multiplier = StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
        sco_factor = write_buffer / tlog_multiplier / new_sco_size
        try:
            logger.info('Updating property sco_size on vDisk {0} to {1}'.format(vdisk_guid, new_sco_size))
            vdisk.storagedriver_client.set_sco_multiplier(volume_id, new_sco_size / 4 * 1024)
            vdisk.storagedriver_client.set_tlog_multiplier(volume_id, tlog_multiplier)
            vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
            logger.info('Updated property sco_size')
        except Exception as ex:
            logger.error('Error updating "sco_size": {0}'.format(ex))
            errors = True

    # 2nd Check for DTL changes
    new_dtl_mode = new_config_params['dtl_mode']
    old_dtl_mode = old_config_params['dtl_mode']
    new_dtl_target = new_config_params.get('dtl_target')
    old_dtl_target = old_config_params['dtl_target']
    if old_dtl_mode != new_dtl_mode or new_dtl_target != old_dtl_target:
        if old_dtl_mode != new_dtl_mode and new_dtl_mode == 'no_sync':
            logger.info('Disabling DTL for vDisk {0}'.format(vdisk_guid))
            vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
        elif (new_dtl_target is not None and new_dtl_target != old_dtl_target or old_dtl_mode != new_dtl_mode) and new_dtl_mode != 'no_sync':
            logger.info('Changing DTL to use global values for vDisk {0}'.format(vdisk_guid))
            sr_target = StorageRouterList.get_by_ip(new_dtl_target)
            if sr_target is None:
                # Without Storage Router there are no Storage Drivers to inspect: record the failure and
                # carry on with the remaining properties instead of dereferencing None
                logger.error('Failed to retrieve Storage Router with IP {0}'.format(new_dtl_target))
                errors = True
            else:
                for sd in sr_target.storagedrivers:
                    if sd.vpool == vdisk.vpool:
                        dtl_config = DTLConfig(str(new_dtl_target), sd.ports[2], StorageDriverClient.VDISK_DTL_MODE_MAP[new_dtl_mode])
                        vdisk.storagedriver_client.set_manual_dtl_config(volume_id, dtl_config)
                        break
                else:
                    # The target Storage Router does not serve the vPool of this vDisk
                    logger.error('Failed to retrieve Storage Driver with IP {0}'.format(new_dtl_target))
                    errors = True

    # 3rd update rest
    for key in required_params:
        try:
            if key in ['sco_size', 'dtl_mode', 'dtl_target']:
                continue

            new_value = new_config_params[key]
            old_value = old_config_params[key]
            if new_value != old_value:
                logger.info('Updating property {0} on vDisk {1} from {2} to {3}'.format(key, vdisk_guid, old_value, new_value))
                if key == 'dedupe_mode':
                    vdisk.storagedriver_client.set_readcache_mode(volume_id, StorageDriverClient.VDISK_DEDUPE_MAP[new_value])
                elif key == 'write_buffer':
                    # Fall back on the multiplier belonging to the SCO size when the volume reports none
                    tlog_multiplier = vdisk.storagedriver_client.get_tlog_multiplier(volume_id) or StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
                    sco_factor = float(new_value) / tlog_multiplier / new_sco_size
                    vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
                elif key == 'cache_strategy':
                    vdisk.storagedriver_client.set_readcache_behaviour(volume_id, StorageDriverClient.VDISK_CACHE_MAP[new_value])
                elif key == 'readcache_limit':
                    vol_info = vdisk.storagedriver_client.info_volume(volume_id)
                    block_size = vol_info.lba_size * vol_info.cluster_multiplier or 4096
                    # The limit is passed as a number of blocks (presumably new_value is expressed in GiB),
                    # a falsy value removes the limit
                    limit = new_value * 1024 * 1024 * 1024 / block_size if new_value else None
                    vdisk.storagedriver_client.set_readcache_limit(volume_id, limit)
                else:
                    raise KeyError('Unsupported property provided: "{0}"'.format(key))
                logger.info('Updated property {0}'.format(key))
        except Exception as ex:
            logger.error('Error updating "{0}": {1}'.format(key, ex))
            errors = True
    if errors is True:
        raise Exception('Failed to update the values for vDisk {0}'.format(vdisk.name))
def set_config_params(vdisk_guid, new_config_params, old_config_params):
    """
    Sets configuration parameters for a given vdisk.
    Both the new and the old parameters are validated against the same set of required parameters. Only
    values which differ are pushed to the storagedriver: the SCO size first, the other properties
    afterwards (keys starting with 'dtl' are skipped). Failures are logged and collected
    :param vdisk_guid: Guid of the virtual disk to configure
    :param new_config_params: Configuration parameters to apply
    :param old_config_params: Configuration parameters currently applied
    :raises Exception: When at least 1 property could not be updated
    :return: None
    """
    def _push(prop, value):
        # Forward a single changed property to the storagedriver
        if prop == 'cache_strategy':
            vdisk.storagedriver_client.set_readcache_behaviour(volume_id, StorageDriverClient.VDISK_CACHE_MAP[value])
        elif prop == 'dedupe_mode':
            vdisk.storagedriver_client.set_readcache_mode(volume_id, StorageDriverClient.VDISK_DEDUPE_MAP[value])
        elif prop == 'write_buffer':
            multiplier = vdisk.storagedriver_client.get_tlog_multiplier(volume_id) or StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
            factor = float(value) / multiplier / new_sco_size
            vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, factor)
        elif prop == 'readcache_limit':
            volume_info = vdisk.storagedriver_client.info_volume(volume_id)
            block_size = volume_info.lba_size * volume_info.cluster_multiplier or 4096
            limit = value * 1024 * 1024 * 1024 / block_size if value else None
            vdisk.storagedriver_client.set_readcache_limit(volume_id, limit)
        else:
            raise KeyError('Unsupported property provided: "{0}"'.format(prop))

    required_params = {  # 'dtl_mode': (str, StorageDriverClient.VDISK_DTL_MODE_MAP.keys()),
                       'sco_size': (int, StorageDriverClient.TLOG_MULTIPLIER_MAP.keys()),
                       'dedupe_mode': (str, StorageDriverClient.VDISK_DEDUPE_MAP.keys()),
                       'dtl_enabled': (bool, None),
                       # 'dtl_location': (str, None),
                       'write_buffer': (int, {'min': 128, 'max': 10 * 1024}),
                       'cache_strategy': (str, StorageDriverClient.VDISK_CACHE_MAP.keys()),
                       'readcache_limit': (int, {'min': 1, 'max': 10 * 1024}, False)}

    Toolbox.verify_required_params(required_params, new_config_params)
    Toolbox.verify_required_params(required_params, old_config_params)

    failed = False
    vdisk = VDisk(vdisk_guid)
    volume_id = str(vdisk.volume_id)
    old_sco_size = old_config_params['sco_size']
    new_sco_size = new_config_params['sco_size']

    # The SCO size goes first: it determines the TLOG multiplier, which in turn impacts the write buffer
    if new_sco_size != old_sco_size:
        write_buffer = float(new_config_params['write_buffer'])
        tlog_multiplier = StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
        sco_factor = write_buffer / tlog_multiplier / new_sco_size
        try:
            logger.info('Updating property sco_size on vDisk {0} from {1} to {2}'.format(vdisk_guid, old_sco_size, new_sco_size))
            vdisk.storagedriver_client.set_sco_multiplier(volume_id, new_sco_size / 4 * 1024)
            vdisk.storagedriver_client.set_tlog_multiplier(volume_id, tlog_multiplier)
            vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
            logger.info('Updated property sco_size')
        except Exception as ex:
            logger.error('Error updating "sco_size": {0}'.format(ex))
            failed = True

    # All remaining properties
    for key, old_value in old_config_params.iteritems():
        if key.startswith('dtl') or key == 'sco_size':
            continue
        new_value = new_config_params[key]
        if new_value != old_value:
            try:
                logger.info('Updating property {0} on vDisk {1} from {2} to {3}'.format(key, vdisk_guid, old_value, new_value))
                _push(key, new_value)
                logger.info('Updated property {0}'.format(key))
            except Exception as ex:
                logger.error('Error updating "{0}": {1}'.format(key, ex))
                failed = True

    if failed is True:
        raise Exception('Failed to update the values for vDisk {0}'.format(vdisk.name))
def promote_or_demote_node(node_action, cluster_ip=None, execute_rollback=False):
    """
    Promotes or demotes the local node
    When demoting with a cluster IP, the node with that IP is expected to be unreachable and is demoted
    through another responsive master. In all other cases the local node (127.0.0.1) is the target.
    Any error raised after the initial argument check is logged and terminates the process with exit code 1
    :param node_action: Demote or promote
    :type node_action: str
    :param cluster_ip: IP of node to promote or demote
    :type cluster_ip: str
    :param execute_rollback: In case of failure revert the changes made
    :type execute_rollback: bool
    :raises ValueError: When node_action is neither 'promote' nor 'demote'
    :return: None
    """
    if node_action not in ('promote', 'demote'):
        raise ValueError('Nodes can only be promoted or demoted')

    Toolbox.log(logger=NodeTypeController._logger, messages='Open vStorage Setup - {0}'.format(node_action.capitalize()), boxed=True)
    try:
        Toolbox.log(logger=NodeTypeController._logger, messages='Collecting information', title=True)

        machine_id = System.get_my_machine_id()
        if Configuration.get('/ovs/framework/hosts/{0}/setupcompleted'.format(machine_id)) is False:
            raise RuntimeError('No local OVS setup found.')

        if cluster_ip and not re.match(Toolbox.regex_ip, cluster_ip):
            raise RuntimeError('Incorrect IP provided ({0})'.format(cluster_ip))

        # With a cluster IP, the node type is looked up for the machine behind that IP instead of the local one
        if cluster_ip:
            client = SSHClient(endpoint=cluster_ip)
            machine_id = System.get_my_machine_id(client)

        node_type = Configuration.get('/ovs/framework/hosts/{0}/type'.format(machine_id))
        if node_action == 'promote' and node_type == 'MASTER':
            raise RuntimeError('This node is already master.')
        elif node_action == 'demote' and node_type == 'EXTRA':
            raise RuntimeError('This node should be a master.')
        elif node_type not in ['MASTER', 'EXTRA']:
            raise RuntimeError('This node is not correctly configured.')

        master_ip = None
        offline_nodes = []

        online = True
        target_client = None
        if node_action == 'demote' and cluster_ip:  # Demote an offline node
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.lib.storagedriver import StorageDriverController

            ip = cluster_ip
            unique_id = None
            ip_client_map = {}
            # Connect to every StorageRouter: reachable ones end up in the client map (remembering a master),
            # unreachable ones are collected as offline nodes
            for storage_router in StorageRouterList.get_storagerouters():
                try:
                    client = SSHClient(storage_router.ip, username='******')
                    if storage_router.node_type == 'MASTER':
                        master_ip = storage_router.ip
                    ip_client_map[storage_router.ip] = client
                except UnableToConnectException:
                    if storage_router.ip == cluster_ip:
                        online = False
                        unique_id = storage_router.machine_id
                        StorageDriverController.mark_offline(storagerouter_guid=storage_router.guid)
                    offline_nodes.append(storage_router)
            if online is True:
                raise RuntimeError("If the node is online, please use 'ovs setup demote' executed on the node you wish to demote")
            if master_ip is None:
                raise RuntimeError('Failed to retrieve another responsive MASTER node')
        else:
            # Promote / demote the local node
            target_password = Toolbox.ask_validate_password(ip='127.0.0.1', logger=NodeTypeController._logger)
            target_client = SSHClient('127.0.0.1', username='******', password=target_password)

            unique_id = System.get_my_machine_id(target_client)
            ip = Configuration.get('/ovs/framework/hosts/{0}/ip'.format(unique_id))

            storagerouter_info = NodeTypeController.retrieve_storagerouter_info_via_host(ip=target_client.ip, password=target_password)
            node_ips = [sr_info['ip'] for sr_info in storagerouter_info.itervalues()]
            # Another master (different from the node itself) is required for both actions
            master_node_ips = [sr_info['ip'] for sr_info in storagerouter_info.itervalues() if sr_info['type'] == 'master' and sr_info['ip'] != ip]
            if len(master_node_ips) == 0:
                if node_action == 'promote':
                    raise RuntimeError('No master node could be found')
                else:
                    raise RuntimeError('It is not possible to remove the only master')

            master_ip = master_node_ips[0]
            ip_client_map = dict((node_ip, SSHClient(node_ip, username='******')) for node_ip in node_ips)

        if node_action == 'demote':
            # Refuse to demote when an Arakoon cluster flagged 'internal' in its metadata has its only node on this IP
            for cluster_name in Configuration.list('/ovs/arakoon'):
                config = ArakoonClusterConfig(cluster_id=cluster_name)
                arakoon_client = ArakoonInstaller.build_client(config)
                metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY))
                if len(config.nodes) == 1 and config.nodes[0].ip == ip and metadata.get('internal') is True:
                    raise RuntimeError('Demote is not supported when single node Arakoon cluster(s) are present on the node to be demoted.')

        configure_rabbitmq = Toolbox.is_service_internally_managed(service='rabbitmq')
        configure_memcached = Toolbox.is_service_internally_managed(service='memcached')
        if node_action == 'promote':
            try:
                NodeTypeController.promote_node(cluster_ip=ip,
                                                master_ip=master_ip,
                                                ip_client_map=ip_client_map,
                                                unique_id=unique_id,
                                                configure_memcached=configure_memcached,
                                                configure_rabbitmq=configure_rabbitmq)
            except Exception:
                # Either roll back right away, or leave a marker file on the target naming the action to roll back with
                if execute_rollback is True:
                    NodeTypeController.demote_node(cluster_ip=ip,
                                                   master_ip=master_ip,
                                                   ip_client_map=ip_client_map,
                                                   unique_id=unique_id,
                                                   unconfigure_memcached=configure_memcached,
                                                   unconfigure_rabbitmq=configure_rabbitmq,
                                                   offline_nodes=offline_nodes)
                elif target_client is not None:
                    target_client.file_write('/tmp/ovs_rollback', 'demote')
                raise
        else:
            try:
                NodeTypeController.demote_node(cluster_ip=ip,
                                               master_ip=master_ip,
                                               ip_client_map=ip_client_map,
                                               unique_id=unique_id,
                                               unconfigure_memcached=configure_memcached,
                                               unconfigure_rabbitmq=configure_rabbitmq,
                                               offline_nodes=offline_nodes)
            except Exception:
                # Mirror image of the promote failure handling
                if execute_rollback is True:
                    NodeTypeController.promote_node(cluster_ip=ip,
                                                    master_ip=master_ip,
                                                    ip_client_map=ip_client_map,
                                                    unique_id=unique_id,
                                                    configure_memcached=configure_memcached,
                                                    configure_rabbitmq=configure_rabbitmq)
                elif target_client is not None:
                    target_client.file_write('/tmp/ovs_rollback', 'promote')
                raise

        Toolbox.log(logger=NodeTypeController._logger, messages='\n')
        Toolbox.log(logger=NodeTypeController._logger, messages='{0} complete.'.format(node_action.capitalize()), boxed=True)
    except Exception as exception:
        # Every failure is reported and ends the process, nothing is propagated to the caller
        Toolbox.log(logger=NodeTypeController._logger, messages='\n')
        Toolbox.log(logger=NodeTypeController._logger, messages=['An unexpected error occurred:', str(exception)], boxed=True, loglevel='exception')
        sys.exit(1)
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeTypeController._logger, messages='\n')
        Toolbox.log(logger=NodeTypeController._logger, messages='This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.', boxed=True, loglevel='error')
        sys.exit(1)
def validate_vpool_sanity(expected_settings):
    """
    Check if all requirements are met for a healthy vPool
    The settings of the first entry in expected_settings are used as the generic vPool settings
    :param expected_settings: Parameters used to create a vPool, which will be verified (keyed by Storage Router)
    :type expected_settings: dict
    :raises ValueError: When no settings are passed, or when configuration, services, files, directories
                        or partition roles do not match the expectations
    :raises AssertionError: When a modelled vPool / Storage Driver attribute or its etcd configuration is incorrect
    :return: None
    """
    if not isinstance(expected_settings, dict) or len(expected_settings) == 0:
        raise ValueError('Cannot validate vpool when no settings are passed')

    generic_settings = expected_settings.values()[0]
    vpool_name = generic_settings['vpool_name']
    mountpoint = '/mnt/{0}'.format(vpool_name)
    backend_type = generic_settings['type']
    rdma_enabled = generic_settings['config_params']['dtl_transport'] == StorageDriverClient.FRAMEWORK_DTL_TRANSPORT_RSOCKET

    vpool = GeneralVPool.get_vpool_by_name(vpool_name=vpool_name)
    assert vpool is not None, 'Could not find vPool with name {0}'.format(vpool_name)
    vpool_config = GeneralVPool.get_configuration(vpool)

    # Verify some basic vPool attributes
    assert vpool.name == vpool_name, 'Expected name {0} for vPool'.format(vpool_name)
    assert vpool.backend_type.code == backend_type, 'Expected backend type {0}'.format(backend_type)
    assert vpool.status == VPool.STATUSES.RUNNING, 'vPool does not have RUNNING status'
    assert vpool.rdma_enabled == rdma_enabled, 'RDMA enabled setting is incorrect'
    assert set(expected_settings.keys()) == set([sd.storagerouter for sd in vpool.storagedrivers]), "vPool storagerouters don't match the expected Storage Routers"

    # Verify vPool Storage Driver configuration
    # Every matching key is popped from the copy, so whatever is left afterwards is missing in the actual config
    expected_vpool_config = copy.deepcopy(generic_settings['config_params'])
    for key, value in vpool_config.iteritems():
        if key == 'dtl_enabled' or key == 'tlog_multiplier':
            continue
        if key not in expected_vpool_config:
            raise ValueError('Expected settings does not contain key {0}'.format(key))
        if value != expected_vpool_config[key]:
            raise ValueError('vPool does not have expected configuration {0} for key {1}'.format(expected_vpool_config[key], key))
        expected_vpool_config.pop(key)

    if len(expected_vpool_config) > 0:
        raise ValueError('Actual vPool configuration does not contain keys: {0}'.format(', '.join(expected_vpool_config.keys())))

    # Prepare some fields to check
    config = generic_settings['config_params']
    dtl_mode = config['dtl_mode']
    sco_size = config['sco_size']
    dedupe_mode = config['dedupe_mode']
    cluster_size = config['cluster_size']
    write_buffer = config['write_buffer']
    dtl_transport = config['dtl_transport']
    cache_strategy = config['cache_strategy']
    # @TODO: Add more validations for other expected settings (instead of None)
    # A value of None means: the key must be known, but its value is not verified
    expected_config = {'backend_connection_manager': {'backend_interface_retries_on_error': 5,
                                                      'backend_interface_retry_interval_secs': 1,
                                                      'backend_interface_retry_backoff_multiplier': 2.0},
                       'content_addressed_cache': {'clustercache_mount_points': None,
                                                   'read_cache_serialization_path': u'/var/rsp/{0}'.format(vpool.name)},
                       'distributed_lock_store': {'dls_arakoon_cluster_id': None,
                                                  'dls_arakoon_cluster_nodes': None,
                                                  'dls_type': u'Arakoon'},
                       'distributed_transaction_log': {'dtl_path': None,
                                                       'dtl_transport': dtl_transport.upper()},
                       'event_publisher': {'events_amqp_routing_key': u'volumerouter',
                                           'events_amqp_uris': None},
                       'file_driver': {'fd_cache_path': None,
                                       'fd_extent_cache_capacity': u'1024',
                                       'fd_namespace': None},
                       'filesystem': {'fs_dtl_config_mode': u'Automatic',
                                      'fs_dtl_mode': u'{0}'.format(StorageDriverClient.VPOOL_DTL_MODE_MAP[dtl_mode]),
                                      'fs_enable_shm_interface': 1,
                                      'fs_file_event_rules': None,
                                      'fs_metadata_backend_arakoon_cluster_nodes': None,
                                      'fs_metadata_backend_mds_nodes': None,
                                      'fs_metadata_backend_type': u'MDS',
                                      'fs_raw_disk_suffix': None,
                                      'fs_virtual_disk_format': None},
                       'metadata_server': {'mds_nodes': None},
                       'scocache': {'backoff_gap': u'2GB',
                                    'scocache_mount_points': None,
                                    'trigger_gap': u'1GB'},
                       'threadpool_component': {'num_threads': 16},
                       'volume_manager': {'clean_interval': 1,
                                          'default_cluster_size': 1024 * cluster_size,
                                          'dtl_throttle_usecs': 4000,
                                          'metadata_path': None,
                                          'non_disposable_scos_factor': float(write_buffer) / StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size] / sco_size,
                                          'number_of_scos_in_tlog': StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size],
                                          'read_cache_default_behaviour': StorageDriverClient.VPOOL_CACHE_MAP[cache_strategy],
                                          'read_cache_default_mode': StorageDriverClient.VPOOL_DEDUPE_MAP[dedupe_mode],
                                          'tlog_path': None},
                       'volume_registry': {'vregistry_arakoon_cluster_id': u'voldrv',
                                           'vregistry_arakoon_cluster_nodes': None},
                       'volume_router': {'vrouter_backend_sync_timeout_ms': 5000,
                                         'vrouter_file_read_threshold': 1024,
                                         'vrouter_file_write_threshold': 1024,
                                         'vrouter_id': None,
                                         'vrouter_max_workers': 16,
                                         'vrouter_migrate_timeout_ms': 5000,
                                         'vrouter_min_workers': 4,
                                         'vrouter_redirect_timeout_ms': u'5000',
                                         'vrouter_routing_retries': 10,
                                         'vrouter_sco_multiplier': 1024,
                                         'vrouter_volume_read_threshold': 1024,
                                         'vrouter_volume_write_threshold': 1024},
                       'volume_router_cluster': {'vrouter_cluster_id': None}}
    vpool_services = {'all': ['ovs-watcher-volumedriver',
                              'ovs-dtl_{0}'.format(vpool.name),
                              'ovs-volumedriver_{0}'.format(vpool.name),
                              'ovs-volumerouter-consumer'],
                      'extra': [],
                      'master': ['ovs-arakoon-voldrv']}
    # Role -> sub-roles which still have to be found on at least 1 Storage Driver ('None' = partition without sub-role)
    sd_partitions = {'DB': ['MD', 'MDS', 'TLOG'],
                     'READ': ['None'],
                     'WRITE': ['FD', 'DTL', 'SCO'],
                     'SCRUB': ['None']}

    if backend_type == 'alba':
        backend_metadata = {'name': (str, None),
                            'preset': (str, Toolbox.regex_preset),
                            'backend_guid': (str, Toolbox.regex_guid),
                            'arakoon_config': (dict, None),
                            'connection': (dict, {'host': (str, Toolbox.regex_ip, False),
                                                  'port': (int, {'min': 1, 'max': 65535}),
                                                  'client_id': (str, Toolbox.regex_guid),
                                                  'client_secret': (str, None),
                                                  'local': (bool, None)}),
                            'backend_info': (dict, {'policies': (list, None),
                                                    'sco_size': (float, None),
                                                    'frag_size': (float, None),
                                                    'total_size': (float, None),
                                                    'nsm_partition_guids': (list, Toolbox.regex_guid)})}
        required = {'backend': (dict, backend_metadata),
                    'backend_aa': (dict, backend_metadata, False)}
        Toolbox.verify_required_params(required_params=required,
                                       actual_params=vpool.metadata)
        vpool_services['all'].append("ovs-albaproxy_{0}".format(vpool.name))
        sd_partitions['WRITE'].append('FCACHE')
        expected_config['backend_connection_manager'].update({'alba_connection_host': None,
                                                              'alba_connection_port': None,
                                                              'alba_connection_preset': None,
                                                              'alba_connection_timeout': 15,
                                                              'backend_type': u'{0}'.format(vpool.backend_type.code.upper())})
    elif backend_type == 'distributed':
        expected_config['backend_connection_manager'].update({'backend_type': u'LOCAL',
                                                              'local_connection_path': u'{0}'.format(generic_settings['distributed_mountpoint'])})

    assert EtcdConfiguration.exists('/ovs/arakoon/voldrv/config', raw=True), 'Volumedriver arakoon does not exist'

    # Do some verifications for all SDs
    storage_ip = None
    voldrv_config = GeneralArakoon.get_config('voldrv')
    all_files = GeneralVPool.get_related_files(vpool=vpool)
    all_directories = GeneralVPool.get_related_directories(vpool=vpool)

    for storagedriver in vpool.storagedrivers:
        storagerouter = storagedriver.storagerouter
        root_client = SSHClient(storagerouter, username='******')

        assert EtcdConfiguration.exists('/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, storagedriver.storagedriver_id), raw=True), 'vPool config not found in etcd'
        current_config_sections = set([item for item in EtcdConfiguration.list('/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, storagedriver.storagedriver_id))])
        assert not current_config_sections.difference(set(expected_config.keys())), 'New section appeared in the storage driver config in etcd'
        assert not set(expected_config.keys()).difference(current_config_sections), 'Config section expected for storage driver, but not found in etcd'

        for key, values in expected_config.iteritems():
            current_config = EtcdConfiguration.get('/ovs/vpools/{0}/hosts/{1}/config/{2}'.format(vpool.guid, storagedriver.storagedriver_id, key))
            # The actual keys must be a subset of the expected keys
            assert set(current_config.keys()).union(set(values.keys())) == set(values.keys()), 'Not all expected keys match for key "{0}" on Storage Driver {1}'.format(key, storagedriver.name)

            for sub_key, value in current_config.iteritems():
                expected_value = values[sub_key]
                if expected_value is None:
                    continue
                assert value == expected_value, 'Key: {0} - Sub key: {1} - Value: {2} - Expected value: {3}'.format(key, sub_key, value, expected_value)

        # Check services
        if storagerouter.node_type == 'MASTER':
            for service_name in vpool_services['all'] + vpool_services['master']:
                if service_name == 'ovs-arakoon-voldrv' and GeneralStorageDriver.has_role(storagedriver, 'DB') is False:
                    continue
                if ServiceManager.get_service_status(name=service_name, client=root_client) is not True:
                    raise ValueError('Service {0} is not running on node {1}'.format(service_name, storagerouter.ip))
        else:
            for service_name in vpool_services['all'] + vpool_services['extra']:
                if ServiceManager.get_service_status(name=service_name, client=root_client) is not True:
                    raise ValueError('Service {0} is not running on node {1}'.format(service_name, storagerouter.ip))

        # Check arakoon config
        if not voldrv_config.has_section(storagerouter.machine_id):
            raise ValueError('Voldrv arakoon cluster does not have section {0}'.format(storagerouter.machine_id))

        # Basic SD checks
        assert storagedriver.cluster_ip == storagerouter.ip, 'Incorrect cluster IP. Expected: {0} - Actual: {1}'.format(storagerouter.ip, storagedriver.cluster_ip)
        assert storagedriver.mountpoint == '/mnt/{0}'.format(vpool.name), 'Incorrect mountpoint. Expected: {0} - Actual: {1}'.format(mountpoint, storagedriver.mountpoint)
        if storage_ip is not None:
            # All Storage Drivers of the vPool are compared against the storage IP of the previous one
            assert storagedriver.storage_ip == storage_ip, 'Incorrect storage IP. Expected: {0} - Actual: {1}'.format(storage_ip, storagedriver.storage_ip)
        storage_ip = storagedriver.storage_ip

        # Check required directories and files
        if storagerouter.guid not in all_directories:
            raise ValueError('Could not find directory information for Storage Router {0}'.format(storagerouter.ip))
        if storagerouter.guid not in all_files:
            raise ValueError('Could not find file information for Storage Router {0}'.format(storagerouter.ip))

        for directory in all_directories[storagerouter.guid]:
            if root_client.dir_exists(directory) is False:
                raise ValueError('Directory {0} does not exist on Storage Router {1}'.format(directory, storagerouter.ip))
        for file_name in all_files[storagerouter.guid]:
            if root_client.file_exists(file_name) is False:
                raise ValueError('File {0} does not exist on Storage Router {1}'.format(file_name, storagerouter.ip))

        # Tick off every role / sub-role combination that is found
        for partition in storagedriver.partitions:
            if partition.role not in sd_partitions:
                continue
            remaining_sub_roles = sd_partitions[partition.role]
            if partition.sub_role in remaining_sub_roles:
                remaining_sub_roles.remove(partition.sub_role)
            elif partition.sub_role is None and 'None' in remaining_sub_roles:
                # Only remove the marker when still present: the same role without sub-role can occur
                # multiple times (eg: on every Storage Driver) and list.remove raises when the item is gone
                remaining_sub_roles.remove('None')

        # Verify vPool writeable
        if storagerouter.pmachine.hvtype == 'VMWARE':
            GeneralVPool.mount_vpool(vpool=vpool, root_client=root_client)

        vdisk = GeneralVDisk.create_volume(size=10, vpool=vpool, root_client=root_client)
        GeneralVDisk.write_to_volume(vdisk=vdisk, vpool=vpool, root_client=root_client, count=10, bs='1M', input_type='random')
        GeneralVDisk.delete_volume(vdisk=vdisk, vpool=vpool, root_client=root_client)

    # Anything still listed has not been found on any Storage Driver
    for role, sub_roles in sd_partitions.iteritems():
        for sub_role in sub_roles:
            raise ValueError('Not a single Storage Driver found with partition role {0} and sub-role {1}'.format(role, sub_role))
def refresh_package_information():
    """
    Collect the package and update information of every StorageRouter and store it in the model
    Information is gathered through the 'update' hooks, both for the cluster and for the plugins
    :raises Exception: When at least 1 error has been reported for a StorageRouter or for a plugin
    :return: None
    """
    logger = GenericController._logger
    logger.info('Updating package information')

    storagerouters = StorageRouterList.get_storagerouters()
    storagerouters.sort(key=lambda sr: ExtensionsToolbox.advanced_sort(element=sr.ip, separator='.'))

    clients = {}
    node_down_prerequisites = []
    cluster_packages = {}
    for storagerouter in storagerouters:
        cluster_packages[storagerouter.ip] = {}
        try:
            # The clients are handed over to Threads below, hence cached=False
            clients[storagerouter] = SSHClient(endpoint=storagerouter, username='******', cached=False)
        except (NotAuthenticatedException, UnableToConnectException):
            logger.warning('StorageRouter {0} is inaccessible'.format(storagerouter.ip))
            node_down_prerequisites.append(['node_down', storagerouter.name])
            cluster_packages[storagerouter.ip]['errors'] = ['StorageRouter {0} is inaccessible'.format(storagerouter.name)]

    # Per reachable StorageRouter: installed and candidate versions of the related packages
    # The hooks run in parallel and all write into the same cluster_packages dict
    logger.debug('Retrieving package information for the cluster')
    workers = []
    for client in clients.itervalues():
        for hook in Toolbox.fetch_hooks(component='update', sub_component='get_package_update_info_cluster'):
            worker = Thread(target=hook, args=(client, cluster_packages))
            worker.start()
            workers.append(worker)
    for worker in workers:
        worker.join()

    # Per reachable StorageRouter: downtime / service restart information
    logger.debug('Retrieving update information for the cluster')
    cluster_updates = {}
    for storagerouter, client in clients.iteritems():
        router_packages = cluster_packages[storagerouter.ip]
        cluster_updates[storagerouter.ip] = {'errors': router_packages.get('errors', [])}
        for hook in Toolbox.fetch_hooks(component='update', sub_component='get_update_info_cluster'):
            hook(client, cluster_updates, router_packages)

    # Update information for the plugins (eg: ALBA, iSCSI), again in parallel
    logger.debug('Retrieving package and update information for the plugins')
    workers = []
    update_info_plugin = {}
    for hook in Toolbox.fetch_hooks('update', 'get_update_info_plugin'):
        worker = Thread(target=hook, args=(update_info_plugin,))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    # Unreachable nodes become prerequisites of the framework component on every reachable node
    if node_down_prerequisites:
        framework = PackageFactory.COMP_FWK
        for component_info in cluster_updates.itervalues():
            if framework in component_info:
                component_info[framework]['prerequisites'].extend(node_down_prerequisites)

    # Persist the information per StorageRouter and gather the cluster errors
    errors = set()
    for storagerouter in storagerouters:
        logger.debug('Storing update information for StorageRouter {0}'.format(storagerouter.ip))
        update_info = cluster_updates.get(storagerouter.ip, {})

        # The errors are reported separately and must not end up in the stored information
        sr_errors = update_info.pop('errors', [])
        if len(sr_errors) > 0:
            errors.update('{0}: {1}'.format(storagerouter.ip, error) for error in sr_errors)
            update_info = {}  # Nothing is stored for a StorageRouter which reported an error

        # Drop the components which have no packages to update
        without_updates = [component for component, info in update_info.iteritems() if len(info['packages']) == 0]
        for component in without_updates:
            update_info.pop(component)

        storagerouter.package_information = update_info
        storagerouter.save()

    # Gather the plugin errors
    for ip, plugin_errors in update_info_plugin.iteritems():
        if len(plugin_errors) == 0:
            continue
        errors.update('{0}: {1}'.format(ip, error) for error in plugin_errors)

    if errors:
        raise Exception('\n - {0}'.format('\n - '.join(errors)))
    logger.info('Finished updating package information')
def remove_node(node_ip, silent=None):
    """
    Remove the node with specified IP from the cluster
    The flow consists of 3 phases: validations, confirmations (unless silent) and the actual removal
    Any failure during validation or removal is logged, after which the process exits with code 1
    :param node_ip: IP of the node to remove
    :type node_ip: str
    :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
    :type silent: str
    :return: None
    """
    from ovs.lib.storagedriver import StorageDriverController
    from ovs.lib.storagerouter import StorageRouterController
    from ovs.dal.lists.storagerouterlist import StorageRouterList

    Toolbox.log(logger=NodeRemovalController._logger, messages="Remove node", boxed=True)
    Toolbox.log(
        logger=NodeRemovalController._logger,
        messages="WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n",
    )

    ###############
    # VALIDATIONS #
    ###############
    try:
        # Verify the type before using string methods, so a wrong type results in a meaningful error
        if not isinstance(node_ip, str):
            raise ValueError("Node IP must be a string")
        node_ip = node_ip.strip()
        if not re.match(SSHClient.IP_REGEX, node_ip):
            raise ValueError("Invalid IP {0} specified".format(node_ip))

        storage_router_all = StorageRouterList.get_storagerouters()
        storage_router_masters = StorageRouterList.get_masters()
        storage_router_all_ips = set(storage_router.ip for storage_router in storage_router_all)
        storage_router_master_ips = set(storage_router.ip for storage_router in storage_router_masters)
        storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)

        if node_ip not in storage_router_all_ips:
            raise ValueError(
                "Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}".format(
                    "\n - ".join(storage_router_all_ips), node_ip
                )
            )

        if len(storage_router_all_ips) == 1:
            raise RuntimeError("Removing the only node is not possible")

        if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
            raise RuntimeError("Removing the only master node is not possible")

        if System.get_my_storagerouter() == storage_router_to_remove:
            raise RuntimeError(
                "The node to be removed cannot be identical to the node on which the removal is initiated"
            )

        Toolbox.log(
            logger=NodeRemovalController._logger, messages="Creating SSH connections to remaining master nodes"
        )
        master_ip = None
        ip_client_map = {}
        storage_routers_offline = []
        storage_router_to_remove_online = True
        for storage_router in storage_router_all:
            try:
                client = SSHClient(storage_router, username="******")
                if client.run(["pwd"]):
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages=" Node with IP {0:<15} successfully connected to".format(storage_router.ip),
                    )
                    ip_client_map[storage_router.ip] = client
                    # Remember a reachable master other than the node being removed
                    if storage_router != storage_router_to_remove and storage_router.node_type == "MASTER":
                        master_ip = storage_router.ip
            except UnableToConnectException:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages=" Node with IP {0:<15} is unreachable".format(storage_router.ip),
                )
                storage_routers_offline.append(storage_router)
                if storage_router == storage_router_to_remove:
                    storage_router_to_remove_online = False

        if len(ip_client_map) == 0 or master_ip is None:
            raise RuntimeError("Could not connect to any master node in the cluster")

        storage_router_to_remove.invalidate_dynamics("vdisks_guids")
        # vDisks are supposed to be moved away manually before removing a node
        if len(storage_router_to_remove.vdisks_guids) > 0:
            raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

        internal_memcached = Toolbox.is_service_internally_managed(service="memcached")
        internal_rabbit_mq = Toolbox.is_service_internally_managed(service="rabbitmq")
        memcached_endpoints = Configuration.get(key="/ovs/framework/memcache|endpoints")
        rabbit_mq_endpoints = Configuration.get(key="/ovs/framework/messagequeue|endpoints")
        # Endpoints which would remain once the endpoints of the node to remove are gone
        copy_memcached_endpoints = [
            endpoint for endpoint in memcached_endpoints if not endpoint.startswith(storage_router_to_remove.ip)
        ]
        copy_rabbit_mq_endpoints = [
            endpoint for endpoint in rabbit_mq_endpoints if not endpoint.startswith(storage_router_to_remove.ip)
        ]
        if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
            raise RuntimeError(
                "Removal of provided nodes will result in a complete removal of the memcached service"
            )
        if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
            raise RuntimeError(
                "Removal of provided nodes will result in a complete removal of the messagequeue service"
            )
    except Exception as exception:
        Toolbox.log(
            logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel="exception"
        )
        sys.exit(1)

    #################
    # CONFIRMATIONS #
    #################
    interactive = silent != "--force-yes"
    remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
    if interactive is True:
        proceed = Interactive.ask_yesno(
            message="Are you sure you want to remove node {0}?".format(storage_router_to_remove.name),
            default_value=False,
        )
        if proceed is False:
            Toolbox.log(logger=NodeRemovalController._logger, messages="Abort removal", title=True)
            sys.exit(1)

        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            if ServiceManager.has_service(name="asd-manager", client=client):
                remove_asd_manager = Interactive.ask_yesno(
                    message="Do you also want to remove the ASD manager and related ASDs?", default_value=False
                )

        if remove_asd_manager is True or storage_router_to_remove_online is False:
            # The first hook whose confirmation question is declined cancels the ASD manager removal
            for function in Toolbox.fetch_hooks("setup", "validate_asd_removal"):
                validation_output = function(storage_router_to_remove.ip)
                if validation_output["confirm"] is True:
                    if Interactive.ask_yesno(message=validation_output["question"], default_value=False) is False:
                        remove_asd_manager = False
                        break

    ###########
    # REMOVAL #
    ###########
    try:
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages="Starting removal of node {0} - {1}".format(
                storage_router_to_remove.name, storage_router_to_remove.ip
            ),
        )
        if storage_router_to_remove_online is False:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=" Marking all Storage Drivers served by Storage Router {0} as offline".format(
                    storage_router_to_remove.ip
                ),
            )
            StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

        # Remove vPools
        Toolbox.log(logger=NodeRemovalController._logger, messages=" Removing vPools from node")
        storage_routers_offline_guids = [
            sr.guid for sr in storage_routers_offline if sr.guid != storage_router_to_remove.guid
        ]
        for storage_driver in storage_router_to_remove.storagedrivers:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=" Removing vPool {0} from node".format(storage_driver.vpool.name),
            )
            StorageRouterController.remove_storagedriver(
                storagedriver_guid=storage_driver.guid, offline_storage_router_guids=storage_routers_offline_guids
            )

        # Demote if MASTER
        if storage_router_to_remove.node_type == "MASTER":
            NodeTypeController.demote_node(
                cluster_ip=storage_router_to_remove.ip,
                master_ip=master_ip,
                ip_client_map=ip_client_map,
                unique_id=storage_router_to_remove.machine_id,
                unconfigure_memcached=internal_memcached,
                unconfigure_rabbitmq=internal_rabbit_mq,
                offline_nodes=storage_routers_offline,
            )

        # Stop / remove services
        Toolbox.log(logger=NodeRemovalController._logger, messages="Stopping and removing services")
        config_store = Configuration.get_store()
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            NodeRemovalController.remove_services(
                client=client,
                node_type=storage_router_to_remove.node_type.lower(),
                logger=NodeRemovalController._logger,
            )
            service = "watcher-config"
            if ServiceManager.has_service(service, client=client):
                Toolbox.log(logger=NodeRemovalController._logger, messages="Removing service {0}".format(service))
                ServiceManager.stop_service(service, client=client)
                ServiceManager.remove_service(service, client=client)

            if config_store == "etcd":
                from ovs.extensions.db.etcd.installer import EtcdInstaller

                if Configuration.get(key="/ovs/framework/external_config") is None:
                    Toolbox.log(logger=NodeRemovalController._logger, messages=" Removing Etcd cluster")
                    try:
                        EtcdInstaller.stop("config", client)
                        EtcdInstaller.remove("config", client)
                    except Exception as ex:
                        # Best effort: a failing Etcd cleanup must not abort the removal of the node
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages=["\nFailed to unconfigure Etcd", ex],
                            loglevel="exception",
                        )

                Toolbox.log(logger=NodeRemovalController._logger, messages="Removing Etcd proxy")
                EtcdInstaller.remove_proxy("config", client.ip)

        Toolbox.run_hooks(
            component="noderemoval",
            sub_component="remove",
            logger=NodeRemovalController._logger,
            cluster_ip=storage_router_to_remove.ip,
            complete_removal=remove_asd_manager,
        )

        # Clean up model
        Toolbox.log(logger=NodeRemovalController._logger, messages="Removing node from model")
        for service in storage_router_to_remove.services:
            service.delete()
        for disk in storage_router_to_remove.disks:
            for partition in disk.partitions:
                partition.delete()
            disk.delete()
        for j_domain in storage_router_to_remove.domains:
            j_domain.delete()
        Configuration.delete("/ovs/framework/hosts/{0}".format(storage_router_to_remove.machine_id))

        NodeTypeController.restart_framework_and_memcache_services(
            clients=ip_client_map,
            offline_node_ips=[node.ip for node in storage_routers_offline],
            logger=NodeRemovalController._logger,
        )

        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            if config_store == "arakoon":
                client.file_delete(filenames=[ArakoonConfiguration.CACC_LOCATION])
            client.file_delete(filenames=[Configuration.BOOTSTRAP_CONFIG_LOCATION])
        storage_router_to_remove.delete()
        Toolbox.log(logger=NodeRemovalController._logger, messages="Successfully removed node\n")
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=["An unexpected error occurred:", str(exception)],
            boxed=True,
            loglevel="exception",
        )
        sys.exit(1)
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages="This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.",
            boxed=True,
            loglevel="error",
        )
        sys.exit(1)

    if remove_asd_manager is True:
        Toolbox.log(logger=NodeRemovalController._logger, messages="\nRemoving ASD Manager")
        with remote(storage_router_to_remove.ip, [os]) as rem:
            rem.os.system("asd-manager remove --force-yes")
    Toolbox.log(logger=NodeRemovalController._logger, messages="Remove nodes finished", title=True)
def update_volumedriver():
    """
    Update the volumedriver
    Lock files are created on every node, the services reported by the 'update' / 'metadata' hooks are stopped,
    the reported packages are installed, the 'update' / 'postupgrade' hooks are executed and the services are started again
    When stopping the services or installing a package fails, the lock files are removed, the services are started
    again and the update is aborted
    Errors are logged and not propagated to the caller
    :return: None
    """
    file_mutex = FileMutex('system_update', wait=2)
    upgrade_file = '/etc/ready_for_upgrade'
    upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
    ssh_clients = []
    try:
        file_mutex.acquire()
        UpdateController._log_message('+++ Starting volumedriver update +++')

        from ovs.dal.lists.storagerouterlist import StorageRouterList

        UpdateController._log_message('Generating SSH client connections for each storage router')
        storage_routers = StorageRouterList.get_storagerouters()
        ssh_clients = [SSHClient(storage_router.ip, 'root') for storage_router in storage_routers]
        this_client = [client for client in ssh_clients if client.is_local is True][0]

        # Commence update !!!!!!!
        # 0. Create locks
        UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
        for client in ssh_clients:
            client.run('touch {0}'.format(upgrade_file))  # Prevents manual install or upgrade individual packages
            client.run('touch {0}'.format(upgrade_ongoing_check_file))  # Prevents clicking x times on 'Update' btn

        # 1. Check requirements
        packages_to_update = set()
        all_services_to_restart = []
        for client in ssh_clients:
            for function in Toolbox.fetch_hooks('update', 'metadata'):
                UpdateController._log_message('Executing function {0}'.format(function.__name__),
                                              client_ip=client.ip)
                output = function(client)
                # Only the volumedriver related information is relevant for this update
                for package_info in output.get('volumedriver', []):
                    packages_to_update.update(package_info['packages'])
                    all_services_to_restart += package_info['services']

        # Filter out duplicates keeping the order of services (eg: watcher-framework before memcached)
        services_to_restart = []
        for service in all_services_to_restart:
            if service not in services_to_restart:
                services_to_restart.append(service)

        UpdateController._log_message('Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
        UpdateController._log_message('Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

        # 2. Stop services
        if UpdateController._change_services_state(services=services_to_restart,
                                                   ssh_clients=ssh_clients,
                                                   action='stop') is False:
            UpdateController._log_message('Stopping all services on every node failed, cannot continue',
                                          client_ip=this_client.ip,
                                          severity='warning')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)

            UpdateController._log_message('Attempting to start the services again',
                                          client_ip=this_client.ip)
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')
            UpdateController._log_message('Failed to stop all required services, update aborted',
                                          client_ip=this_client.ip,
                                          severity='error')
            return

        # 3. Update packages
        failed_clients = []
        for client in ssh_clients:
            PackageManager.update(client=client)
            try:
                for package_name in packages_to_update:
                    UpdateController._log_message('Installing {0}'.format(package_name), client.ip)
                    PackageManager.install(package_name=package_name,
                                           client=client,
                                           force=True)
                    UpdateController._log_message('Installed {0}'.format(package_name), client.ip)
                client.file_delete(upgrade_file)
            except subprocess.CalledProcessError as cpe:
                UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output), client.ip, 'error')
                failed_clients.append(client)
                break  # Do not continue with the other nodes once an installation failed

        if failed_clients:
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('Error occurred. Attempting to start all services again',
                                          client_ip=this_client.ip,
                                          severity='error')
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')
            # Both placeholders need a value: {0} = failed nodes, {1} = node holding the log file
            UpdateController._log_message(
                'Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'.format(
                    '\n - '.join([client.ip for client in failed_clients]), this_client.ip),
                this_client.ip,
                'error')
            return

        # 4. Post upgrade actions
        UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
        for client in ssh_clients:
            for function in Toolbox.fetch_hooks('update', 'postupgrade'):
                UpdateController._log_message('Executing action: {0}'.format(function.__name__),
                                              client_ip=client.ip)
                try:
                    function(client)
                except Exception as ex:
                    # A failing post upgrade action is logged, the remaining actions are still executed
                    UpdateController._log_message('Post upgrade action failed with error: {0}'.format(ex),
                                                  client.ip,
                                                  'error')

        # 5. Start services
        UpdateController._log_message('Starting services', client_ip=this_client.ip)
        UpdateController._change_services_state(services=services_to_restart,
                                                ssh_clients=ssh_clients,
                                                action='start')

        UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
        UpdateController._log_message('+++ Finished updating +++')
    except RuntimeError as rte:
        if 'Could not acquire lock' in rte.message:
            # Another update holds the lock, so its lock files must be left untouched
            UpdateController._log_message('Another volumedriver update is currently in progress!')
        else:
            UpdateController._log_message('Error during volumedriver update: {0}'.format(rte), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    except Exception as ex:
        UpdateController._log_message('Error during volumedriver update: {0}'.format(ex), severity='error')
        UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    finally:
        file_mutex.release()
def demote_node(cluster_ip, master_ip, ip_client_map, unique_id, unconfigure_memcached, unconfigure_rabbitmq,
                offline_nodes=None):
    """
    Demote a master node to node type 'EXTRA'

    The storage router identified by unique_id is flagged 'EXTRA' in the model, the node leaves the Arakoon
    cluster(s) it is part of, its memcached / RabbitMQ endpoints are removed from the configuration and the
    master-only services are stopped and removed. Apart from the initial validations, individual steps are
    best-effort: a failure is logged and the demotion continues.

    :param cluster_ip: IP of the node to demote
    :type cluster_ip: str
    :param master_ip: IP of another master node; its client is used to make RabbitMQ forget an offline node
    :type master_ip: str
    :param ip_client_map: Mapping of node IP to the client used to reach that node
    :type ip_client_map: dict
    :param unique_id: Machine ID of the node to demote
    :type unique_id: str
    :param unconfigure_memcached: Whether the memcached endpoint must be removed and memcached stopped
    :type unconfigure_memcached: bool
    :param unconfigure_rabbitmq: Whether the RabbitMQ endpoint must be removed and RabbitMQ unconfigured
    :type unconfigure_rabbitmq: bool
    :param offline_nodes: Storage routers which could not be reached (defaults to an empty list)
    :type offline_nodes: list
    :raises RuntimeError: When not all memcache nodes can be reached or when no other master node would remain
    :return: None
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList

    logger = NodeTypeController._logger
    Toolbox.log(logger=logger, messages='Demoting node', title=True)
    service_manager = ServiceFactory.get_manager()
    if offline_nodes is None:
        offline_nodes = []

    if unconfigure_memcached is True and len(offline_nodes) == 0:
        if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False:
            raise RuntimeError('Not all memcache nodes can be reached which is required for demoting a node.')

    # Find other (arakoon) master nodes
    arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
    arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name)
    config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
    master_node_ips = [node.ip for node in config.nodes]
    shrink = False
    if cluster_ip in master_node_ips:
        shrink = True
        master_node_ips.remove(cluster_ip)
    if len(master_node_ips) == 0:
        raise RuntimeError('There should be at least one other master node')

    storagerouter = StorageRouterList.get_by_machine_id(unique_id)
    storagerouter.node_type = 'EXTRA'
    storagerouter.save()

    offline_node_ips = [node.ip for node in offline_nodes]
    if arakoon_metadata['internal'] is True and shrink is True:
        Toolbox.log(logger=logger, messages='Leaving Arakoon {0} cluster'.format(arakoon_cluster_name))
        arakoon_installer = ArakoonInstaller(cluster_name=arakoon_cluster_name)
        arakoon_installer.load()
        arakoon_installer.shrink_cluster(removal_ip=cluster_ip, offline_nodes=offline_node_ips)
        arakoon_installer.restart_cluster_after_shrinking()

    try:
        external_config = Configuration.get('/ovs/framework/external_config')
        if external_config is None and shrink is True:
            Toolbox.log(logger=logger, messages='Leaving Arakoon config cluster')
            arakoon_installer = ArakoonInstaller(cluster_name='config')
            arakoon_installer.load(ip=master_node_ips[0])
            arakoon_installer.shrink_cluster(removal_ip=cluster_ip, offline_nodes=offline_node_ips)
            arakoon_installer.restart_cluster_after_shrinking()
    except Exception as ex:
        Toolbox.log(logger=logger, messages=['\nFailed to leave configuration cluster', ex], loglevel='exception')

    Toolbox.log(logger=logger, messages='Update configurations')
    try:
        if unconfigure_memcached is True:
            endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
            endpoint = '{0}:{1}'.format(cluster_ip, 11211)
            if endpoint in endpoints:
                endpoints.remove(endpoint)
            Configuration.set('/ovs/framework/memcache|endpoints', endpoints)
        if unconfigure_rabbitmq is True:
            endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints')
            endpoint = '{0}:{1}'.format(cluster_ip, 5672)
            if endpoint in endpoints:
                endpoints.remove(endpoint)
            Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints)
    except Exception as ex:
        Toolbox.log(logger=logger, messages=['\nFailed to update configurations', ex], loglevel='exception')

    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=logger, messages='Restarting master node services')
        # Reset the cached stores so the factories build a new store on next use
        PersistentFactory.store = None
        VolatileFactory.store = None

        for service in storagerouter.services:
            if service.name == 'arakoon-ovsdb':
                service.delete()

    target_client = None  # Stays None when the demoted node is offline
    if storagerouter in offline_nodes:
        if unconfigure_rabbitmq is True:
            Toolbox.log(logger=logger, messages='Removing/unconfiguring offline RabbitMQ node')
            # The node itself cannot be reached, so ask a master node to drop it from the RabbitMQ cluster
            client = ip_client_map[master_ip]
            try:
                client.run(['rabbitmqctl', 'forget_cluster_node', 'rabbit@{0}'.format(storagerouter.name)])
            except Exception as ex:
                Toolbox.log(logger=logger,
                            messages=['\nFailed to forget RabbitMQ cluster node', ex],
                            loglevel='exception')
    else:
        target_client = ip_client_map[cluster_ip]
        if unconfigure_rabbitmq is True:
            Toolbox.log(logger=logger, messages='Removing/unconfiguring RabbitMQ')
            try:
                if service_manager.has_service('rabbitmq-server', client=target_client):
                    ServiceFactory.change_service_state(target_client, 'rabbitmq-server', 'stop', logger)
                    target_client.run(['rabbitmq-server', '-detached'])
                    time.sleep(5)
                    target_client.run(['rabbitmqctl', 'stop_app'])
                    time.sleep(5)
                    target_client.run(['rabbitmqctl', 'reset'])
                    time.sleep(5)
                    target_client.run(['rabbitmqctl', 'stop'])
                    time.sleep(5)
                    target_client.file_unlink("/var/lib/rabbitmq/.erlang.cookie")
                    ServiceFactory.change_service_state(target_client, 'rabbitmq-server', 'stop', logger)  # To be sure
            except Exception as ex:
                Toolbox.log(logger=logger,
                            messages=['\nFailed to remove/unconfigure RabbitMQ', ex],
                            loglevel='exception')

        Toolbox.log(logger=logger, messages='Stopping services')
        services = ['memcached', 'rabbitmq-server']
        if unconfigure_rabbitmq is False:
            services.remove('rabbitmq-server')
        if unconfigure_memcached is False:
            services.remove('memcached')
        for service in services:
            if service_manager.has_service(service, client=target_client):
                Toolbox.log(logger=logger, messages='Stopping service {0}'.format(service))
                try:
                    ServiceFactory.change_service_state(target_client, service, 'stop', logger)
                except Exception as ex:
                    # The placeholder was missing before, which dropped the service name from the message
                    Toolbox.log(logger=logger,
                                messages=['\nFailed to stop service {0}'.format(service), ex],
                                loglevel='exception')

        Toolbox.log(logger=logger, messages='Removing services')
        services = ['scheduled-tasks', 'webapp-api', 'volumerouter-consumer']
        for service in services:
            if service_manager.has_service(service, client=target_client):
                Toolbox.log(logger=logger, messages='Removing service {0}'.format(service))
                try:
                    ServiceFactory.change_service_state(target_client, service, 'stop', logger)
                    service_manager.remove_service(service, client=target_client)
                except Exception as ex:
                    # The placeholder was missing before, which dropped the service name from the message
                    Toolbox.log(logger=logger,
                                messages=['\nFailed to remove service {0}'.format(service), ex],
                                loglevel='exception')

        if service_manager.has_service('workers', client=target_client):
            # Re-add the workers service with a queue that only holds the machine ID
            service_manager.add_service(name='workers',
                                        client=target_client,
                                        params={'WORKER_QUEUE': '{0}'.format(unique_id)})

    try:
        NodeTypeController._configure_amqp_to_volumedriver()
    except Exception as ex:
        Toolbox.log(logger=logger,
                    messages=['\nFailed to configure AMQP to Storage Driver', ex],
                    loglevel='exception')

    Toolbox.log(logger=logger, messages='Restarting services')
    NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map,
                                                               logger=logger,
                                                               offline_node_ips=offline_node_ips)

    # Restart once more when running the 'demote' hooks returns a truthy value
    if Toolbox.run_hooks(component='nodetype',
                         sub_component='demote',
                         logger=logger,
                         cluster_ip=cluster_ip,
                         master_ip=master_ip,
                         offline_node_ips=offline_node_ips):
        Toolbox.log(logger=logger, messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map,
                                                                   logger=logger,
                                                                   offline_node_ips=offline_node_ips)

    if storagerouter not in offline_nodes:
        target_client = ip_client_map[cluster_ip]
        node_name, _ = target_client.get_hostname()
        if NodeTypeController.avahi_installed(client=target_client, logger=logger) is True:
            NodeTypeController.configure_avahi(client=target_client,
                                               node_name=node_name,
                                               node_type='extra',
                                               logger=logger)
    Configuration.set('/ovs/framework/hosts/{0}/type'.format(storagerouter.machine_id), 'EXTRA')

    if target_client is not None and target_client.file_exists('/tmp/ovs_rollback'):
        target_client.file_write('/tmp/ovs_rollback', 'rollback')

    Toolbox.log(logger=logger, messages='Demote complete', title=True)
def _can_remove(self):
    """
    Indicates whether this component can be removed

    :return: True when Toolbox.fetch_hooks returns exactly one 'license' hook for '<component>.remove'
    :rtype: bool
    """
    hook_name = '{0}.remove'.format(self.component)
    removal_hooks = Toolbox.fetch_hooks('license', hook_name)
    return len(removal_hooks) == 1
def update_framework():
    """
    Update the framework on every storage router

    Steps, in order: acquire the 'system_update' mutex, create the lock files on each node, collect the
    packages and services listed under the 'framework' key by the 'update'/'metadata' hooks, stop those
    services, install the packages, migrate the code, start the model services, migrate the model, run the
    'update'/'postupgrade' hooks and finally start the remaining services. Every failure is logged and ends
    the update; nothing is raised to the caller.
    :return: None
    """
    update_mutex = file_mutex('system_update', wait=2)
    upgrade_file = '/etc/ready_for_upgrade'
    upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
    ssh_clients = []
    try:
        update_mutex.acquire()
        UpdateController._log_message('+++ Starting framework update +++')

        from ovs.dal.lists.storagerouterlist import StorageRouterList

        UpdateController._log_message('Generating SSH client connections for each storage router')
        master_ips = []
        extra_ips = []
        for storage_router in StorageRouterList.get_storagerouters():
            ssh_clients.append(SSHClient(storage_router.ip, username='******'))
            if storage_router.node_type == 'MASTER':
                master_ips.append(storage_router.ip)
            elif storage_router.node_type == 'EXTRA':
                extra_ips.append(storage_router.ip)
        local_clients = [candidate for candidate in ssh_clients if candidate.is_local is True]
        this_client = local_clients[0]

        # Create locks
        UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
        for client in ssh_clients:
            # upgrade_file: prevents manual install or upgrade individual packages
            # upgrade_ongoing_check_file: prevents clicking x times on 'Update' btn
            for lock_file in (upgrade_file, upgrade_ongoing_check_file):
                client.run('touch {0}'.format(lock_file))

        # Check requirements
        packages_to_update = set()
        all_services_to_restart = []
        for client in ssh_clients:
            for metadata_hook in Toolbox.fetch_hooks('update', 'metadata'):
                UpdateController._log_message('Executing function {0}'.format(metadata_hook.__name__),
                                              client_ip=client.ip)
                hook_output = metadata_hook(client)
                for component, package_infos in hook_output.iteritems():
                    if component == 'framework':
                        for package_info in package_infos:
                            packages_to_update.update(package_info['packages'])
                            all_services_to_restart += package_info['services']

        # Filter out duplicates maintaining the order of services (eg: watcher-framework before memcached)
        services_to_restart = []
        for service_name in all_services_to_restart:
            if service_name in services_to_restart:
                continue
            services_to_restart.append(service_name)

        UpdateController._log_message(
            'Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
        UpdateController._log_message(
            'Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

        # Stop services
        all_stopped = UpdateController._change_services_state(services=services_to_restart,
                                                              ssh_clients=ssh_clients,
                                                              action='stop')
        if all_stopped is False:
            UpdateController._log_message('Stopping all services on every node failed, cannot continue',
                                          client_ip=this_client.ip,
                                          severity='warning')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)

            # Start services again if a service could not be stopped
            UpdateController._log_message('Attempting to start the services again', client_ip=this_client.ip)
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')
            UpdateController._log_message('Failed to stop all required services, aborting update',
                                          client_ip=this_client.ip,
                                          severity='error')
            return

        # Update packages
        failed_clients = []
        for client in ssh_clients:
            PackageManager.update(client=client)
            try:
                UpdateController._log_message('Installing latest packages', client.ip)
                for package in packages_to_update:
                    UpdateController._log_message('Installing {0}'.format(package), client.ip)
                    PackageManager.install(package_name=package, client=client, force=True)
                    UpdateController._log_message('Installed {0}'.format(package), client.ip)
                client.file_delete(upgrade_file)
            except subprocess.CalledProcessError as cpe:
                UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output),
                                              client.ip,
                                              'error')
                failed_clients.append(client)
                break  # Do not touch the remaining nodes once one of them failed

        if failed_clients:
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('Error occurred. Attempting to start all services again',
                                          client_ip=this_client.ip,
                                          severity='error')
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')
            failed_ips = '\n - '.join([failed_client.ip for failed_client in failed_clients])
            UpdateController._log_message(
                'Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'
                .format(failed_ips, this_client.ip),
                this_client.ip,
                'error')
            return

        # Migrate code
        def run_code_migration(node_ip):
            with remote(node_ip, [Migrator]) as rem:
                rem.Migrator.migrate(master_ips, extra_ips)

        for client in ssh_clients:
            try:
                UpdateController._log_message('Started code migration', client.ip)
                try:
                    run_code_migration(client.ip)
                except EOFError as eof:
                    # A single retry is attempted when the remote connection drops
                    UpdateController._log_message('EOFError during code migration, retrying {0}'.format(eof),
                                                  client.ip,
                                                  'warning')
                    run_code_migration(client.ip)
                UpdateController._log_message('Finished code migration', client.ip)
            except Exception as ex:
                UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('Code migration failed with error: {0}'.format(ex),
                                              client.ip,
                                              'error')
                return

        # Start services
        UpdateController._log_message('Starting services', client_ip=this_client.ip)
        model_services = []
        for model_service in ('arakoon-ovsdb', 'memcached'):
            if model_service in services_to_restart:
                model_services.append(model_service)
                services_to_restart.remove(model_service)
        UpdateController._change_services_state(services=model_services,
                                                ssh_clients=ssh_clients,
                                                action='start')

        # Migrate model
        UpdateController._log_message('Started model migration', client_ip=this_client.ip)
        try:
            from ovs.dal.helpers import Migration
            with remote(ssh_clients[0].ip, [Migration]) as rem:
                rem.Migration.migrate()
            UpdateController._log_message('Finished model migration', client_ip=this_client.ip)
        except Exception as ex:
            UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('An unexpected error occurred: {0}'.format(ex),
                                          client_ip=this_client.ip,
                                          severity='error')
            return

        # Post upgrade actions
        UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
        for client in ssh_clients:
            with remote(client.ip, [Toolbox, SSHClient]) as rem:
                for post_hook in rem.Toolbox.fetch_hooks('update', 'postupgrade'):
                    UpdateController._log_message('Executing action {0}'.format(post_hook.__name__),
                                                  client_ip=client.ip)
                    try:
                        post_hook(rem.SSHClient(client.ip, username='******'))
                        UpdateController._log_message(
                            'Executing action {0} completed'.format(post_hook.__name__),
                            client_ip=client.ip)
                    except Exception as ex:
                        UpdateController._log_message(
                            'Post upgrade action failed with error: {0}'.format(ex),
                            client.ip,
                            'error')

        # Start watcher and restart support-agent
        UpdateController._change_services_state(services=services_to_restart,
                                                ssh_clients=ssh_clients,
                                                action='start')
        UpdateController._change_services_state(services=['support-agent'],
                                                ssh_clients=ssh_clients,
                                                action='restart')

        UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
        UpdateController._log_message('+++ Finished updating +++')
    except RuntimeError as rte:
        UpdateController._log_message('Error during framework update: {0}'.format(rte), severity='error')
        UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    except NoLockAvailableException:
        UpdateController._log_message('Another framework update is currently in progress!')
    except Exception as ex:
        UpdateController._log_message('Error during framework update: {0}'.format(ex), severity='error')
        UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    finally:
        update_mutex.release()
def remove_node(node_ip, silent=None):
    """
    Remove the node with specified IP from the cluster

    The removal runs in three phases: validations, confirmations (skipped when silent == '--force-yes') and
    the actual removal. Each phase logs its own failure and terminates the process through sys.exit(1);
    a KeyboardInterrupt is handled the same way.

    :param node_ip: IP of the node to remove
    :type node_ip: str
    :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
    :type silent: str
    :return: None
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.lib.storagedriver import StorageDriverController
    from ovs.lib.vpool import VPoolController

    logger = NodeRemovalController._logger
    Toolbox.log(logger=logger, messages='Remove node', boxed=True)
    Toolbox.log(logger=logger,
                messages='WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n')
    service_manager = ServiceFactory.get_manager()

    ###############
    # VALIDATIONS #
    ###############
    try:
        # Check the type before calling a string method, otherwise a non-string fails with an
        # AttributeError instead of the intended message
        if not isinstance(node_ip, str):
            raise ValueError('Node IP must be a string')
        node_ip = node_ip.strip()
        if not re.match(SSHClient.IP_REGEX, node_ip):
            raise ValueError('Invalid IP {0} specified'.format(node_ip))

        storage_router_all = sorted(StorageRouterList.get_storagerouters(), key=lambda k: k.name)
        storage_router_masters = StorageRouterList.get_masters()
        storage_router_all_ips = set([storage_router.ip for storage_router in storage_router_all])
        storage_router_master_ips = set([storage_router.ip for storage_router in storage_router_masters])
        storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)
        offline_reasons = {}
        if node_ip not in storage_router_all_ips:
            raise ValueError('Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}'
                             .format('\n - '.join(storage_router_all_ips), node_ip))

        if len(storage_router_all_ips) == 1:
            raise RuntimeError("Removing the only node is not possible")

        if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
            raise RuntimeError("Removing the only master node is not possible")

        if System.get_my_storagerouter() == storage_router_to_remove:
            raise RuntimeError('The node to be removed cannot be identical to the node on which the removal is initiated')

        Toolbox.log(logger=logger, messages='Creating SSH connections to remaining master nodes')
        master_ip = None
        ip_client_map = {}
        storage_routers_offline = []
        storage_router_to_remove_online = True
        for storage_router in storage_router_all:
            try:
                client = SSHClient(storage_router, username='******', timeout=10)
            except (UnableToConnectException, NotAuthenticatedException, TimeOutException) as ex:
                if isinstance(ex, UnableToConnectException):
                    msg = 'Unable to connect'
                elif isinstance(ex, NotAuthenticatedException):
                    msg = 'Could not authenticate'
                elif isinstance(ex, TimeOutException):
                    msg = 'Connection timed out'
                Toolbox.log(logger=logger,
                            messages=' * Node with IP {0:<15}- {1}'.format(storage_router.ip, msg))
                offline_reasons[storage_router.ip] = msg
                storage_routers_offline.append(storage_router)
                if storage_router == storage_router_to_remove:
                    storage_router_to_remove_online = False
                continue

            Toolbox.log(logger=logger,
                        messages=' * Node with IP {0:<15}- Successfully connected'.format(storage_router.ip))
            ip_client_map[storage_router.ip] = client
            if storage_router != storage_router_to_remove and storage_router.node_type == 'MASTER':
                master_ip = storage_router.ip

        if len(ip_client_map) == 0 or master_ip is None:
            raise RuntimeError('Could not connect to any master node in the cluster')

        storage_router_to_remove.invalidate_dynamics('vdisks_guids')
        if len(storage_router_to_remove.vdisks_guids) > 0:
            # vDisks are supposed to be moved away manually before removing a node
            raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

        internal_memcached = Toolbox.is_service_internally_managed(service='memcached')
        internal_rabbit_mq = Toolbox.is_service_internally_managed(service='rabbitmq')
        memcached_endpoints = Configuration.get(key='/ovs/framework/memcache|endpoints')
        rabbit_mq_endpoints = Configuration.get(key='/ovs/framework/messagequeue|endpoints')
        # Compare the host part exactly: a prefix match would treat eg. 10.0.0.10:11211 as an endpoint
        # of node 10.0.0.1 and could wrongly report that no endpoint would remain
        ip_to_remove = storage_router_to_remove.ip
        copy_memcached_endpoints = [endpoint for endpoint in memcached_endpoints
                                    if endpoint.rsplit(':', 1)[0] != ip_to_remove]
        copy_rabbit_mq_endpoints = [endpoint for endpoint in rabbit_mq_endpoints
                                    if endpoint.rsplit(':', 1)[0] != ip_to_remove]
        if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
            raise RuntimeError('Removal of provided nodes will result in a complete removal of the memcached service')
        if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
            raise RuntimeError('Removal of provided nodes will result in a complete removal of the messagequeue service')

        Toolbox.run_hooks(component='noderemoval',
                          sub_component='validate_removal',
                          logger=logger,
                          cluster_ip=storage_router_to_remove.ip)
    except KeyboardInterrupt:
        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger,
                    messages='Removal has been aborted during the validation step. No changes have been applied.',
                    boxed=True,
                    loglevel='warning')
        sys.exit(1)
    except Exception as exception:
        Toolbox.log(logger=logger, messages=[str(exception)], boxed=True, loglevel='exception')
        sys.exit(1)

    #################
    # CONFIRMATIONS #
    #################
    try:
        interactive = silent != '--force-yes'
        remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
        if interactive is True:
            if len(storage_routers_offline) > 0:
                Toolbox.log(logger=logger,
                            messages='Certain nodes appear to be offline. These will not fully removed and will cause issues if they are not really offline.')
                Toolbox.log(logger=logger,
                            messages='Offline nodes: {0}'.format(''.join(
                                ('\n * {0:<15}- {1}.'.format(ip, message)
                                 for ip, message in offline_reasons.iteritems()))))
                valid_node_info = Interactive.ask_yesno(
                    message='Continue the removal with these being presumably offline?',
                    default_value=False)
                if valid_node_info is False:
                    Toolbox.log(logger=logger,
                                messages='Please validate the state of the nodes before removing.',
                                title=True)
                    sys.exit(1)
            proceed = Interactive.ask_yesno(
                message='Are you sure you want to remove node {0}?'.format(storage_router_to_remove.name),
                default_value=False)
            if proceed is False:
                Toolbox.log(logger=logger, messages='Abort removal', title=True)
                sys.exit(1)

            remove_asd_manager = True
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username='******')
                if service_manager.has_service(name='asd-manager', client=client):
                    remove_asd_manager = Interactive.ask_yesno(
                        message='Do you also want to remove the ASD manager and related ASDs?',
                        default_value=False)

            if remove_asd_manager is True or storage_router_to_remove_online is False:
                for fct in Toolbox.fetch_hooks('noderemoval', 'validate_asd_removal'):
                    validation_output = fct(storage_router_to_remove.ip)
                    if validation_output['confirm'] is True:
                        if Interactive.ask_yesno(message=validation_output['question'],
                                                 default_value=False) is False:
                            remove_asd_manager = False
                            break
    except KeyboardInterrupt:
        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger,
                    messages='Removal has been aborted during the confirmation step. No changes have been applied.',
                    boxed=True,
                    loglevel='warning')
        sys.exit(1)
    except Exception as exception:
        Toolbox.log(logger=logger, messages=[str(exception)], boxed=True, loglevel='exception')
        sys.exit(1)

    ###########
    # REMOVAL #
    ###########
    try:
        Toolbox.log(logger=logger,
                    messages='Starting removal of node {0} - {1}'.format(storage_router_to_remove.name,
                                                                         storage_router_to_remove.ip))
        if storage_router_to_remove_online is False:
            Toolbox.log(logger=logger,
                        messages=' Marking all Storage Drivers served by Storage Router {0} as offline'
                        .format(storage_router_to_remove.ip))
            StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

        # Remove vPools
        # The placeholder was missing before, which dropped the IP from the message
        Toolbox.log(logger=logger,
                    messages=' Removing vPools from node {0}'.format(storage_router_to_remove.ip))
        storage_routers_offline_guids = [sr.guid for sr in storage_routers_offline
                                         if sr.guid != storage_router_to_remove.guid]
        for storage_driver in storage_router_to_remove.storagedrivers:
            Toolbox.log(logger=logger,
                        messages=' Removing vPool {0} from node'.format(storage_driver.vpool.name))
            VPoolController.shrink_vpool(storagedriver_guid=storage_driver.guid,
                                         offline_storage_router_guids=storage_routers_offline_guids)

        # Demote if MASTER
        if storage_router_to_remove.node_type == 'MASTER':
            NodeTypeController.demote_node(cluster_ip=storage_router_to_remove.ip,
                                           master_ip=master_ip,
                                           ip_client_map=ip_client_map,
                                           unique_id=storage_router_to_remove.machine_id,
                                           unconfigure_memcached=internal_memcached,
                                           unconfigure_rabbitmq=internal_rabbit_mq,
                                           offline_nodes=storage_routers_offline)

        # Stop / remove services
        Toolbox.log(logger=logger, messages='Stopping and removing services')
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username='******')
            NodeRemovalController.remove_services(client=client,
                                                  node_type=storage_router_to_remove.node_type.lower(),
                                                  logger=logger)
            service = 'watcher-config'
            if service_manager.has_service(service, client=client):
                Toolbox.log(logger=logger, messages='Removing service {0}'.format(service))
                service_manager.stop_service(service, client=client)
                service_manager.remove_service(service, client=client)

        Toolbox.run_hooks(component='noderemoval',
                          sub_component='remove',
                          logger=logger,
                          cluster_ip=storage_router_to_remove.ip,
                          complete_removal=remove_asd_manager)

        # Clean up model
        Toolbox.log(logger=logger, messages='Removing node from model')
        for service in storage_router_to_remove.services:
            service.delete()
        for disk in storage_router_to_remove.disks:
            for partition in disk.partitions:
                partition.delete()
            disk.delete()
        for j_domain in storage_router_to_remove.domains:
            j_domain.delete()
        Configuration.delete('/ovs/framework/hosts/{0}'.format(storage_router_to_remove.machine_id))

        NodeTypeController.restart_framework_and_memcache_services(
            clients=ip_client_map,
            offline_node_ips=[node.ip for node in storage_routers_offline],
            logger=logger)

        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username='******')
            client.file_delete(filenames=[CACC_LOCATION])
            client.file_delete(filenames=[CONFIG_STORE_LOCATION])
        storage_router_to_remove.delete()
        Toolbox.log(logger=logger, messages='Successfully removed node\n')
    except Exception as exception:
        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger,
                    messages=['An unexpected error occurred:', str(exception)],
                    boxed=True,
                    loglevel='exception')
        sys.exit(1)
    except KeyboardInterrupt:
        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger,
                    messages='This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                    boxed=True,
                    loglevel='error')
        sys.exit(1)

    if remove_asd_manager is True and storage_router_to_remove_online is True:
        Toolbox.log(logger=logger, messages='\nRemoving ASD Manager')
        with remote(storage_router_to_remove.ip, [os]) as rem:
            rem.os.system('asd-manager remove --force-yes')
    Toolbox.log(logger=logger, messages='Remove nodes finished', title=True)
def update_framework():
    """
    Update the framework on every storage router

    Steps, in order: acquire the 'system_update' mutex, create the lock files on each node, collect the
    packages and services listed under the 'framework' key by the 'update'/'metadata' hooks, stop those
    services, install the packages, migrate the code, start the model services, migrate the model, run the
    'update'/'postupgrade' hooks and finally start the remaining services. Every failure is logged and ends
    the update; nothing is raised to the caller.
    :return: None
    """
    file_mutex = FileMutex('system_update', wait=2)
    upgrade_file = '/etc/ready_for_upgrade'
    upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
    ssh_clients = []
    try:
        file_mutex.acquire()
        UpdateController._log_message('+++ Starting framework update +++')

        from ovs.dal.lists.storagerouterlist import StorageRouterList

        UpdateController._log_message('Generating SSH client connections for each storage router')
        storage_routers = StorageRouterList.get_storagerouters()
        master_ips = []
        extra_ips = []
        for sr in storage_routers:
            ssh_clients.append(SSHClient(sr.ip, username='******'))
            if sr.node_type == 'MASTER':
                master_ips.append(sr.ip)
            elif sr.node_type == 'EXTRA':
                extra_ips.append(sr.ip)
        this_client = [client for client in ssh_clients if client.is_local is True][0]

        # Create locks
        UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
        for client in ssh_clients:
            client.run('touch {0}'.format(upgrade_file))  # Prevents manual install or upgrade individual packages
            client.run('touch {0}'.format(upgrade_ongoing_check_file))  # Prevents clicking x times on 'Update' btn

        # Check requirements
        packages_to_update = set()
        all_services_to_restart = []
        for client in ssh_clients:
            for function in Toolbox.fetch_hooks('update', 'metadata'):
                UpdateController._log_message('Executing function {0}'.format(function.__name__),
                                              client_ip=client.ip)
                output = function(client)
                for key, value in output.iteritems():
                    if key != 'framework':
                        continue
                    for package_info in value:
                        packages_to_update.update(package_info['packages'])
                        all_services_to_restart += package_info['services']

        # Filter out duplicates maintaining the order of services (eg: watcher-framework before memcached)
        services_to_restart = []
        for service in all_services_to_restart:
            if service not in services_to_restart:
                services_to_restart.append(service)

        UpdateController._log_message(
            'Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
        UpdateController._log_message(
            'Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

        # Stop services
        if UpdateController._change_services_state(services=services_to_restart,
                                                   ssh_clients=ssh_clients,
                                                   action='stop') is False:
            UpdateController._log_message('Stopping all services on every node failed, cannot continue',
                                          client_ip=this_client.ip,
                                          severity='warning')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)

            # Start services again if a service could not be stopped
            UpdateController._log_message('Attempting to start the services again', client_ip=this_client.ip)
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')
            UpdateController._log_message('Failed to stop all required services, aborting update',
                                          client_ip=this_client.ip,
                                          severity='error')
            return

        # Update packages
        failed_clients = []
        for client in ssh_clients:
            PackageManager.update(client=client)
            try:
                UpdateController._log_message('Installing latest packages', client.ip)
                for package in packages_to_update:
                    UpdateController._log_message('Installing {0}'.format(package), client.ip)
                    PackageManager.install(package_name=package, client=client, force=True)
                    UpdateController._log_message('Installed {0}'.format(package), client.ip)
                client.file_delete(upgrade_file)
            except subprocess.CalledProcessError as cpe:
                UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output),
                                              client.ip,
                                              'error')
                failed_clients.append(client)
                break  # Do not touch the remaining nodes once one of them failed

        if failed_clients:
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('Error occurred. Attempting to start all services again',
                                          client_ip=this_client.ip,
                                          severity='error')
            UpdateController._change_services_state(services=services_to_restart,
                                                    ssh_clients=ssh_clients,
                                                    action='start')
            # The template references both {0} and {1}; passing only the node list raised an IndexError
            # here, which hid the actual failure report behind a generic error
            failed_ips = '\n - '.join([failed_client.ip for failed_client in failed_clients])
            UpdateController._log_message(
                'Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'
                .format(failed_ips, this_client.ip),
                this_client.ip,
                'error')
            return

        # Migrate code
        for client in ssh_clients:
            try:
                UpdateController._log_message('Started code migration', client.ip)
                try:
                    with Remote(client.ip, [Migrator]) as remote:
                        remote.Migrator.migrate(master_ips, extra_ips)
                except EOFError as eof:
                    # A single retry is attempted when the remote connection drops
                    UpdateController._log_message('EOFError during code migration, retrying {0}'.format(eof),
                                                  client.ip,
                                                  'warning')
                    with Remote(client.ip, [Migrator]) as remote:
                        remote.Migrator.migrate(master_ips, extra_ips)
                UpdateController._log_message('Finished code migration', client.ip)
            except Exception as ex:
                UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('Code migration failed with error: {0}'.format(ex),
                                              client.ip,
                                              'error')
                return

        # Start services
        UpdateController._log_message('Starting services', client_ip=this_client.ip)
        model_services = []
        if 'arakoon-ovsdb' in services_to_restart:
            model_services.append('arakoon-ovsdb')
            services_to_restart.remove('arakoon-ovsdb')
        if 'memcached' in services_to_restart:
            model_services.append('memcached')
            services_to_restart.remove('memcached')
        UpdateController._change_services_state(services=model_services,
                                                ssh_clients=ssh_clients,
                                                action='start')

        # Migrate model
        UpdateController._log_message('Started model migration', client_ip=this_client.ip)
        try:
            from ovs.dal.helpers import Migration
            Migration.migrate()
            UpdateController._log_message('Finished model migration', client_ip=this_client.ip)
        except Exception as ex:
            UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('An unexpected error occurred: {0}'.format(ex),
                                          client_ip=this_client.ip,
                                          severity='error')
            return

        # Post upgrade actions
        UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
        for client in ssh_clients:
            with Remote(client.ip, [Toolbox, SSHClient]) as remote:
                for function in remote.Toolbox.fetch_hooks('update', 'postupgrade'):
                    UpdateController._log_message('Executing action {0}'.format(function.__name__),
                                                  client_ip=client.ip)
                    try:
                        function(remote.SSHClient(client.ip, username='******'))
                        UpdateController._log_message(
                            'Executing action {0} completed'.format(function.__name__),
                            client_ip=client.ip)
                    except Exception as ex:
                        UpdateController._log_message(
                            'Post upgrade action failed with error: {0}'.format(ex),
                            client.ip,
                            'error')

        # Start watcher and restart support-agent
        UpdateController._change_services_state(services=services_to_restart,
                                                ssh_clients=ssh_clients,
                                                action='start')
        UpdateController._change_services_state(services=['support-agent'],
                                                ssh_clients=ssh_clients,
                                                action='restart')

        UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
        UpdateController._log_message('+++ Finished updating +++')
    except RuntimeError as rte:
        if 'Could not acquire lock' in rte.message:
            # Another update holds the mutex: leave its lock files untouched
            UpdateController._log_message('Another framework update is currently in progress!')
        else:
            UpdateController._log_message('Error during framework update: {0}'.format(rte), severity='error')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    except Exception as ex:
        UpdateController._log_message('Error during framework update: {0}'.format(ex), severity='error')
        UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    finally:
        file_mutex.release()
def check_if_proxies_work(result_handler):
    """
    Checks if all Alba Proxies work on a local machine, it creates a namespace and tries to put an object
    For every local proxy and every preset which is in use:
        * A namespace is created through the proxy and awaited until all its OSDs are 'Active'
        * A random object is uploaded, downloaded again and both copies are compared by MD5 hash
        * Every healthcheck namespace with the same prefix (also leftovers of earlier runs) is deleted
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: An empty list when no local proxies were found, None otherwise
    :rtype: list or NoneType
    """
    namespace_params = {'bucket_count': (list, None),
                        'logical': (int, None),
                        'storage': (int, None),
                        'storage_per_osd': (list, None)}

    result_handler.info('Checking the ALBA proxies.', add_to_result=False)

    amount_of_presets_not_working = []
    # try put/get/verify on all available proxies on the local node
    local_proxies = ServiceHelper.get_local_proxy_services()
    if len(local_proxies) == 0:
        result_handler.info('Found no proxies.', add_to_result=False)
        return amount_of_presets_not_working
    for service in local_proxies:
        try:
            result_handler.info('Checking ALBA proxy {0}.'.format(service.name), add_to_result=False)
            ip = service.alba_proxy.storagedriver.storage_ip
            # Encapsulating try to determine test output
            try:
                # Determine to which backend the proxy is connected
                proxy_client_cfg = AlbaCLI.run(command='proxy-client-cfg',
                                               named_params={'host': ip, 'port': service.ports[0]})
            except AlbaException:
                result_handler.failure('Fetching proxy info has failed. Please verify if {0}:{1} is the correct address for proxy {2}.'
                                       .format(ip, service.ports[0], service.name))
                continue
            # Fetch arakoon information
            abm_name = proxy_client_cfg.get('cluster_id')
            # Check if proxy config is correctly setup
            if abm_name is None:
                raise ConfigNotMatchedException('Proxy config for proxy {0} does not have the correct format on node {1} with port {2}.'
                                                .format(service.name, ip, service.ports[0]))
            abm_config = Configuration.get_configuration_path('/ovs/vpools/{0}/proxies/{1}/config/abm'
                                                              .format(service.alba_proxy.storagedriver.vpool.guid,
                                                                      service.alba_proxy.guid))

            # Determine presets / backend
            try:
                presets = AlbaCLI.run(command='list-presets', config=abm_config)
            except AlbaException:
                result_handler.failure('Listing the presets has failed. Please check the arakoon config path. We used {0}'
                                       .format(abm_config))
                continue

            for preset in presets:
                # If preset is not in use, test will fail so add a skip
                if preset['in_use'] is False:
                    result_handler.skip('Preset {0} is not in use and will not be checked'.format(preset['name']))
                    continue
                preset_name = preset['name']
                # Encapsulation try for cleanup
                try:
                    # Generate new namespace name using the preset
                    namespace_key_prefix = 'ovs-healthcheck-ns-{0}-{1}'.format(preset_name, AlbaHealthCheck.LOCAL_ID)
                    namespace_key = '{0}_{1}'.format(namespace_key_prefix, uuid.uuid4())
                    object_key = 'ovs-healthcheck-obj-{0}'.format(str(uuid.uuid4()))
                    # Create namespace
                    AlbaCLI.run(command='proxy-create-namespace',
                                named_params={'host': ip, 'port': service.ports[0]},
                                extra_params=[namespace_key, preset_name])
                    # Wait until fully created
                    namespace_start_time = time.time()
                    for _ in xrange(2):
                        # Running twice because the first one could give a false positive as the osds will alert the nsm
                        # and the nsm would respond with got messages but these were not the ones we are after
                        AlbaCLI.run(command='deliver-messages', config=abm_config)
                    while True:
                        if time.time() - namespace_start_time > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                            raise RuntimeError('Creation namespace has timed out after {0}s'
                                               .format(time.time() - namespace_start_time))
                        list_ns_osds_output = AlbaCLI.run(command='list-ns-osds',
                                                          config=abm_config,
                                                          extra_params=[namespace_key])
                        # Example output: [[0, [u'Active']], [3, [u'Active']]]
                        namespace_ready = True
                        for osd_info in list_ns_osds_output:
                            # If there are no osd_info records, uploading will fail so covered by HC
                            osd_state = osd_info[1][0]
                            if osd_state != 'Active':
                                namespace_ready = False
                        if namespace_ready is True:
                            break
                    result_handler.success('Namespace successfully created on proxy {0} with preset {1}!'
                                           .format(service.name, preset_name))
                    namespace_info = AlbaCLI.run(command='show-namespace',
                                                 config=abm_config,
                                                 extra_params=[namespace_key])
                    Toolbox.verify_required_params(required_params=namespace_params, actual_params=namespace_info)
                    result_handler.success('Namespace successfully fetched on proxy {0} with preset {1}!'
                                           .format(service.name, preset_name))

                    # Put test object to given dir
                    with open(AlbaHealthCheck.TEMP_FILE_LOC, 'wb') as output_file:
                        output_file.write(os.urandom(AlbaHealthCheck.TEMP_FILE_SIZE))
                    AlbaCLI.run(command='proxy-upload-object',
                                named_params={'host': ip, 'port': service.ports[0]},
                                extra_params=[namespace_key, AlbaHealthCheck.TEMP_FILE_LOC, object_key])
                    result_handler.success('Successfully uploaded the object to namespace {0}'.format(namespace_key))
                    # download object
                    AlbaCLI.run(command='proxy-download-object',
                                named_params={'host': ip, 'port': service.ports[0]},
                                extra_params=[namespace_key, object_key, AlbaHealthCheck.TEMP_FILE_FETCHED_LOC])
                    result_handler.success('Successfully downloaded the object to namespace {0}'.format(namespace_key))
                    # check if files exists - issue #57
                    if not (os.path.isfile(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC) and
                            os.path.isfile(AlbaHealthCheck.TEMP_FILE_LOC)):
                        # creation of object failed
                        raise ObjectNotFoundException(ValueError('Creation of object has failed'))
                    # Context managers make sure both file handles are closed again
                    with open(AlbaHealthCheck.TEMP_FILE_LOC, 'rb') as original_file:
                        hash_original = hashlib.md5(original_file.read()).hexdigest()
                    with open(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC, 'rb') as fetched_file:
                        hash_fetched = hashlib.md5(fetched_file.read()).hexdigest()

                    if hash_original == hash_fetched:
                        result_handler.success('Fetched object {0} from namespace {1} on proxy {2} with preset {3} matches the created object!'
                                               .format(object_key, namespace_key, service.name, preset_name))
                    else:
                        result_handler.failure('Fetched object {0} from namespace {1} on proxy {2} with preset {3} does not match the created object!'
                                               .format(object_key, namespace_key, service.name, preset_name))

                except ObjectNotFoundException as ex:
                    amount_of_presets_not_working.append(preset_name)
                    result_handler.failure('Failed to put object on namespace {0} failed on proxy {1}with preset {2} With error {3}'
                                           .format(namespace_key, service.name, preset_name, ex))
                except AlbaException as ex:
                    if ex.alba_command == 'proxy-create-namespace':
                        result_handler.failure('Create namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                               .format(str(ex), namespace_key, service.name, preset_name))
                    elif ex.alba_command == 'show-namespace':
                        result_handler.failure('Show namespace has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                               .format(str(ex), namespace_key, service.name, preset_name))
                    elif ex.alba_command == 'proxy-upload-object':
                        result_handler.failure('Uploading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                               .format(str(ex), namespace_key, service.name, preset_name))
                    elif ex.alba_command == 'proxy-download-object':
                        result_handler.failure('Downloading the object has failed with {0} on namespace {1} with proxy {2} with preset {3}'
                                               .format(str(ex), namespace_key, service.name, preset_name))
                    else:
                        # Any other failing ALBA command (eg: deliver-messages, list-ns-osds) must not go unreported
                        result_handler.failure('ALBA command {0} has failed with {1} on namespace {2} with proxy {3} with preset {4}'
                                               .format(ex.alba_command, str(ex), namespace_key, service.name, preset_name))
                finally:
                    # Delete the temporary files, output of 'rm' is ignored (the files might not exist)
                    with open(os.devnull, 'w') as fnull:
                        subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_LOC)],
                                        stdout=fnull, stderr=subprocess.STDOUT)
                        subprocess.call(['rm', str(AlbaHealthCheck.TEMP_FILE_FETCHED_LOC)],
                                        stdout=fnull, stderr=subprocess.STDOUT)
                    # Delete the created namespace and the leftovers of previous runs
                    namespaces = AlbaCLI.run(command='list-namespaces', config=abm_config)
                    namespaces_to_remove = []
                    proxy_named_params = {'host': ip, 'port': service.ports[0]}
                    for namespace in namespaces:
                        if namespace['name'].startswith(namespace_key_prefix):
                            namespaces_to_remove.append(namespace['name'])
                    for namespace_name in namespaces_to_remove:
                        if namespace_name == namespace_key:
                            result_handler.info('Deleting namespace {0}.'.format(namespace_name))
                        else:
                            result_handler.warning('Deleting namespace {0} which was leftover from a previous run.'
                                                   .format(namespace_name))

                        AlbaCLI.run(command='proxy-delete-namespace',
                                    named_params=proxy_named_params,
                                    extra_params=[namespace_name])

                        namespace_delete_start = time.time()
                        while True:
                            try:
                                # Will fail if the namespace does not exist
                                AlbaCLI.run(command='show-namespace',
                                            config=abm_config,
                                            extra_params=[namespace_name])
                            except AlbaException:
                                result_handler.success('Namespace {0} successfully removed.'.format(namespace_name))
                                break
                            if time.time() - namespace_delete_start > AlbaHealthCheck.NAMESPACE_TIMEOUT:
                                # Report the time spent deleting. The creation timer is unrelated here and
                                # is not even set when the namespace creation itself failed
                                raise RuntimeError('Delete namespace has timed out after {0}s'
                                                   .format(time.time() - namespace_delete_start))

                        # be tidy, and make the proxy forget the namespace
                        try:
                            AlbaCLI.run(command='proxy-statistics',
                                        named_params=proxy_named_params,
                                        extra_params=['--forget', namespace_name])
                        except Exception:
                            # Best effort only, but do not swallow KeyboardInterrupt / SystemExit
                            result_handler.warning('Failed to make proxy forget namespace {0}.'.format(namespace_name))
        except subprocess.CalledProcessError as ex:
            # this should stay for the deletion of the remaining files
            amount_of_presets_not_working.append(service.name)
            result_handler.failure('Proxy {0} has some problems. Got {1} as error'.format(service.name, ex))

        except ConfigNotMatchedException as ex:
            amount_of_presets_not_working.append(service.name)
            result_handler.failure('Proxy {0} has some problems. Got {1} as error'.format(service.name, ex))
def promote_node(cluster_ip, master_ip, ip_client_map, unique_id, configure_memcached, configure_rabbitmq):
    """
    Promotes a given node
    The node is marked as 'MASTER' in the model, joins the Arakoon cluster(s), gets the master services added
    and (optionally) memcached and RabbitMQ configured, after which the framework services are restarted
    :param cluster_ip: IP of the node being promoted (key in ip_client_map, IP added to the Arakoon clusters)
    :param master_ip: IP of an existing master node (key in ip_client_map, source of the RabbitMQ cookie)
    :param ip_client_map: Mapping of IP to client; presumably SSHClient instances - verify against caller
    :param unique_id: Machine ID used to look up the StorageRouter of the node being promoted
    :param configure_memcached: When True, memcached gets configured and its endpoint registered
    :param configure_rabbitmq: When True, RabbitMQ gets configured, joined to the cluster of the master and its endpoint registered
    :raises RuntimeError: When memcached must be configured but not all memcache nodes can be reached
                          or when no other node is part of the OVS DB Arakoon cluster
    :return: None
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.servicetypelist import ServiceTypeList
    from ovs.dal.lists.servicelist import ServiceList
    from ovs.dal.hybrids.service import Service

    Toolbox.log(logger=NodeTypeController._logger, messages='Promoting node', title=True)
    service_manager = ServiceFactory.get_manager()
    if configure_memcached is True:
        if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False:
            raise RuntimeError('Not all memcache nodes can be reached which is required for promoting a node.')

    target_client = ip_client_map[cluster_ip]
    machine_id = System.get_my_machine_id(target_client)
    node_name, _ = target_client.get_hostname()
    master_client = ip_client_map[master_ip]

    storagerouter = StorageRouterList.get_by_machine_id(unique_id)
    storagerouter.node_type = 'MASTER'
    storagerouter.save()

    # Only extend the 'config' Arakoon cluster when no external configuration is set
    external_config = Configuration.get('/ovs/framework/external_config')
    if external_config is None:
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon configuration cluster')
        arakoon_installer = ArakoonInstaller(cluster_name='config')
        arakoon_installer.load(ip=master_ip)
        arakoon_installer.extend_cluster(new_ip=cluster_ip,
                                         base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
        arakoon_installer.restart_cluster_after_extending(new_ip=cluster_ip)
        service_manager.register_service(node_name=machine_id,
                                         service_metadata=arakoon_installer.service_metadata[cluster_ip])

    # Find other (arakoon) master nodes
    arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
    arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name)
    config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
    master_node_ips = [node.ip for node in config.nodes]
    if cluster_ip in master_node_ips:
        master_node_ips.remove(cluster_ip)
    if not master_node_ips:
        raise RuntimeError('There should be at least one other master node')

    arakoon_ports = []
    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon OVS DB cluster')
        arakoon_installer = ArakoonInstaller(cluster_name=arakoon_cluster_name)
        arakoon_installer.load()
        arakoon_installer.extend_cluster(new_ip=cluster_ip,
                                         base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
        arakoon_installer.restart_cluster_after_extending(new_ip=cluster_ip)
        arakoon_ports = arakoon_installer.ports[cluster_ip]

    if configure_memcached is True:
        NodeTypeController.configure_memcached(client=target_client, logger=NodeTypeController._logger)
    NodeTypeController.add_services(client=target_client, node_type='master', logger=NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations')
    if configure_memcached is True:
        endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
        endpoint = '{0}:11211'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
            Configuration.set('/ovs/framework/memcache|endpoints', endpoints)
    if configure_rabbitmq is True:
        endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints')
        endpoint = '{0}:5672'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
            Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints)

    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services')
        # Reset the store attributes of both factories now that the cluster layout changed
        PersistentFactory.store = None
        VolatileFactory.store = None

        # Model the 'arakoon-ovsdb' service for this node unless it is already known
        if 'arakoon-ovsdb' not in [s.name for s in ServiceList.get_services()
                                   if s.is_internal is False or s.storagerouter.ip == cluster_ip]:
            service = Service()
            service.name = 'arakoon-ovsdb'
            service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service.ports = arakoon_ports
            service.storagerouter = storagerouter
            service.save()

    if configure_rabbitmq is True:
        NodeTypeController.configure_rabbitmq(client=target_client, logger=NodeTypeController._logger)
        # Copy rabbitmq cookie
        rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie'

        Toolbox.log(logger=NodeTypeController._logger, messages='Copying RabbitMQ cookie')
        contents = master_client.file_read(rabbitmq_cookie_file)
        master_hostname, _ = master_client.get_hostname()
        target_client.dir_create(os.path.dirname(rabbitmq_cookie_file))
        target_client.file_write(rabbitmq_cookie_file, contents)
        # '0o400' is the octal spelling accepted by both Python 2.6+ and Python 3 (same value as the legacy '0400')
        target_client.file_chmod(rabbitmq_cookie_file, mode=0o400)
        target_client.run(['rabbitmq-server', '-detached'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop_app'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'join_cluster', 'rabbit@{0}'.format(master_hostname)])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop'])
        time.sleep(5)

        # Enable HA for the rabbitMQ queues
        ServiceFactory.change_service_state(target_client, 'rabbitmq-server', 'start', NodeTypeController._logger)
        NodeTypeController.check_rabbitmq_and_enable_ha_mode(client=target_client, logger=NodeTypeController._logger)

    NodeTypeController._configure_amqp_to_volumedriver()

    Toolbox.log(logger=NodeTypeController._logger, messages='Starting services')
    services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server']
    if arakoon_metadata['internal'] is True:
        services.remove('arakoon-ovsdb')
    for service in services:
        if service_manager.has_service(service, client=target_client):
            ServiceFactory.change_service_state(target_client, service, 'start', NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
    NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map,
                                                               logger=NodeTypeController._logger)

    # Restart once more when the 'promote' hooks returned a truthy value
    if Toolbox.run_hooks(component='nodetype',
                         sub_component='promote',
                         logger=NodeTypeController._logger,
                         cluster_ip=cluster_ip,
                         master_ip=master_ip):
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map,
                                                                   logger=NodeTypeController._logger)

    if NodeTypeController.avahi_installed(client=target_client, logger=NodeTypeController._logger) is True:
        NodeTypeController.configure_avahi(client=target_client,
                                           node_name=node_name,
                                           node_type='master',
                                           logger=NodeTypeController._logger)
    Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id), 'MASTER')
    target_client.run(['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config'])
    Configuration.set('/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id), True)

    if target_client.file_exists('/tmp/ovs_rollback'):
        target_client.file_delete('/tmp/ovs_rollback')

    Toolbox.log(logger=NodeTypeController._logger, messages='Promote complete')
def update_volumedriver():
    """
    Update the volumedriver packages on all storage routers
    Lock files are created on every node, the services reported by the 'metadata' update hooks are stopped,
    the packages are installed, the 'postupgrade' hooks are executed and the services are started again
    :return: None
    """
    ready_file = '/etc/ready_for_upgrade'
    ongoing_file = '/etc/upgrade_ongoing'
    mutex = file_mutex('system_update', wait=2)
    clients = []
    try:
        mutex.acquire()
        UpdateController._log_message('+++ Starting volumedriver update +++')

        from ovs.dal.lists.storagerouterlist import StorageRouterList

        UpdateController._log_message('Generating SSH client connections for each storage router')
        clients = [SSHClient(router.ip, 'root') for router in StorageRouterList.get_storagerouters()]
        local_client = [node for node in clients if node.is_local is True][0]

        # Step 0: lock files on every node
        UpdateController._log_message('Creating lock files', client_ip=local_client.ip)
        for node in clients:
            # Prevents manual install or upgrade individual packages
            node.run('touch {0}'.format(ready_file))
            # Prevents clicking x times on 'Update' btn
            node.run('touch {0}'.format(ongoing_file))

        # Step 1: gather the packages and services through the 'metadata' hooks
        wanted_packages = set()
        restart_candidates = []
        for node in clients:
            for hook in Toolbox.fetch_hooks('update', 'metadata'):
                UpdateController._log_message('Executing function {0}'.format(hook.__name__),
                                              client_ip=node.ip)
                metadata = hook(node)
                for component, package_infos in metadata.iteritems():
                    if component == 'volumedriver':
                        for info in package_infos:
                            wanted_packages.update(info['packages'])
                            restart_candidates.extend(info['services'])

        # Filter out duplicates keeping the order of services (eg: watcher-framework before memcached)
        restart_services = []
        for name in restart_candidates:
            if name in restart_services:
                continue
            restart_services.append(name)

        UpdateController._log_message('Services which will be restarted --> {0}'.format(', '.join(restart_services)))
        UpdateController._log_message('Packages which will be installed --> {0}'.format(', '.join(wanted_packages)))

        # Step 2: stop the services, abort (and start them again) when that fails
        stopped = UpdateController._change_services_state(services=restart_services,
                                                          ssh_clients=clients,
                                                          action='stop')
        if stopped is False:
            UpdateController._log_message('Stopping all services on every node failed, cannot continue',
                                          client_ip=local_client.ip,
                                          severity='warning')
            UpdateController._remove_lock_files([ready_file, ongoing_file], clients)

            UpdateController._log_message('Attempting to start the services again', client_ip=local_client.ip)
            UpdateController._change_services_state(services=restart_services,
                                                    ssh_clients=clients,
                                                    action='start')
            UpdateController._log_message('Failed to stop all required services, update aborted',
                                          client_ip=local_client.ip,
                                          severity='error')
            return

        # Step 3: install the packages, stop at the first node on which this fails
        broken_nodes = []
        for node in clients:
            PackageManager.update(client=node)
            try:
                for package in wanted_packages:
                    UpdateController._log_message('Installing {0}'.format(package), node.ip)
                    PackageManager.install(package_name=package, client=node, force=True)
                    UpdateController._log_message('Installed {0}'.format(package), node.ip)
                node.file_delete(ready_file)
            except subprocess.CalledProcessError as cpe:
                UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output),
                                              node.ip,
                                              'error')
                broken_nodes.append(node)
                break

        if broken_nodes:
            UpdateController._remove_lock_files([ready_file, ongoing_file], clients)
            UpdateController._log_message('Error occurred. Attempting to start all services again',
                                          client_ip=local_client.ip,
                                          severity='error')
            UpdateController._change_services_state(services=restart_services,
                                                    ssh_clients=clients,
                                                    action='start')
            broken_ips = '\n - '.join([node.ip for node in broken_nodes])
            UpdateController._log_message('Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'.format(broken_ips, local_client.ip),
                                          local_client.ip,
                                          'error')
            return

        # Step 4: 'postupgrade' hooks, a failing hook is logged but does not abort the update
        UpdateController._log_message('Executing post upgrade actions', client_ip=local_client.ip)
        for node in clients:
            for hook in Toolbox.fetch_hooks('update', 'postupgrade'):
                UpdateController._log_message('Executing action: {0}'.format(hook.__name__), client_ip=node.ip)
                try:
                    hook(node)
                except Exception as ex:
                    UpdateController._log_message('Post upgrade action failed with error: {0}'.format(ex),
                                                  node.ip,
                                                  'error')

        # Step 5: start the services again and remove the remaining lock file
        UpdateController._log_message('Starting services', client_ip=local_client.ip)
        UpdateController._change_services_state(services=restart_services,
                                                ssh_clients=clients,
                                                action='start')

        UpdateController._remove_lock_files([ongoing_file], clients)
        UpdateController._log_message('+++ Finished updating +++')
    except RuntimeError as rte:
        # The order of the handlers is relevant: RuntimeError is matched before NoLockAvailableException
        UpdateController._log_message('Error during volumedriver update: {0}'.format(rte), severity='error')
        UpdateController._remove_lock_files([ready_file, ongoing_file], clients)
    except NoLockAvailableException:
        UpdateController._log_message('Another volumedriver update is currently in progress!')
    except Exception as ex:
        UpdateController._log_message('Error during volumedriver update: {0}'.format(ex), severity='error')
        UpdateController._remove_lock_files([ready_file, ongoing_file], clients)
    finally:
        mutex.release()
def configure_rabbitmq(client, logger):
    """
    Configure RabbitMQ
    Writes /etc/rabbitmq/rabbitmq.config and makes sure the configured message queue user exists with full permissions
    :param client: Client on which to configure RabbitMQ
    :type client: ovs_extensions.generic.sshclient.SSHClient
    :param logger: Logger object used for logging
    :type logger: ovs.extensions.generic.logger.Logger
    :return: None
    """
    def _parse_users(list_users_output):
        """
        Extract the user names out of the output of 'rabbitmqctl list_users'
        Example output of 'list_users' command
            Listing users ...
            guest   [administrator]
            ovs     []
            ... done.
        Only the lines containing a tab and a '[...]' tag list describe a user
        """
        return [line.split('\t')[0] for line in list_users_output.splitlines()
                if '\t' in line and '[' in line and ']' in line]

    Toolbox.log(logger=logger, messages='Setting up RabbitMQ')
    service_manager = ServiceFactory.get_manager()
    # The port is taken from the first configured endpoint ('<ip>:<port>')
    rabbitmq_port = Configuration.get('/ovs/framework/messagequeue|endpoints')[0].split(':')[1]
    rabbitmq_login = Configuration.get('/ovs/framework/messagequeue|user')
    rabbitmq_password = Configuration.get('/ovs/framework/messagequeue|password')
    client.file_write('/etc/rabbitmq/rabbitmq.config',
                      """[ {{rabbit, [{{tcp_listeners, [{0}]}}, {{default_user, <<"{1}">>}}, {{default_pass, <<"{2}">>}}, {{cluster_partition_handling, autoheal}}, {{log_levels, [{{connection, warning}}]}}, {{vm_memory_high_watermark, 0.2}}]}} ].""".format(rabbitmq_port, rabbitmq_login, rabbitmq_password))

    rabbitmq_running, _ = service_manager.is_rabbitmq_running(client=client)
    if rabbitmq_running is True:
        # Look for the configured login instead of a hard-coded 'ovs', it is the user which gets created below
        if rabbitmq_login in _parse_users(client.run(['rabbitmqctl', 'list_users'])):
            Toolbox.log(logger=logger, messages='Already configured RabbitMQ')
            return
        ServiceFactory.change_service_state(client, 'rabbitmq-server', 'stop', logger)

    client.run(['rabbitmq-server', '-detached'])
    time.sleep(5)

    # Sometimes/At random the rabbitmq server takes longer than 5 seconds to start,
    # and the next command fails so the best solution is to retry several times
    # Also retry the add_user/set_permissions, and validate the result
    retry = 0
    while retry < 10:
        users = _parse_users(Toolbox.retry_client_run(client=client,
                                                      command=['rabbitmqctl', 'list_users'],
                                                      logger=logger))
        logger.debug('Rabbitmq users {0}'.format(users))
        if rabbitmq_login in users:
            logger.debug('User {0} configured in rabbitmq'.format(rabbitmq_login))
            break

        logger.debug(Toolbox.retry_client_run(client=client,
                                              command=['rabbitmqctl', 'add_user', rabbitmq_login, rabbitmq_password],
                                              logger=logger))
        logger.debug(Toolbox.retry_client_run(client=client,
                                              command=['rabbitmqctl', 'set_permissions', rabbitmq_login, '.*', '.*', '.*'],
                                              logger=logger))
        retry += 1
        time.sleep(1)
    client.run(['rabbitmqctl', 'stop'])
    time.sleep(5)