def is_service_internally_managed(service):
    """
    Validate whether the service is internally or externally managed
    :param service: Service to verify
    :type service: str
    :return: True if internally managed, False otherwise
    :rtype: bool
    """
    if service not in ['memcached', 'rabbitmq']:
        raise ValueError('Can only check memcached or rabbitmq')

    config_key = '/ovs/framework/{0}'.format({'memcached': 'memcache',
                                              'rabbitmq': 'messagequeue'}[service])
    if not Configuration.exists(key=config_key):
        # No configuration present at all: default to internally managed
        return True

    metadata_key = '{0}|metadata'.format(config_key)
    if not Configuration.exists(key=metadata_key):
        raise ValueError('Not all required keys ({0}) for {1} are present in the configuration management'.format(config_key, service))
    metadata = Configuration.get(metadata_key)
    if 'internal' not in metadata:
        raise ValueError('Internal flag not present in metadata for {0}.\nPlease provide a key: {1} and value "metadata": {{"internal": True/False}}'.format(service, config_key))

    internal = metadata['internal']
    if internal is False:
        # Externally managed clusters must advertise their endpoints
        endpoints_key = '{0}|endpoints'.format(config_key)
        if not Configuration.exists(key=endpoints_key):
            raise ValueError('Externally managed {0} cluster must have "endpoints" information\nPlease provide a key: {1} and value "endpoints": [<ip:port>]'.format(service, config_key))
        endpoints = Configuration.get(key=endpoints_key)
        if not isinstance(endpoints, list) or len(endpoints) == 0:
            raise ValueError('The endpoints for {0} cannot be empty and must be a list'.format(service))
    return internal
def migrate(previous_version):
    """
    Migrates from any version to any version, running all migrations required
    If previous_version is for example 0 and this script is at verison 3 it will execute two steps:
        - 1 > 2
        - 2 > 3
    @param previous_version: The previous version from which to start the migration.
    """
    working_version = previous_version

    if working_version < 1:
        # Version 1 introduced:
        # - Flexible SSD layout
        from ovs.extensions.generic.configuration import Configuration
        if Configuration.exists('ovs.arakoon'):
            Configuration.delete('ovs.arakoon', remove_root=True)
        Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        working_version = 1

    if working_version < 2:
        # Version 2 introduced:
        # - Registration
        import time
        from ovs.extensions.generic.configuration import Configuration
        if not Configuration.exists('ovs.core.registered'):
            Configuration.set('ovs.core.registered', False)
            Configuration.set('ovs.core.install_time', time.time())
        working_version = 2

    return working_version
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from any version to any version, running all migrations required
    If previous_version is for example 0 and this script is at verison 3 it will execute two steps:
        - 1 > 2
        - 2 > 3
    :param previous_version: The previous version from which to start the migration.
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    """
    working_version = previous_version

    # Version 1 introduced:
    # - Flexible SSD layout
    if working_version < 1:
        from ovs.extensions.generic.configuration import Configuration
        if Configuration.exists('ovs.arakoon'):
            Configuration.delete('ovs.arakoon', remove_root=True)
        Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        working_version = 1

    # Version 2 introduced:
    # - Registration
    if working_version < 2:
        import time
        from ovs.extensions.generic.configuration import Configuration
        if not Configuration.exists('ovs.core.registered'):
            Configuration.set('ovs.core.registered', False)
            Configuration.set('ovs.core.install_time', time.time())
        working_version = 2

    # Version 3 introduced:
    # - New arakoon clients
    if working_version < 3:
        # reload() forces a fresh copy of the ArakoonInstaller module even when an
        # older revision was already imported earlier in this process
        from ovs.extensions.db.arakoon import ArakoonInstaller
        reload(ArakoonInstaller)
        from ovs.extensions.db.arakoon import ArakoonInstaller
        from ovs.extensions.generic.sshclient import SSHClient
        from ovs.extensions.generic.configuration import Configuration
        if master_ips is not None:
            for ip in master_ips:
                client = SSHClient(ip)
                if client.dir_exists(ArakoonInstaller.ArakoonInstaller.ARAKOON_CONFIG_DIR):
                    for cluster_name in client.dir_list(ArakoonInstaller.ArakoonInstaller.ARAKOON_CONFIG_DIR):
                        try:
                            ArakoonInstaller.ArakoonInstaller.deploy_cluster(cluster_name, ip)
                        except:
                            # Best-effort redeploy: a failure for one cluster must not
                            # abort the migration of the others
                            pass
        if Configuration.exists('ovs.core.storage.persistent'):
            Configuration.set('ovs.core.storage.persistent', 'pyrakoon')
        working_version = 3

    return working_version
def migrate(master_ips=None, extra_ips=None):
    """
    Executes all migrations. It keeps track of an internal "migration version" which is
    always increasing by one
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    """
    machine_id = System.get_my_machine_id()
    key = '/ovs/framework/hosts/{0}/versions'.format(machine_id)
    # Previously reached version per migrator identifier; empty dict on the first run
    data = Configuration.get(key) if Configuration.exists(key) else {}

    # Discover migrators: every class defined directly in a module of the 'migration'
    # sub-package which inherits from object is treated as a migrator and must expose
    # an 'identifier' attribute and a 'migrate' method
    migrators = []
    path = '/'.join([os.path.dirname(__file__), 'migration'])
    for filename in os.listdir(path):
        if os.path.isfile('/'.join([path, filename])) and filename.endswith('.py'):
            name = filename.replace('.py', '')
            module = imp.load_source(name, '/'.join([path, filename]))
            for member in inspect.getmembers(module):
                if inspect.isclass(member[1]) and member[1].__module__ == name and 'object' in [base.__name__ for base in member[1].__bases__]:
                    migrators.append((member[1].identifier, member[1].migrate))

    end_version = 0
    for identifier, method in migrators:
        base_version = data[identifier] if identifier in data else 0
        version = method(base_version, master_ips, extra_ips)
        if version > end_version:
            end_version = version
        # NOTE(review): every migrator is recorded with the overall maximum version seen
        # so far instead of its own returned version — confirm this is intentional
        data[identifier] = end_version
    Configuration.set(key, data)
def __init__(self, vpool_guid, storagedriver_id):
    """
    Initializes the class
    """
    # Route the storagerouterclient library logging through the OVS logger setup
    log_level = LOG_LEVEL_MAPPING[OVSLogger('extensions').getEffectiveLevel()]
    # noinspection PyCallByClass,PyTypeChecker
    storagerouterclient.Logger.setupLogging(OVSLogger.load_path('storagerouterclient'), log_level)
    # noinspection PyArgumentList
    storagerouterclient.Logger.enableLogging()

    self._key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool_guid, storagedriver_id)
    self._logger = OVSLogger('extensions')
    self._dirty_entries = []
    self.remote_path = Configuration.get_configuration_path(self._key).strip('/')

    # Load configuration
    if not Configuration.exists(self._key):
        self.configuration = {}
        self.config_missing = True
        self._logger.debug('Could not find config {0}, a new one will be created'.format(self._key))
    else:
        self.configuration = Configuration.get(self._key)
        self.config_missing = False
def config_files_check_test():
    """
    Verify some configuration files
    """
    issues_found = ''

    # Configuration-management keys that must be present
    for key_to_check in {"/ovs/framework/memcache", "/ovs/arakoon/ovsdb/config"}:
        if not Configuration.exists(key_to_check, raw=True):
            issues_found += "Couldn't find {0}\n".format(key_to_check)

    # Files that must exist on the grid node
    config_files = {"rabbitmq.config": "/etc/rabbitmq/rabbitmq.config"}
    grid_ip = General.get_config().get('main', 'grid_ip')
    ssh_pass = General.get_config().get('mgmtcenter', 'password')
    client = SSHClient(grid_ip, username='******', password=ssh_pass)
    for config_file_to_check, file_path in config_files.iteritems():
        if not client.file_exists(file_path):
            issues_found += "Couldn't find {0}\n".format(config_file_to_check)

    assert issues_found == '',\
        "Found the following issues while checking for the config files:{0}\n".format(issues_found)
def delete_config(cluster_name):
    """
    Remove the configuration entry for arakoon cluster_name
    :param cluster_name: Name of the arakoon cluster
    :return: None
    """
    config_key = GeneralArakoon.CONFIG_KEY.format(cluster_name)
    if not Configuration.exists(config_key, raw=True):
        return
    # Delete the parent directory of the key so the whole cluster entry disappears
    Configuration.delete(os.path.dirname(config_key))
def is_service_internally_managed(service):
    """
    Validate whether the service is internally or externally managed
    :param service: Service to verify
    :type service: str
    :return: True if internally managed, False otherwise
    :rtype: bool
    """
    if service not in ['memcached', 'rabbitmq']:
        raise ValueError('Can only check memcached or rabbitmq')

    service_name_map = {'memcached': 'memcache',
                        'rabbitmq': 'messagequeue'}[service]
    config_key = '/ovs/framework/{0}'.format(service_name_map)
    if not Configuration.exists(key=config_key):
        # Absent configuration defaults to internally managed
        return True

    if not Configuration.exists(key='{0}|metadata'.format(config_key)):
        raise ValueError('Not all required keys ({0}) for {1} are present in the configuration management'.format(config_key, service))
    metadata = Configuration.get('{0}|metadata'.format(config_key))
    if 'internal' not in metadata:
        raise ValueError('Internal flag not present in metadata for {0}.\nPlease provide a key: {1} and value "metadata": {{"internal": True/False}}'.format(service, config_key))

    internal = metadata['internal']
    if internal is not False:
        return internal

    # Externally managed: the endpoints of the external cluster are mandatory
    if not Configuration.exists(key='{0}|endpoints'.format(config_key)):
        raise ValueError('Externally managed {0} cluster must have "endpoints" information\nPlease provide a key: {1} and value "endpoints": [<ip:port>]'.format(service, config_key))
    endpoints = Configuration.get(key='{0}|endpoints'.format(config_key))
    if not isinstance(endpoints, list) or len(endpoints) == 0:
        raise ValueError('The endpoints for {0} cannot be empty and must be a list'.format(service))
    return internal
def load(self):
    """
    Loads the configuration from a given file, optionally a remote one
    """
    self.configuration = {}
    if not Configuration.exists(self.key):
        self._logger.debug('Could not find config {0}, a new one will be created'.format(self.key))
    else:
        self.is_new = False
        self.configuration = json.loads(Configuration.get(self.key, raw=True))
    self.dirty_entries = []
def delete_config(self, ip=None):
    """
    Deletes a configuration file
    """
    if self.filesystem is not False:
        # Filesystem-backed config: remove the file on the (possibly remote) node
        self._load_client(ip).file_delete(self.config_path)
        return
    # Configuration-management-backed config
    if Configuration.exists(self.config_path, raw=True):
        Configuration.delete(self.config_path, raw=True)
def ipmi_check(cls, result_handler):
    """
    Verify IPMI connectivity and report the power status of every ALBA node.
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    """
    for albanode in AlbaNodeList.get_albanodes():
        node_id = albanode.node_id
        ipmi_config_loc = '/ovs/alba/asdnodes/{0}/config/ipmi'.format(node_id)
        if not Configuration.exists(ipmi_config_loc):
            # Nothing configured for this node: skip instead of failing
            result_handler.skip('No IPMI info found on AlbaNode with ID {0}'.format(node_id))
            continue
        ipmi_config = Configuration.get(ipmi_config_loc)
        ip = ipmi_config.get('ip')
        try:
            controller = IPMIController(ip=ip,
                                        username=ipmi_config.get('username'),
                                        password=ipmi_config.get('password'),
                                        client=SSHClient(System.get_my_storagerouter()))
        except:
            # Any failure constructing the controller means the stored settings are unusable
            result_handler.failure('IPMI settings are not valid for AlbaNode with ID {0}'.format(node_id))
            continue
        try:
            # status_node() returns a mapping per IP; look up the status of this node's IP
            status = controller.status_node().get(ip)
            if status == IPMIController.IPMI_POWER_ON:
                result_handler.success('IPMI AlbaNode with ID {0} status is POWER ON'.format(node_id))
            elif status == IPMIController.IPMI_POWER_OFF:
                result_handler.warning('IPMI AlbaNode with ID {0} status is POWER OFF'.format(node_id))
        except IPMITimeOutException as ex:
            result_handler.failure("IPMI AlbaNode with ID {0} timed out: '{1}'".format(node_id, ex))
        except IPMICallException as ex:
            result_handler.failure("IPMI AlbaNode with ID {0} call failed: '{1}'".format(node_id, ex))
        except Exception:
            msg = 'Could not retrieve info through IPMI for AlbaNode with ID {0}'.format(node_id)
            cls.logger.exception(msg)
            result_handler.exception(msg)
def load(self):
    """
    Loads the configuration from a given file, optionally a remote one
    """
    self.configuration = {}
    key_present = Configuration.exists(self.key)
    if key_present:
        self.is_new = False
        raw_contents = Configuration.get(self.key, raw=True)
        self.configuration = json.loads(raw_contents)
    else:
        self._logger.debug('Could not find config {0}, a new one will be created'.format(self.key))
    self.dirty_entries = []
def get_config(cluster_name):
    """
    Retrieve the configuration for given cluster
    :param cluster_name: Name of the cluster
    :return: RawConfigParser object
    """
    config_key = GeneralArakoon.CONFIG_KEY.format(cluster_name)
    if not Configuration.exists(config_key, raw=True):
        raise ValueError('Unknown arakoon cluster_name {0} provided'.format(cluster_name))

    # Parse the raw ini-style contents into a RawConfigParser
    parser = RawConfigParser()
    parser.readfp(StringIO(Configuration.get(config_key, raw=True)))
    return parser
def get_cluster_name(internal_name):
    """
    Retrieve the name of the cluster
    :param internal_name: Name as known by the framework
    :type internal_name: str
    :return: Name known by user
    :rtype: str
    """
    config_key = '/ovs/framework/arakoon_clusters'
    if Configuration.exists(config_key):
        mapping = Configuration.get(config_key)
        if internal_name in mapping:
            return mapping[internal_name]
    if internal_name in ['ovsdb', 'voldrv']:
        # Framework-internal clusters without a mapping have no user-facing name
        return None
    return internal_name
def get_path(binary_name):
    """
    Retrieve the absolute path for binary
    :param binary_name: Binary to get path for
    :return: Path
    """
    machine_id = System.get_my_machine_id()
    config_location = '/ovs/framework/hosts/{0}/paths|{1}'.format(machine_id, binary_name)
    if Configuration.exists(config_location):
        return Configuration.get(config_location)

    # Not cached yet: resolve through 'which' (single quotes escaped to avoid shell injection)
    # and store the result for subsequent lookups
    try:
        path = check_output("which '{0}'".format(binary_name.replace(r"'", r"'\''")), shell=True).strip()
    except CalledProcessError:
        return None
    Configuration.set(config_location, path)
    return path
def _update_manifest_cache_size(_proxy_config_key):
    """
    Ensure the manifest cache size in the given proxy configuration equals 500 MiB.
    Both the top-level 'manifest_cache_size' and the one nested inside each
    alba-backed cache section (block/fragment) are aligned.
    :param _proxy_config_key: Configuration key of the proxy configuration
    :return: True when the stored configuration was modified, False otherwise
    :rtype: bool
    """
    updated = False
    manifest_cache_size = 500 * 1024 * 1024
    if Configuration.exists(key=_proxy_config_key):
        _proxy_config = Configuration.get(key=_proxy_config_key)
        for cache_type in [StorageDriverConfiguration.CACHE_BLOCK, StorageDriverConfiguration.CACHE_FRAGMENT]:
            if cache_type in _proxy_config and _proxy_config[cache_type][0] == 'alba':
                # Use .get() so a config without an explicit size is updated
                # instead of raising a KeyError
                if _proxy_config[cache_type][1].get('manifest_cache_size') != manifest_cache_size:
                    updated = True
                    _proxy_config[cache_type][1]['manifest_cache_size'] = manifest_cache_size
        if _proxy_config.get('manifest_cache_size') != manifest_cache_size:
            updated = True
            _proxy_config['manifest_cache_size'] = manifest_cache_size
        if updated is True:
            Configuration.set(key=_proxy_config_key, value=_proxy_config)
    return updated
def verify_arakoon_structure(client, cluster_name, config_present, dir_present):
    """
    Verify the expected arakoon structure and configuration
    :param client: SSHClient object
    :param cluster_name: Name of the arakoon cluster
    :param config_present: configuration presence expectancy
    :param dir_present: Directory structure presence expectancy
    :return: True if correct
    """
    key_exists = Configuration.exists(GeneralArakoon.CONFIG_KEY.format(cluster_name), raw=True)
    assert key_exists is config_present,\
        "Arakoon configuration was {0} expected".format('' if config_present else 'not ')

    for directory in [GeneralArakoon.TLOG_DIR.format('/var/tmp', cluster_name),
                      GeneralArakoon.HOME_DIR.format('/var/tmp', cluster_name)]:
        assert client.dir_exists(directory) is dir_present,\
            "Arakoon directory {0} was {1} expected".format(directory, '' if dir_present else 'not ')
def teardown():
    """
    Teardown for Arakoon package, will be executed when all started tests in this package have ended
    Removal actions of possible things left over after the test-run
    :return: None
    """
    for storagerouter in GeneralStorageRouter.get_masters():
        root_client = SSHClient(storagerouter, username='******')
        # Re-enable the scheduled-tasks service if a test left it stopped
        if GeneralService.get_service_status(name='ovs-scheduled-tasks', client=root_client) is False:
            GeneralService.start_service(name='ovs-scheduled-tasks', client=root_client)
        for location in TEST_CLEANUP:
            root_client.run(['rm', '-rf', location])

    # Remove leftover configuration keys
    for key in KEY_CLEANUP:
        full_key = '{0}/{1}'.format(GeneralArakoon.CONFIG_ROOT, key)
        if Configuration.exists(full_key, raw=True):
            Configuration.delete(full_key)
def get_path(binary_name):
    """
    Retrieve the absolute path for binary
    :param binary_name: Binary to get path for
    :return: Path
    """
    config_location = '/ovs/framework/hosts/{0}/paths|{1}'.format(System.get_my_machine_id(), binary_name)
    if not Configuration.exists(config_location):
        # Escape embedded single quotes so 'which' cannot be shell-injected
        escaped_name = binary_name.replace(r"'", r"'\''")
        try:
            path = check_output("which '{0}'".format(escaped_name), shell=True).strip()
        except CalledProcessError:
            return None
        # Cache the resolved path for subsequent lookups
        Configuration.set(config_location, path)
        return path
    return Configuration.get(config_location)
def migrate():
    """
    Executes all migrations. It keeps track of an internal "migration version" which is
    always increasing by one
    """
    # Previously reached version per migrator identifier; empty dict on the first run
    data = Configuration.get('ovs.core.versions') if Configuration.exists('ovs.core.versions') else {}
    migrators = []
    path = os.path.join(os.path.dirname(__file__), 'migration')
    # A migrator is any class defined directly in a module of the 'migration'
    # sub-package which inherits from object; it must expose 'identifier' and 'migrate'
    for filename in os.listdir(path):
        if os.path.isfile(os.path.join(path, filename)) and filename.endswith('.py'):
            name = filename.replace('.py', '')
            module = imp.load_source(name, os.path.join(path, filename))
            for member in inspect.getmembers(module):
                if inspect.isclass(member[1]) and member[1].__module__ == name and 'object' in [base.__name__ for base in member[1].__bases__]:
                    migrators.append((member[1].identifier, member[1].migrate))
    end_version = 0
    for identifier, method in migrators:
        base_version = data[identifier] if identifier in data else 0
        version = method(base_version)
        if version > end_version:
            end_version = version
        # NOTE(review): every migrator is recorded with the overall maximum version
        # rather than its own returned version — confirm this is intentional
        data[identifier] = end_version
    Configuration.set('ovs.core.versions', data)
def ipmi_check(cls, result_handler):
    """
    Verify IPMI connectivity and report the power status of every ALBA node.
    :param result_handler: logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: None
    """
    for albanode in AlbaNodeList.get_albanodes():
        node_id = albanode.node_id
        ipmi_config_loc = '/ovs/alba/asdnodes/{0}/config/ipmi'.format(node_id)
        if not Configuration.exists(ipmi_config_loc):
            # Nothing configured for this node: skip instead of failing
            result_handler.skip('No IPMI info found on AlbaNode with ID {0}'.format(node_id))
            continue
        ipmi_config = Configuration.get(ipmi_config_loc)
        ip = ipmi_config.get('ip')
        try:
            controller = IPMIController(ip=ip,
                                        username=ipmi_config.get('username'),
                                        password=ipmi_config.get('password'),
                                        client=SSHClient(System.get_my_storagerouter()))
        except:
            # Any failure constructing the controller means the stored settings are unusable
            result_handler.failure('IPMI settings are not valid for AlbaNode with ID {0}'.format(node_id))
            continue
        try:
            # status_node() returns a mapping per IP; look up the status of this node's IP
            status = controller.status_node().get(ip)
            if status == IPMIController.IPMI_POWER_ON:
                result_handler.success('IPMI AlbaNode with ID {0} status is POWER ON'.format(node_id))
            elif status == IPMIController.IPMI_POWER_OFF:
                result_handler.warning('IPMI AlbaNode with ID {0} status is POWER OFF'.format(node_id))
        except IPMITimeOutException as ex:
            result_handler.failure("IPMI AlbaNode with ID {0} timed out: '{1}'".format(node_id, ex))
        except IPMICallException as ex:
            result_handler.failure("IPMI AlbaNode with ID {0} call failed: '{1}'".format(node_id, ex))
        except Exception:
            msg = 'Could not retrieve info through IPMI for AlbaNode with ID {0}'.format(node_id)
            cls.logger.exception(msg)
            result_handler.exception(msg)
def change_scheduled_task(task_name, state, disabled=False, cron=None):
    """
    Add, disable or remove a scheduled task entry in the framework's celery schedule
    and restart the scheduled-tasks service on all master nodes to apply the change.
    :param task_name: Name of the celery task to modify
    :param state: 'present' to add or disable the task; any other value removes the entry
    :param disabled: When True the task is stored as disabled (None value in the schedule)
    :param cron: Cron settings dict for the task
                 NOTE(review): expected to be a dict when state == 'present' and disabled is
                 False — a None value would raise here; confirm callers always pass it
    :return: Human-readable description of the applied change
    :rtype: str
    """
    if not Configuration.exists(celery_key):
        Configuration.set(celery_key, {})
    jobs = Configuration.get(celery_key)
    if state == 'present':
        if disabled:
            # A None value marks the task as explicitly disabled in the schedule
            jobs[task_name] = None
            output = 'task {0}: disabled'.format(task_name)
        else:
            jobs[task_name] = cron
            settings = ''
            for key, value in cron.iteritems():
                settings += "{0}: {1} ".format(key, value)
            output = 'task {0}: cron settings {1}'.format(task_name, settings)
    else:
        # Remove the custom entry so the default schedule applies again
        jobs.pop(task_name, None)
        output = 'task {0}: removed, default settings will be applied.'.format(task_name)
    Configuration.set(celery_key, jobs)

    # Restart the scheduled-tasks service on every master so the new schedule is picked up
    service_name = 'scheduled-tasks'
    service_manager = ServiceFactory.get_manager()
    for storagerouter in StorageRouterList.get_masters():
        client = SSHClient(storagerouter, username='******')
        service_manager.restart_service(service_name, client=client)
    return output
def _stack(self):
    """
    Returns an overview of this node's storage stack: a dict keyed on slot id, each
    value holding a status and an 'osds' dict, enriched with model, alba and usage info.
    """
    from ovs.dal.hybrids.albabackend import AlbaBackend
    from ovs.dal.lists.albabackendlist import AlbaBackendList

    def _move(info):
        # Rename legacy 'state'/'state_detail' keys to 'status'/'status_detail' in place
        for move in [('state', 'status'), ('state_detail', 'status_detail')]:
            if move[0] in info:
                info[move[1]] = info[move[0]]
                del info[move[0]]

    stack = {}
    node_down = False
    # Fetch stack from asd-manager
    try:
        remote_stack = self.client.get_stack()
        for slot_id, slot_data in remote_stack.iteritems():
            stack[slot_id] = {'status': 'ok'}
            stack[slot_id].update(slot_data)
            # Migrate state > status
            _move(stack[slot_id])
            for osd_data in slot_data.get('osds', {}).itervalues():
                _move(osd_data)
    except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
        self._logger.warning('Error during stack retrieval. Assuming that the node is down')
        node_down = True

    model_osds = {}
    found_osds = {}  # Cache of 'list-all-osds' output per backend guid, keyed on long_id
    # Apply own model to fetched stack
    for osd in self.osds:
        model_osds[osd.osd_id] = osd  # Initially set the info
        if osd.slot_id not in stack:
            # Slot known in the model but not reported by the node
            stack[osd.slot_id] = {'status': self.OSD_STATUSES.UNKNOWN if node_down is True else self.OSD_STATUSES.MISSING,
                                  'status_detail': self.OSD_STATUS_DETAILS.NODEDOWN if node_down is True else '',
                                  'osds': {}}
        osd_data = stack[osd.slot_id]['osds'].get(osd.osd_id, {})
        stack[osd.slot_id]['osds'][osd.osd_id] = osd_data  # Initially set the info in the stack
        osd_data.update(osd.stack_info)
        if node_down is True:
            osd_data['status'] = self.OSD_STATUSES.UNKNOWN
            osd_data['status_detail'] = self.OSD_STATUS_DETAILS.NODEDOWN
        elif osd.alba_backend_guid is not None:  # Osds has been claimed
            # Load information from alba (once per backend guid)
            if osd.alba_backend_guid not in found_osds:
                found_osds[osd.alba_backend_guid] = {}
                if osd.alba_backend.abm_cluster is not None:
                    config = Configuration.get_configuration_path(osd.alba_backend.abm_cluster.config_location)
                    try:
                        for found_osd in AlbaCLI.run(command='list-all-osds', config=config):
                            found_osds[osd.alba_backend_guid][found_osd['long_id']] = found_osd
                    except (AlbaError, RuntimeError):
                        self._logger.exception('Listing all osds has failed')
                        osd_data['status'] = self.OSD_STATUSES.UNKNOWN
                        osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ALBAERROR
                        continue
            if osd.osd_id not in found_osds[osd.alba_backend_guid]:
                # Not claimed by any backend thus not in use
                continue
            found_osd = found_osds[osd.alba_backend_guid][osd.osd_id]
            if found_osd['decommissioned'] is True:
                osd_data['status'] = self.OSD_STATUSES.UNAVAILABLE
                osd_data['status_detail'] = self.OSD_STATUS_DETAILS.DECOMMISSIONED
                continue
            # Per-backend error interval overrides the global one
            backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(osd.alba_backend_guid)
            if Configuration.exists(backend_interval_key):
                interval = Configuration.get(backend_interval_key)
            else:
                interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval')
            read = found_osd['read'] or [0]
            write = found_osd['write'] or [0]
            errors = found_osd['errors']
            # Default to WARNING; upgraded to OK when the latest successful IO is more
            # recent than the last error plus the configured interval
            osd_data['status'] = self.OSD_STATUSES.WARNING
            osd_data['status_detail'] = self.OSD_STATUS_DETAILS.ERROR
            if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                osd_data['status'] = self.OSD_STATUSES.OK
                osd_data['status_detail'] = ''

    statistics = {}  # Cached osd_statistics per AlbaBackend
    for slot_info in stack.itervalues():
        for osd_id, osd in slot_info['osds'].iteritems():
            if osd.get('status_detail') == self.OSD_STATUS_DETAILS.ACTIVATING:
                osd['claimed_by'] = 'unknown'  # We won't be able to connect to it just yet
                continue
            if osd_id not in model_osds:
                # The osd is known by the remote node but not in the model
                # In that case, let's connect to the OSD to see whether we get some info from it
                try:
                    ips = osd['hosts'] if 'hosts' in osd and len(osd['hosts']) > 0 else osd.get('ips', [])
                    port = osd['port']
                    claimed_by = 'unknown'
                    for ip in ips:
                        try:
                            # Output will be None if it is not claimed
                            claimed_by = AlbaCLI.run('get-osd-claimed-by', named_params={'host': ip, 'port': port})
                            break
                        except (AlbaError, RuntimeError):
                            self._logger.warning('get-osd-claimed-by failed for IP:port {0}:{1}'.format(ip, port))
                    alba_backend = AlbaBackendList.get_by_alba_id(claimed_by)
                    osd['claimed_by'] = alba_backend.guid if alba_backend is not None else claimed_by
                except KeyError:
                    osd['claimed_by'] = 'unknown'
                except:
                    self._logger.exception('Could not load OSD info: {0}'.format(osd_id))
                    osd['claimed_by'] = 'unknown'
                if osd.get('status') not in ['error', 'warning']:
                    osd['status'] = self.OSD_STATUSES.ERROR
                    osd['status_detail'] = self.OSD_STATUS_DETAILS.UNREACHABLE
            claimed_by = osd.get('claimed_by', 'unknown')
            if claimed_by == 'unknown':
                continue
            try:
                alba_backend = AlbaBackend(claimed_by)
            except ObjectNotFoundException:
                continue
            # Add usage information
            if alba_backend not in statistics:
                statistics[alba_backend] = alba_backend.osd_statistics
            osd_statistics = statistics[alba_backend]
            if osd_id not in osd_statistics:
                continue
            stats = osd_statistics[osd_id]
            osd['usage'] = {'size': int(stats['capacity']),
                            'used': int(stats['disk_usage']),
                            'available': int(stats['capacity'] - stats['disk_usage'])}
    return stack
def get(self, request, *args, **kwargs):
    """
    Fetches metadata: plugin information, cluster identification and the authentication
    state of the caller. Returns a (HttpResponse class, data dict) tuple in all paths.
    """
    _ = args, kwargs
    data = {'authenticated': False,
            'authentication_state': None,
            'authentication_metadata': {},
            'username': None,
            'userguid': None,
            'roles': [],
            'identification': {},
            'storagerouter_ips': [sr.ip for sr in StorageRouterList.get_storagerouters()],
            'versions': list(settings.VERSION),
            'plugins': {}}
    try:
        # Gather plugin metadata
        plugins = {}
        # - Backends. BackendType plugins must set the has_plugin flag on True
        for backend_type in BackendTypeList.get_backend_types():
            if backend_type.has_plugin is True:
                if backend_type.code not in plugins:
                    plugins[backend_type.code] = []
                plugins[backend_type.code] += ['backend', 'gui']
        # - Generic plugins, as added to the configuration file(s)
        generic_plugins = Configuration.get('ovs.plugins.generic')
        for plugin_name in generic_plugins:
            if plugin_name not in plugins:
                plugins[plugin_name] = []
            plugins[plugin_name] += ['gui']
        data['plugins'] = plugins

        # Fill identification
        data['identification'] = {'cluster_id': Configuration.get('ovs.support.cid')}

        # Get authentication metadata
        authentication_metadata = {'ip': System.get_my_storagerouter().ip}
        for key in ['mode', 'authorize_uri', 'client_id', 'scope']:
            if Configuration.exists('ovs.webapps.oauth2.{0}'.format(key)):
                authentication_metadata[key] = Configuration.get('ovs.webapps.oauth2.{0}'.format(key))
        data['authentication_metadata'] = authentication_metadata

        # Gather authorization metadata
        if 'HTTP_AUTHORIZATION' not in request.META:
            return HttpResponse, dict(data.items() + {'authentication_state': 'unauthenticated'}.items())
        authorization_type, access_token = request.META['HTTP_AUTHORIZATION'].split(' ')
        if authorization_type != 'Bearer':
            return HttpResponse, dict(data.items() + {'authentication_state': 'invalid_authorization_type'}.items())
        tokens = BearerTokenList.get_by_access_token(access_token)
        if len(tokens) != 1:
            return HttpResponse, dict(data.items() + {'authentication_state': 'invalid_token'}.items())
        token = tokens[0]
        if token.expiration < time.time():
            # Expired token: clean up the token and its role junctions before reporting
            for junction in token.roles.itersafe():
                junction.delete()
            token.delete()
            return HttpResponse, dict(data.items() + {'authentication_state': 'token_expired'}.items())

        # Gather user metadata
        user = token.client.user
        if not user.is_active:
            return HttpResponse, dict(data.items() + {'authentication_state': 'inactive_user'}.items())
        roles = [j.role.code for j in token.roles]

        return HttpResponse, dict(data.items() + {'authenticated': True,
                                                  'authentication_state': 'authenticated',
                                                  'username': user.username,
                                                  'userguid': user.guid,
                                                  'roles': roles,
                                                  'plugins': plugins}.items())
    except Exception as ex:
        logger.exception('Unexpected exception: {0}'.format(ex))
        return HttpResponse, dict(data.items() + {'authentication_state': 'unexpected_exception'}.items())
def _live_status(self):
    """
    Retrieve the live status of the ALBA Backend to be displayed in the 'Backends' page in the GUI based on:
        - Maintenance agents presence
        - Maintenance agents status
        - Disk statuses
    :return: Status as reported by the plugin
    :rtype: str
    """
    if self.backend.status == Backend.STATUSES.INSTALLING:
        return 'installing'

    if self.backend.status == Backend.STATUSES.DELETING:
        return 'deleting'

    # Verify failed disks
    devices = self.local_summary['devices']
    if devices['red'] > 0:
        self._logger.warning('AlbaBackend {0} STATUS set to FAILURE due to {1} failed disks'.format(self.name, devices['red']))
        return AlbaBackend.STATUSES.FAILURE

    # Verify remote OSDs: a failing or unknown remote backend fails this one too;
    # 'not_allowed' errors and WARNING statuses are remembered and reported later
    remote_errors = False
    linked_backend_warning = False
    for remote_info in self.remote_stack.itervalues():
        if remote_info['error'] == 'unknown' or remote_info['live_status'] == AlbaBackend.STATUSES.FAILURE:
            message = None
            if remote_info['error'] == 'unknown':
                message = 'unknown remote error info'
            elif remote_info['live_status'] == AlbaBackend.STATUSES.FAILURE:
                message = 'FAILURE in live_status'
            self._logger.warning('AlbaBackend {0} STATUS set to FAILURE due to OSD {1}: {2} '.format(self.name, remote_info['name'], message))
            return AlbaBackend.STATUSES.FAILURE
        if remote_info['error'] == 'not_allowed':
            remote_errors = True
        if remote_info['live_status'] == AlbaBackend.STATUSES.WARNING:
            linked_backend_warning = True

    # Retrieve ASD and maintenance service information
    def _get_node_information(_node):
        # Runs in a worker thread; mutates the shared dicts/sets declared below.
        if _node not in nodes_used_by_this_backend:
            for slot_info in _node.stack.itervalues():
                for osd_info in slot_info['osds'].itervalues():
                    if osd_info['claimed_by'] == self.guid:
                        nodes_used_by_this_backend.add(_node)
                        break
                if _node in nodes_used_by_this_backend:
                    break
        try:
            services = _node.maintenance_services
            if self.name in services:
                for _service_name, _service_status in services[self.name]:
                    services_for_this_backend[_service_name] = _node
                    service_states[_service_name] = _service_status
                    if _node.node_id not in services_per_node:
                        services_per_node[_node.node_id] = 0
                    services_per_node[_node.node_id] += 1
        except Exception:
            # Best-effort: an unreachable node simply contributes no service info
            pass

    services_for_this_backend = {}
    services_per_node = {}
    service_states = {}
    nodes_used_by_this_backend = set()
    threads = []
    all_nodes = AlbaNodeList.get_albanodes()
    # Query all nodes concurrently; each thread fills the shared structures above
    for node in all_nodes:
        thread = Thread(target=_get_node_information, args=(node, ))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    zero_services = False
    if len(services_for_this_backend) == 0:
        if len(all_nodes) > 0:
            AlbaBackend._logger.error('AlbaBackend {0} STATUS set to FAILURE due to no maintenance services'.format(self.name))
            return AlbaBackend.STATUSES.FAILURE
        zero_services = True

    # Verify maintenance agents status
    for service_name, node in services_for_this_backend.iteritems():
        try:
            service_status = service_states.get(service_name)
            if service_status is None or service_status != 'active':
                AlbaBackend._logger.error('AlbaBackend {0} STATUS set to FAILURE due to non-running maintenance service(s): {1}'.format(self.name, service_name))
                return AlbaBackend.STATUSES.FAILURE
        except Exception:
            pass

    # Verify maintenance agents presence: either an explicit layout of node ids or a
    # configured (or default 3) number of agents is expected
    layout_key = '/ovs/alba/backends/{0}/maintenance/agents_layout'.format(self.guid)
    layout = None
    if Configuration.exists(layout_key):
        layout = Configuration.get(layout_key)
        # An invalid layout (wrong type or referencing no known node) falls back to counting
        if not isinstance(layout, list) or not any(node.node_id for node in all_nodes if node.node_id in layout):
            layout = None

    if layout is None:
        config_key = '/ovs/alba/backends/{0}/maintenance/nr_of_agents'.format(self.guid)
        expected_services = 3
        if Configuration.exists(config_key):
            expected_services = Configuration.get(config_key)
        # Cap at the number of nodes actually used by this backend, minimum 1
        expected_services = min(expected_services, len(nodes_used_by_this_backend)) or 1
        if len(services_for_this_backend) < expected_services:
            AlbaBackend._logger.warning('Live status for backend {0} is "warning": insufficient maintenance services'.format(self.name))
            return AlbaBackend.STATUSES.WARNING
    else:
        for node_id in layout:
            if node_id not in services_per_node:
                AlbaBackend._logger.warning('Live status for backend {0} is "warning": invalid maintenance service layout'.format(self.name))
                return AlbaBackend.STATUSES.WARNING

    # Verify local and remote OSDs
    if devices['orange'] > 0:
        AlbaBackend._logger.warning('Live status for backend {0} is "warning": one or more OSDs in warning'.format(self.name))
        return AlbaBackend.STATUSES.WARNING

    if remote_errors is True or linked_backend_warning is True:
        AlbaBackend._logger.warning('Live status for backend {0} is "warning": errors/warnings on remote stack'.format(self.name))
        return AlbaBackend.STATUSES.WARNING
    if zero_services is True:
        AlbaBackend._logger.warning('Live status for backend {0} is "warning": no maintenance services'.format(self.name))
        return AlbaBackend.STATUSES.WARNING

    return AlbaBackend.STATUSES.RUNNING
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from a given version to the current version. It uses 'previous_version' to be smart
    wherever possible, but the code should be able to migrate any version towards the expected version.
    When this is not possible, the code can set a minimum version and raise when it is not met.
    :param previous_version: The previous version from which to start the migration
    :type previous_version: float
    :param master_ips: IP addresses of the MASTER nodes
    :type master_ips: list or None
    :param extra_ips: IP addresses of the EXTRA nodes
    :type extra_ips: list or None
    :return: The version reached by this migration run
    :rtype: int
    """
    _ = master_ips, extra_ips  # Accepted for interface compatibility, not used by this migrator
    working_version = previous_version

    # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
    if working_version < ExtensionMigrator.THIS_VERSION:
        try:
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.dal.lists.vpoollist import VPoolList
            from ovs.extensions.generic.configuration import Configuration
            from ovs.extensions.services.servicefactory import ServiceFactory  # NOTE(review): imported but unused; kept in case the import has registration side effects
            from ovs.extensions.generic.sshclient import SSHClient
            from ovs.extensions.generic.system import System

            local_machine_id = System.get_my_machine_id()
            local_ip = Configuration.get('/ovs/framework/hosts/{0}/ip'.format(local_machine_id))
            local_client = SSHClient(endpoint=local_ip, username='******')

            # Multiple Proxies: the old single-proxy config directory is obsolete and removed
            if local_client.dir_exists(directory='/opt/OpenvStorage/config/storagedriver/storagedriver'):
                local_client.dir_delete(directories=['/opt/OpenvStorage/config/storagedriver/storagedriver'])

            # MDS safety granularity on vPool level: move the framework-wide MDS settings
            # to a per-vPool 'mds_config' key, then drop the global key
            mds_safety_key = '/ovs/framework/storagedriver'
            if Configuration.exists(key=mds_safety_key):
                current_mds_settings = Configuration.get(key=mds_safety_key)
                for vpool in VPoolList.get_vpools():
                    vpool_key = '/ovs/vpools/{0}'.format(vpool.guid)
                    if Configuration.dir_exists(key=vpool_key):
                        Configuration.set(key='{0}/mds_config'.format(vpool_key),
                                          value=current_mds_settings)
                Configuration.delete(key=mds_safety_key)

            # Introduction of edition key: derive the edition from the first StorageRouter
            # that reports it; best-effort per StorageRouter
            if Configuration.get(key=Configuration.EDITION_KEY, default=None) not in [PackageFactory.EDITION_COMMUNITY, PackageFactory.EDITION_ENTERPRISE]:
                for storagerouter in StorageRouterList.get_storagerouters():
                    try:
                        Configuration.set(key=Configuration.EDITION_KEY,
                                          value=storagerouter.features['alba']['edition'])
                        break
                    except Exception:  # Was a bare 'except:' - must not swallow SystemExit/KeyboardInterrupt
                        continue
        except Exception:  # Was a bare 'except:' - must not swallow SystemExit/KeyboardInterrupt
            ExtensionMigrator._logger.exception('Error occurred while executing the migration code')
            # Don't update migration version with latest version, resulting in next migration trying again to execute this code
            return ExtensionMigrator.THIS_VERSION - 1
    return ExtensionMigrator.THIS_VERSION
def create_hprm_config_files(vpool_guid, local_storagerouter_guid, parameters):
    """
    Create the required configuration files to be able to make use of HPRM (aka PRACC)
    This configuration will be zipped and made available for download
    :param vpool_guid: The guid of the VPool for which a HPRM manager needs to be deployed
    :type vpool_guid: str
    :param local_storagerouter_guid: The guid of the StorageRouter the API was requested on
    :type local_storagerouter_guid: str
    :param parameters: Additional information required for the HPRM configuration files
    :type parameters: dict
    :return: Name of the zipfile containing the configuration files
    :rtype: str
    """
    # Validations: 'port' and 'identifier' are always mandatory; cache-specific
    # requirements are added to required_params incrementally below
    required_params = {'port': (int, {'min': 1, 'max': 65535}),
                       'identifier': (str, ExtensionsToolbox.regex_vpool)}
    ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                             required_params=required_params)
    vpool = VPool(vpool_guid)
    identifier = parameters['identifier']
    config_path = None
    local_storagerouter = StorageRouter(local_storagerouter_guid)
    # Any StorageDriver's first proxy config serves as the template; every
    # StorageDriver must have at least one ALBA proxy
    for sd in vpool.storagedrivers:
        if len(sd.alba_proxies) == 0:
            raise ValueError('No ALBA proxies configured for vPool {0} on StorageRouter {1}'.format(vpool.name, sd.storagerouter.name))
        config_path = '/ovs/vpools/{0}/proxies/{1}/config/{{0}}'.format(vpool.guid, sd.alba_proxies[0].guid)
    if config_path is None:
        # vPool without StorageDrivers: nothing to base the HPRM config on
        raise ValueError('vPool {0} has not been extended any StorageRouter'.format(vpool.name))
    proxy_cfg = Configuration.get(key=config_path.format('main'))

    cache_info = {}
    arakoons = {}
    cache_types = VPool.CACHES.values()
    if not any(ctype in parameters for ctype in cache_types):
        raise ValueError('At least one cache type should be passed: {0}'.format(', '.join(cache_types)))
    for ctype in cache_types:
        if ctype not in parameters:
            continue
        # 'read'/'write' flags are mandatory for every requested cache type;
        # required_dict is extended further down depending on local vs ALBA caching
        required_dict = {'read': (bool, None),
                         'write': (bool, None)}
        required_params.update({ctype: (dict, required_dict)})
        ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                                 required_params=required_params)
        read = parameters[ctype]['read']
        write = parameters[ctype]['write']
        if read is False and write is False:
            # Caching explicitly disabled for this type
            cache_info[ctype] = ['none']
            continue
        path = parameters[ctype].get('path')
        if path is not None:
            # Local (on-disk) cache
            path = path.strip()
            if not path or path.endswith('/.') or '..' in path or '/./' in path:
                # Reject empty and traversal-style paths
                raise ValueError('Invalid path specified')
            required_dict.update({'path': (str, None),
                                  'size': (int, {'min': 1, 'max': 10 * 1024})})  # Size in GiB
            ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                                     required_params=required_params)
            # Collapse duplicate slashes before handing the path to the proxy
            while '//' in path:
                path = path.replace('//', '/')
            cache_info[ctype] = ['local', {'path': path,
                                           'max_size': parameters[ctype]['size'] * 1024 ** 3,  # GiB -> bytes
                                           'cache_on_read': read,
                                           'cache_on_write': write}]
        else:
            # ALBA Backend-backed cache: requires backend and connection information
            required_dict.update({'backend_info': (dict, {'preset': (str, ExtensionsToolbox.regex_preset),
                                                          'alba_backend_guid': (str, ExtensionsToolbox.regex_guid),
                                                          'alba_backend_name': (str, ExtensionsToolbox.regex_backend)}),
                                  'connection_info': (dict, {'host': (str, ExtensionsToolbox.regex_ip, False),
                                                             'port': (int, {'min': 1, 'max': 65535}, False),
                                                             'client_id': (str, ExtensionsToolbox.regex_guid, False),
                                                             'client_secret': (str, None, False)})})
            ExtensionsToolbox.verify_required_params(actual_params=parameters,
                                                     required_params=required_params)
            connection_info = parameters[ctype]['connection_info']
            if connection_info['host']:  # Remote Backend for accelerated Backend
                alba_backend_guid = parameters[ctype]['backend_info']['alba_backend_guid']
                ovs_client = OVSClient.get_instance(connection_info=connection_info)
                arakoon_config = VPoolShared.retrieve_alba_arakoon_config(alba_backend_guid=alba_backend_guid,
                                                                          ovs_client=ovs_client)
                arakoons[ctype] = ArakoonClusterConfig.convert_config_to(arakoon_config, return_type='INI')
            else:  # Local Backend for accelerated Backend
                alba_backend_name = parameters[ctype]['backend_info']['alba_backend_name']
                if Configuration.exists(key='/ovs/arakoon/{0}-abm/config'.format(alba_backend_name), raw=True) is False:
                    raise ValueError('Arakoon cluster for ALBA Backend {0} could not be retrieved'.format(alba_backend_name))
                arakoons[ctype] = Configuration.get(key='/ovs/arakoon/{0}-abm/config'.format(alba_backend_name), raw=True)
            # The arakoon INI will be shipped in the tgz; the config below points at
            # its install location on the HPRM host
            cache_info[ctype] = ['alba', {'albamgr_cfg_url': '/etc/hprm/{0}/{1}_cache_arakoon.ini'.format(identifier, ctype),
                                          'bucket_strategy': ['1-to-1', {'prefix': vpool.guid,
                                                                         'preset': parameters[ctype]['backend_info']['preset']}],
                                          'manifest_cache_size': proxy_cfg['manifest_cache_size'],
                                          'cache_on_read': read,
                                          'cache_on_write': write}]

    tgz_name = 'hprm_config_files_{0}_{1}.tgz'.format(identifier, vpool.name)
    # Main HPRM/PRACC config; several values are inherited from the proxy config
    config = {'ips': ['127.0.0.1'],
              'port': parameters['port'],
              'pracc': {'uds_path': '/var/run/hprm/{0}/uds_path'.format(identifier),
                        'max_clients': 1000,
                        'max_read_buf_size': 64 * 1024,  # Buffer size for incoming requests (in bytes)
                        'thread_pool_size': 64},  # Amount of threads
              'transport': 'tcp',
              'log_level': 'info',
              'read_preference': proxy_cfg['read_preference'],
              'albamgr_cfg_url': '/etc/hprm/{0}/arakoon.ini'.format(identifier),
              'manifest_cache_size': proxy_cfg['manifest_cache_size']}
    # Map file path -> file contents for everything that goes into the tgz
    file_contents_map = {}
    for ctype in cache_types:
        if ctype in cache_info:
            config['{0}_cache'.format(ctype)] = cache_info[ctype]
        if ctype in arakoons:
            file_contents_map['/opt/OpenvStorage/config/{0}/{1}_cache_arakoon.ini'.format(identifier, ctype)] = arakoons[ctype]
    file_contents_map.update({'/opt/OpenvStorage/config/{0}/config.json'.format(identifier): json.dumps(config, indent=4),
                              '/opt/OpenvStorage/config/{0}/arakoon.ini'.format(identifier): Configuration.get(key=config_path.format('abm'), raw=True)})
    # Write the files on the requesting StorageRouter, tar them into the frontend
    # downloads directory, then clean up the staging directory
    local_client = SSHClient(endpoint=local_storagerouter)
    local_client.dir_create(directories='/opt/OpenvStorage/config/{0}'.format(identifier))
    local_client.dir_create(directories='/opt/OpenvStorage/webapps/frontend/downloads')
    for file_name, contents in file_contents_map.iteritems():
        local_client.file_write(contents=contents, filename=file_name)
    # --transform strips the 'config/' prefix inside the archive
    local_client.run(command=['tar', '--transform', 's#^config/{0}#{0}#'.format(identifier),
                              '-czf', '/opt/OpenvStorage/webapps/frontend/downloads/{0}'.format(tgz_name),
                              'config/{0}'.format(identifier)])
    local_client.dir_delete(directories='/opt/OpenvStorage/config/{0}'.format(identifier))
    return tgz_name
def _local_stack(self):
    """
    Returns a live list of all disks known to this AlbaBackend

    Builds a nested mapping: node_id -> disk_id -> disk info (including an 'asds' dict of
    OSD info). Model data is loaded first, then overlaid with live data fetched from each
    node (in parallel threads) and with OSD state reported by ALBA itself.
    :return: Mapping of node id to disk/OSD information
    :rtype: dict
    """
    from ovs.dal.lists.albanodelist import AlbaNodeList
    from ovs.dal.lists.albabackendlist import AlbaBackendList
    if len(self.abm_services) == 0:
        return {}  # No ABM services yet, so backend not fully installed yet

    # alba_id -> AlbaBackend, used to resolve which backend claimed an OSD
    alba_backend_map = {}
    for alba_backend in AlbaBackendList.get_albabackends():
        alba_backend_map[alba_backend.alba_id] = alba_backend

    # Load information based on the model; everything starts in 'error/unknown'
    # and is upgraded as live information comes in
    asd_map = {}
    storage_map = {}
    alba_nodes = AlbaNodeList.get_albanodes()
    for node in alba_nodes:
        node_id = node.node_id
        storage_map[node_id] = {}
        for disk in node.disks:
            disk_id = disk.aliases[0].split('/')[-1]
            storage_map[node_id][disk_id] = {'asds': {},
                                             'name': disk_id,
                                             'guid': disk.guid,
                                             'status': 'error',
                                             'aliases': disk.aliases,
                                             'status_detail': 'unknown'}
            for osd in disk.osds:
                osd_id = osd.osd_id
                data = {'asd_id': osd_id,
                        'guid': osd.guid,
                        'status': 'error',
                        'status_detail': 'unknown',
                        'alba_backend_guid': osd.alba_backend_guid}
                # data is shared by reference between asd_map and storage_map,
                # so later updates through either are visible in both
                asd_map[osd_id] = data
                storage_map[node_id][disk_id]['asds'][osd_id] = data

    # Load information from node: merge the live storage stack into _node_data in place
    def _load_live_info(_node, _node_data):
        _data = _node.storage_stack
        if _data['status'] != 'ok':
            # Node unreachable/unhealthy: propagate the node status to all its disks/ASDs
            for disk_entry in _node_data.values():
                disk_entry['status_detail'] = _data['status']
                for entry in disk_entry.get('asds', {}).values():
                    entry['status_detail'] = _data['status']
        else:
            for _disk_id, disk_asd_info in _data['stack'].iteritems():
                if _disk_id not in _node_data:
                    _node_data[_disk_id] = {'asds': {}}
                entry = _node_data[_disk_id]
                # Copy disk-level info without clobbering the nested 'asds' dict
                disk_info = copy.deepcopy(disk_asd_info)
                del disk_info['asds']
                entry.update(disk_info)
                asds_info = disk_asd_info['asds']
                for _asd_id, asd_info in asds_info.iteritems():
                    if _asd_id not in _node_data[_disk_id]['asds']:
                        _node_data[_disk_id]['asds'][_asd_id] = asd_info
                    else:
                        _node_data[_disk_id]['asds'][_asd_id].update(asd_info)

    # One thread per node; each thread only mutates its own node's sub-dict
    threads = []
    for node in alba_nodes:
        thread = Thread(target=_load_live_info, args=(node, storage_map[node.node_id]))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    # Mix in usage information
    for asd_id, stats in self.asd_statistics.iteritems():
        if asd_id in asd_map:
            asd_map[asd_id]['usage'] = {'size': int(stats['capacity']),
                                        'used': int(stats['disk_usage']),
                                        'available': int(stats['capacity'] - stats['disk_usage'])}

    # Load information from alba
    # Per-backend error interval override falls back to the global setting
    backend_interval_key = '/ovs/alba/backends/{0}/gui_error_interval'.format(self.guid)
    if Configuration.exists(backend_interval_key):
        interval = Configuration.get(backend_interval_key)
    else:
        interval = Configuration.get('/ovs/alba/backends/global_gui_error_interval')
    config = Configuration.get_configuration_path('/ovs/arakoon/{0}-abm/config'.format(self.name))
    asds = {}
    for found_osd in AlbaCLI.run(command='list-all-osds', config=config):
        asds[found_osd['long_id']] = found_osd
    for node_data in storage_map.values():
        for _disk in node_data.values():
            for asd_id, asd_data in _disk['asds'].iteritems():
                if asd_id not in asds:
                    continue
                found_osd = asds[asd_id]
                if 'state' not in asd_data:
                    continue
                if found_osd.get('decommissioned') is True:
                    asd_data['status'] = 'unavailable'
                    asd_data['status_detail'] = 'decommissioned'
                    continue
                state = asd_data['state']
                if state == 'ok':
                    if found_osd['id'] is None:
                        # Not claimed by this backend; check whether another backend claimed it
                        alba_id = found_osd['alba_id']
                        if alba_id is None:
                            asd_data['status'] = 'available'
                        else:
                            asd_data['status'] = 'unavailable'
                            alba_backend = alba_backend_map.get(alba_id)
                            if alba_backend is not None:
                                asd_data['alba_backend_guid'] = alba_backend.guid
                    else:
                        # Claimed by this backend; default to 'warning' and upgrade to
                        # 'claimed' when no recent errors were reported by ALBA
                        asd_data['alba_backend_guid'] = self.guid
                        asd_data['status'] = 'warning'
                        asd_data['status_detail'] = 'recenterrors'

                        read = found_osd['read'] or [0]
                        write = found_osd['write'] or [0]
                        errors = found_osd['errors']
                        # An OSD is 'claimed' if it has no errors, or if successful I/O
                        # happened at least 'interval' after the most recent error
                        if len(errors) == 0 or (len(read + write) > 0 and max(min(read), min(write)) > max(error[0] for error in errors) + interval):
                            asd_data['status'] = 'claimed'
                            asd_data['status_detail'] = ''
                else:
                    asd_data['status'] = 'error'
                    asd_data['status_detail'] = asd_data.get('state_detail', '')
                    alba_backend = alba_backend_map.get(found_osd.get('alba_id'))
                    if alba_backend is not None:
                        asd_data['alba_backend_guid'] = alba_backend.guid
    return storage_map
def get(self, request, *args, **kwargs):
    """
    Fetches metadata

    Builds a dictionary describing the cluster (installed plugins, identification,
    authentication metadata) together with the authentication state of the caller,
    resolved from the Bearer token in the HTTP_AUTHORIZATION header.
    :param request: Incoming HTTP request
    :return: Metadata dictionary including the resolved 'authentication_state'
    :rtype: dict
    """
    _ = args, kwargs
    base_response = {'authenticated': False,
                     'authentication_state': None,
                     'authentication_metadata': {},
                     'username': None,
                     'userguid': None,
                     'roles': [],
                     'identification': {},
                     'storagerouter_ips': [sr.ip for sr in StorageRouterList.get_storagerouters()],
                     'versions': list(settings.VERSION),
                     'plugins': {}}

    def _finalize(overrides):
        # Return a copy of the (possibly partially filled) base response with the
        # given overrides applied; later keys win, base_response is not mutated
        merged = dict(base_response)
        merged.update(overrides)
        return merged

    try:
        # Gather plugin metadata
        plugins = {}
        # - Backends. BackendType plugins must set the has_plugin flag on True
        for backend_type in BackendTypeList.get_backend_types():
            if backend_type.has_plugin is True:
                plugins.setdefault(backend_type.code, []).extend(['backend', 'gui'])
        # - Generic plugins, as added to the configuration file(s)
        generic_plugins = Configuration.get('/ovs/framework/plugins/installed|generic')
        for plugin_name in generic_plugins:
            plugins.setdefault(plugin_name, []).append('gui')
        base_response['plugins'] = plugins

        # Fill identification
        base_response['identification'] = {'cluster_id': Configuration.get('/ovs/framework/cluster_id')}

        # Get authentication metadata
        auth_metadata = {'ip': System.get_my_storagerouter().ip}
        for oauth2_key in ['mode', 'authorize_uri', 'client_id', 'scope']:
            config_location = '/ovs/framework/webapps|oauth2.{0}'.format(oauth2_key)
            if Configuration.exists(config_location):
                auth_metadata[oauth2_key] = Configuration.get(config_location)
        base_response['authentication_metadata'] = auth_metadata

        # Gather authorization metadata; every failed guard returns immediately
        auth_header = request.META.get('HTTP_AUTHORIZATION')
        if auth_header is None:
            return _finalize({'authentication_state': 'unauthenticated'})
        authorization_type, access_token = auth_header.split(' ')
        if authorization_type != 'Bearer':
            return _finalize({'authentication_state': 'invalid_authorization_type'})
        tokens = BearerTokenList.get_by_access_token(access_token)
        if len(tokens) != 1:
            return _finalize({'authentication_state': 'invalid_token'})
        token = tokens[0]
        if token.expiration < time.time():
            # Expired token: remove the token and its role junctions before replying
            for junction in token.roles.itersafe():
                junction.delete()
            token.delete()
            return _finalize({'authentication_state': 'token_expired'})

        # Gather user metadata
        user = token.client.user
        if not user.is_active:
            return _finalize({'authentication_state': 'inactive_user'})
        role_codes = [junction.role.code for junction in token.roles]

        return _finalize({'authenticated': True,
                          'authentication_state': 'authenticated',
                          'username': user.username,
                          'userguid': user.guid,
                          'roles': role_codes,
                          'plugins': plugins})
    except Exception as ex:
        MetadataView._logger.exception('Unexpected exception: {0}'.format(ex))
        return _finalize({'authentication_state': 'unexpected_exception'})
def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed. This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...

    Each migration phase below is guarded by a '/ovs/framework/migration|...' flag so it
    only runs once; failures are logged and leave the flag unset so the phase is retried
    on the next run.
    """
    AlbaMigrationController._logger.info('Preparing out of band migrations...')

    from ovs.dal.hybrids.diskpartition import DiskPartition
    from ovs.dal.lists.albabackendlist import AlbaBackendList
    from ovs.dal.lists.albanodelist import AlbaNodeList
    from ovs.dal.lists.albaosdlist import AlbaOSDList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
    from ovs.extensions.migration.migration.albamigrator import ExtensionMigrator
    from ovs.extensions.packages.albapackagefactory import PackageFactory
    from ovs.extensions.services.albaservicefactory import ServiceFactory
    from ovs.extensions.plugins.albacli import AlbaCLI, AlbaError
    from ovs.lib.alba import AlbaController
    from ovs.lib.disk import DiskController

    AlbaMigrationController._logger.info('Start out of band migrations...')

    #############################################
    # Introduction of IP:port combination on OSDs
    # Build osd long_id -> {ips, port} from ALBA, then backfill missing fields on the model
    osd_info_map = {}
    alba_backends = AlbaBackendList.get_albabackends()
    for alba_backend in alba_backends:
        AlbaMigrationController._logger.info('Verifying ALBA Backend {0}'.format(alba_backend.name))
        if alba_backend.abm_cluster is None:
            AlbaMigrationController._logger.warning('ALBA Backend {0} does not have an ABM cluster registered'.format(alba_backend.name))
            continue

        AlbaMigrationController._logger.debug('Retrieving configuration path for ALBA Backend {0}'.format(alba_backend.name))
        try:
            config = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
        except:
            # NOTE(review): bare except - also swallows SystemExit/KeyboardInterrupt
            AlbaMigrationController._logger.exception('Failed to retrieve the configuration path for ALBA Backend {0}'.format(alba_backend.name))
            continue

        AlbaMigrationController._logger.info('Retrieving OSD information for ALBA Backend {0}'.format(alba_backend.name))
        try:
            osd_info = AlbaCLI.run(command='list-all-osds', config=config)
        except (AlbaError, RuntimeError):
            AlbaMigrationController._logger.exception('Failed to retrieve OSD information for ALBA Backend {0}'.format(alba_backend.name))
            continue

        # NOTE(review): the loop variable shadows the 'osd_info' list it iterates over;
        # works in Python, but confusing
        for osd_info in osd_info:
            if osd_info.get('long_id'):
                osd_info_map[osd_info['long_id']] = {'ips': osd_info.get('ips', []),
                                                     'port': osd_info.get('port')}

    for osd in AlbaOSDList.get_albaosds():
        if osd.osd_id not in osd_info_map:
            AlbaMigrationController._logger.warning('OSD with ID {0} is modelled but could not be found through ALBA'.format(osd.osd_id))
            continue

        ips = osd_info_map[osd.osd_id]['ips']
        port = osd_info_map[osd.osd_id]['port']
        # Only fill in fields that are still unset; never overwrite existing values
        changes = False
        if osd.ips is None:
            changes = True
            osd.ips = ips
        if osd.port is None:
            changes = True
            osd.port = port
        if changes is True:
            AlbaMigrationController._logger.info('Updating OSD with ID {0} with IPS {1} and port {2}'.format(osd.osd_id, ips, port))
            osd.save()

    ###################################################
    # Read preference for GLOBAL ALBA Backends (1.10.3)  (https://github.com/openvstorage/framework-alba-plugin/issues/452)
    # Move maintenance config to a per-service key with a 'read_preference' entry
    if Configuration.get(key='/ovs/framework/migration|read_preference', default=False) is False:
        try:
            name_backend_map = dict((alba_backend.name, alba_backend) for alba_backend in alba_backends)
            for alba_node in AlbaNodeList.get_albanodes():
                AlbaMigrationController._logger.info('Processing maintenance services running on ALBA Node {0} with ID {1}'.format(alba_node.ip, alba_node.node_id))
                alba_node.invalidate_dynamics('maintenance_services')
                for alba_backend_name, services in alba_node.maintenance_services.iteritems():
                    if alba_backend_name not in name_backend_map:
                        AlbaMigrationController._logger.error('ALBA Node {0} has services for an ALBA Backend {1} which is not modelled'.format(alba_node.ip, alba_backend_name))
                        continue

                    alba_backend = name_backend_map[alba_backend_name]
                    AlbaMigrationController._logger.info('Processing {0} ALBA Backend {1} with GUID {2}'.format(alba_backend.scaling, alba_backend.name, alba_backend.guid))
                    # LOCAL backends prefer their own node; GLOBAL backends ask the controller
                    if alba_backend.scaling == alba_backend.SCALINGS.LOCAL:
                        read_preferences = [alba_node.node_id]
                    else:
                        read_preferences = AlbaController.get_read_preferences_for_global_backend(alba_backend=alba_backend,
                                                                                                 alba_node_id=alba_node.node_id,
                                                                                                 read_preferences=[])

                    for service_name, _ in services:
                        AlbaMigrationController._logger.info('Processing service {0}'.format(service_name))
                        old_config_key = '/ovs/alba/backends/{0}/maintenance/config'.format(alba_backend.guid)
                        new_config_key = '/ovs/alba/backends/{0}/maintenance/{1}/config'.format(alba_backend.guid, service_name)
                        if Configuration.exists(key=old_config_key):
                            new_config = Configuration.get(key=old_config_key)
                            new_config['read_preference'] = read_preferences
                            Configuration.set(key=new_config_key, value=new_config)
            # Old shared keys are only removed once every node has been processed
            for alba_backend in alba_backends:
                Configuration.delete(key='/ovs/alba/backends/{0}/maintenance/config'.format(alba_backend.guid))
            AlbaController.checkup_maintenance_agents.delay()

            Configuration.set(key='/ovs/framework/migration|read_preference', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating read preferences for ALBA Backends failed')

    #######################################################
    # Storing actual package name in version files (1.11.0)  (https://github.com/openvstorage/framework/issues/1876)
    changed_clients = set()
    storagerouters = StorageRouterList.get_storagerouters()
    if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file_alba', default=False) is False:
        try:
            service_manager = ServiceFactory.get_manager()
            alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
            for storagerouter in storagerouters:
                try:
                    root_client = SSHClient(endpoint=storagerouter.ip, username='******')  # Use '.ip' instead of StorageRouter object because this code is executed during post-update at which point the heartbeat has not been updated for some time
                except UnableToConnectException:
                    AlbaMigrationController._logger.exception('Updating actual package name for version files failed on StorageRouter {0}'.format(storagerouter.ip))
                    continue

                for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                    if not file_name.endswith('.version'):
                        continue
                    file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                    contents = root_client.file_read(filename=file_path)
                    # Only rewrite files still referencing the community package name
                    # on an enterprise-edition installation
                    if alba_pkg_name == PackageFactory.PKG_ALBA_EE and '{0}='.format(PackageFactory.PKG_ALBA) in contents:
                        # Rewrite the version file in the RUN_FILE_DIR
                        contents = contents.replace(PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE)
                        root_client.file_write(filename=file_path, contents=contents)

                        # Regenerate the service and update the EXTRA_VERSION_CMD in the configuration management
                        service_name = file_name.split('.')[0]
                        service_config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                        if Configuration.exists(key=service_config_key):
                            service_config = Configuration.get(key=service_config_key)
                            if 'EXTRA_VERSION_CMD' in service_config:
                                service_config['EXTRA_VERSION_CMD'] = '{0}=`{1}`'.format(alba_pkg_name, alba_version_cmd)
                                Configuration.set(key=service_config_key, value=service_config)
                                service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name='ovs-{0}'.format(service_name))  # Leave out .version
                                changed_clients.add(root_client)
            Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file_alba', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating actual package name for version files failed')

    # Reload systemd units on every host whose service files were regenerated
    for root_client in changed_clients:
        try:
            root_client.run(['systemctl', 'daemon-reload'])
        except Exception:
            AlbaMigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

    ####################################
    # Fix for migration version (1.11.0)
    # Previous code could potentially store a higher version number in the config management than the actual version number
    if Configuration.get(key='/ovs/framework/migration|alba_migration_version_fix', default=False) is False:
        try:
            for storagerouter in storagerouters:
                config_key = '/ovs/framework/hosts/{0}/versions'.format(storagerouter.machine_id)
                if Configuration.exists(key=config_key):
                    versions = Configuration.get(key=config_key)
                    # Clamp the stored migration version down to the actual version
                    if versions.get(PackageFactory.COMP_MIGRATION_ALBA, 0) > ExtensionMigrator.THIS_VERSION:
                        versions[PackageFactory.COMP_MIGRATION_ALBA] = ExtensionMigrator.THIS_VERSION
                        Configuration.set(key=config_key, value=versions)
            Configuration.set(key='/ovs/framework/migration|alba_migration_version_fix', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating migration version failed')

    ####################################
    # Enable auto-cleanup
    migration_auto_cleanup_key = '/ovs/framework/migration|alba_auto_cleanup'
    if Configuration.get(key=migration_auto_cleanup_key, default=False) is False:
        try:
            for storagerouter in StorageRouterList.get_storagerouters():
                storagerouter.invalidate_dynamics('features')  # New feature was added
            errors = []
            for alba_backend in AlbaBackendList.get_albabackends():
                try:
                    AlbaController.set_auto_cleanup(alba_backend.guid)
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Failed to set the auto-cleanup for ALBA Backend {0}'.format(alba_backend.name))
                    errors.append(ex)
            # Flag is only set when every backend succeeded, so failures are retried next run
            if len(errors) == 0:
                Configuration.set(key=migration_auto_cleanup_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating auto cleanup failed')

    ####################################
    # Change cache eviction
    migration_random_eviction_key = '/ovs/framework/migration|alba_cache_eviction_random'
    if Configuration.get(key=migration_random_eviction_key, default=False) is False:
        try:
            errors = []
            for alba_backend in AlbaBackendList.get_albabackends():
                try:
                    AlbaController.set_cache_eviction(alba_backend.guid)
                except Exception as ex:
                    # NOTE(review): log message copy-pasted from the auto-cleanup phase;
                    # it actually concerns cache eviction
                    AlbaMigrationController._logger.exception('Failed to set the auto-cleanup for ALBA Backend {0}'.format(alba_backend.name))
                    errors.append(ex)
            if len(errors) == 0:
                Configuration.set(key=migration_random_eviction_key, value=True)
        except Exception:
            # NOTE(review): same copy-pasted message; concerns cache eviction
            AlbaMigrationController._logger.exception('Updating auto cleanup failed')

    ###################################################
    # Sync all disks and apply the backend role. Backend role was removed with the AD (since 1.10)
    albanode_backend_role_sync_key = '/ovs/framework/migration|albanode_backend_role_sync'
    if not Configuration.get(key=albanode_backend_role_sync_key, default=False):
        try:
            errors = []
            for alba_node in AlbaNodeList.get_albanodes():
                try:
                    if not alba_node.storagerouter:
                        continue
                    stack = alba_node.client.get_stack()  # type: dict
                    for slot_id, slot_information in stack.iteritems():
                        osds = slot_information.get('osds', {})  # type: dict
                        slot_aliases = slot_information.get('aliases', [])  # type: list
                        if not osds:  # No osds means no partition was made
                            continue
                        # Sync to add all potential partitions that will need a backend role
                        DiskController.sync_with_reality(storagerouter_guid=alba_node.storagerouter_guid)
                        for disk in alba_node.storagerouter.disks:
                            if set(disk.aliases).intersection(set(slot_aliases)):
                                partition = disk.partitions[0]
                                if DiskPartition.ROLES.BACKEND not in partition.roles:
                                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                                    partition.save()
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Syncing for storagerouter/albanode {0} failed'.format(alba_node.storagerouter.ip))
                    errors.append(ex)
            if not errors:
                Configuration.set(key=albanode_backend_role_sync_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Syncing up the disks for backend roles failed')

    AlbaMigrationController._logger.info('Finished out of band migrations')
def check_vpool_cleanup(vpool_info, storagerouters=None):
    """
    Check if everything related to a vPool has been cleaned up on the storagerouters provided
    vpool_info should be a dictionary containing:
        - type
        - guid
        - files
        - directories
        - name (optional)
        - vpool (optional)
            If vpool is provided:
                - storagerouters need to be provided, because on these Storage Routers, we check whether the vPool has been cleaned up
            If name is provided:
                - If storagerouters is NOT provided, all Storage Routers will be checked for a correct vPool removal
                - If storagerouters is provided, only these Storage Routers will be checked for a correct vPool removal

    :param vpool_info: Information about the vPool
    :param storagerouters: Storage Routers to check if vPool has been cleaned up
    :return: None
    :raises ValueError: On invalid input or when vPool artifacts are still present
    :raises RuntimeError: When services are still configured or fatal log errors are found
    :raises AssertionError: On failed cleanup assertions
    """
    # Input validation: 'vpool' and 'name' are mutually exclusive, exactly one is required
    for required_param in ["type", "guid", "files", "directories"]:
        if required_param not in vpool_info:
            raise ValueError("Incorrect vpool_info provided")
    if "vpool" in vpool_info and "name" in vpool_info:
        raise ValueError("vpool and name are mutually exclusive")
    if "vpool" not in vpool_info and "name" not in vpool_info:
        raise ValueError("Either vpool or vpool_name needs to be provided")

    vpool = vpool_info.get("vpool")
    vpool_name = vpool_info.get("name")
    vpool_guid = vpool_info["guid"]
    vpool_type = vpool_info["type"]
    files = vpool_info["files"]              # Mapping: storagerouter guid -> files that must be gone
    directories = vpool_info["directories"]  # Mapping: storagerouter guid -> dirs that must be gone

    supported_backend_types = GeneralBackend.get_valid_backendtypes()
    if vpool_type not in supported_backend_types:
        raise ValueError("Unsupported Backend Type provided. Please choose from: {0}".format(", ".join(supported_backend_types)))
    if storagerouters is None:
        storagerouters = GeneralStorageRouter.get_storage_routers()

    if vpool_name is not None:
        # When checking by name, the vPool must no longer exist in the model
        assert (GeneralVPool.get_vpool_by_name(vpool_name=vpool_name) is None), "A vPool with name {0} still exists".format(vpool_name)

    # Prepare some fields to check
    vpool_name = vpool.name if vpool else vpool_name
    vpool_services = ["ovs-dtl_{0}".format(vpool_name), "ovs-volumedriver_{0}".format(vpool_name)]
    if vpool_type == "alba":
        vpool_services.append("ovs-albaproxy_{0}".format(vpool_name))

    # Check configuration
    if vpool is None:
        # Fully removed vPool: its entire config tree must be gone
        assert (Configuration.exists("/ovs/vpools/{0}".format(vpool_guid), raw=True) is False), "vPool config still found in etcd"
    else:
        # Still-existing vPool: model and config must agree on the set of StorageDrivers
        remaining_sd_ids = set([storagedriver.storagedriver_id for storagedriver in vpool.storagedrivers])
        current_sd_ids = set([item for item in Configuration.list("/ovs/vpools/{0}/hosts".format(vpool_guid))])
        assert not remaining_sd_ids.difference(current_sd_ids), "There are more storagedrivers modelled than present in etcd"
        assert not current_sd_ids.difference(remaining_sd_ids), "There are more storagedrivers in etcd than present in model"

    # Perform checks on all storagerouters where vpool was removed
    for storagerouter in storagerouters:
        # Check MDS services
        mds_services = GeneralService.get_services_by_name(ServiceType.SERVICE_TYPES.MD_SERVER)
        assert (len([mds_service for mds_service in mds_services if mds_service.storagerouter_guid == storagerouter.guid]) == 0), "There are still MDS services present for Storage Router {0}".format(storagerouter.ip)

        # Check services
        root_client = SSHClient(storagerouter, username="******")
        for service in vpool_services:
            if ServiceManager.has_service(service, client=root_client):
                raise RuntimeError("Service {0} is still configured on Storage Router {1}".format(service, storagerouter.ip))

        # Check KVM vpool
        if GeneralHypervisor.get_hypervisor_type() == "KVM":
            vpool_overview = root_client.run(["virsh", "pool-list", "--all"]).splitlines()
            # Drop the 2-line virsh table header
            vpool_overview.pop(1)
            vpool_overview.pop(0)
            # NOTE(review): loop variable shadows the 'vpool_info' parameter; the
            # parameter is not used after this point, but the shadowing is confusing
            for vpool_info in vpool_overview:
                kvm_vpool_name = vpool_info.split()[0].strip()
                if vpool_name == kvm_vpool_name:
                    raise ValueError("vPool {0} is still defined on Storage Router {1}".format(vpool_name, storagerouter.ip))

        # Check file and directory existence
        if storagerouter.guid not in directories:
            raise ValueError("Could not find directory information for Storage Router {0}".format(storagerouter.ip))
        if storagerouter.guid not in files:
            raise ValueError("Could not find file information for Storage Router {0}".format(storagerouter.ip))

        for directory in directories[storagerouter.guid]:
            assert (root_client.dir_exists(directory) is False), "Directory {0} still exists on Storage Router {1}".format(directory, storagerouter.ip)
        for file_name in files[storagerouter.guid]:
            assert (root_client.file_exists(file_name) is False), "File {0} still exists on Storage Router {1}".format(file_name, storagerouter.ip)

        # Look for errors in storagedriver log
        for error_type in ["error", "fatal"]:
            # 'echo true' keeps the command's exit code 0 even when grep finds nothing
            cmd = "cat -vet /var/log/ovs/volumedriver/{0}.log | tail -1000 | grep ' {1} '; echo true > /dev/null".format(vpool_name, error_type)
            errors = []
            for line in root_client.run(cmd, allow_insecure=True).splitlines():
                if "HierarchicalArakoon" in line:
                    continue
                errors.append(line)
            if len(errors) > 0:
                # 'error' entries are only reported; 'fatal' entries fail the check
                if error_type == "error":
                    print "Volumedriver log file contains errors on Storage Router {0}\n - {1}".format(storagerouter.ip, "\n - ".join(errors))
                else:
                    raise RuntimeError("Fatal errors found in volumedriver log file on Storage Router {0}\n - {1}".format(storagerouter.ip, "\n - ".join(errors)))
def validate_arakoon_config_files(storagerouters, cluster_name=None):
    """
    Verify whether all arakoon configurations are correct
    Builds a matrix of {StorageRouter: {config name: md5 of config contents}} and asserts that
    every node carries the expected number of configs and that the contents are identical on all nodes
    :param storagerouters: Storage Routers to validate the Arakoon configurations on
    :param cluster_name: Name of a specific Arakoon cluster to validate (None means all non-NSM clusters)
    :return: None
    :raises AssertionError: When configs are missing on nodes or their contents differ between nodes
    """
    storagerouters.sort(key=lambda k: k.ip)
    TestArakoon.logger.info('Validating arakoon files for {0}'.format(', '.join([sr.ip for sr in storagerouters])))
    nr_of_configs_on_master = 0
    nr_of_configs_on_extra = 0

    node_ids = dict()  # Maps StorageRouter IP -> machine id, used for config content validation
    matrix = dict()    # Maps StorageRouter -> {config name: md5 hexdigest of its contents}
    for sr in storagerouters:
        node_ids[sr.ip] = sr.machine_id
        configs_to_check = []
        matrix[sr] = dict()
        if cluster_name is not None:
            if Configuration.exists(GeneralArakoon.CONFIG_KEY.format(cluster_name), raw=True):
                configs_to_check = [GeneralArakoon.CONFIG_KEY.format(cluster_name)]
        else:
            gen = Configuration.list(GeneralArakoon.CONFIG_ROOT)
            for entry in gen:
                if 'nsm_' not in entry:
                    # Fix: check existence of each discovered entry instead of formatting with
                    # cluster_name, which is always None in this branch
                    if Configuration.exists(GeneralArakoon.CONFIG_KEY.format(entry), raw=True):
                        configs_to_check.append(GeneralArakoon.CONFIG_KEY.format(entry))
        for config_name in configs_to_check:
            # Fix: hash the config currently being processed; previously this always read
            # configs_to_check[0], making every md5 in the row identical
            config_contents = Configuration.get(config_name, raw=True)
            matrix[sr][config_name] = hashlib.md5(config_contents).hexdigest()
        if sr.node_type == 'MASTER':
            nr_of_configs_on_master = len(matrix[sr])
        else:
            nr_of_configs_on_extra = len(matrix[sr])

    TestArakoon.logger.info('cluster_ids: {0}'.format(node_ids))
    TestArakoon.logger.info('matrix: {0}'.format(matrix))

    # Validate the contents of each config file found on the first node
    for config_file in matrix[storagerouters[0]].keys():
        TestArakoon.validate_arakoon_config_content(config_file, node_ids)

    assert len(storagerouters) == len(matrix.keys()), "not all nodes have arakoon configs"
    incorrect_nodes = list()
    for sr in matrix:
        is_master = sr.node_type == 'MASTER'
        if (is_master is True and len(matrix[sr]) != nr_of_configs_on_master) or\
                (is_master is False and len(matrix[sr]) != nr_of_configs_on_extra):
            incorrect_nodes.append(sr.ip)
    assert len(incorrect_nodes) == 0, "Incorrect nr of configs on nodes: {0}".format(incorrect_nodes)

    # Every config must have the same md5 on every node; the first node seen sets the reference
    md5sum_matrix = dict()
    incorrect_configs = list()
    for cfg in matrix[storagerouters[0]]:
        for sr in storagerouters:
            if cfg not in md5sum_matrix:
                md5sum_matrix[cfg] = matrix[sr][cfg]
            elif matrix[sr][cfg] != md5sum_matrix[cfg]:
                incorrect_configs.append("Incorrect contents {0} for {1} on {2}, expected {3}"
                                         .format(matrix[sr][cfg], sr.ip, cfg, md5sum_matrix[cfg]))
    assert len(incorrect_configs) == 0,\
        'Incorrect arakoon config contents: \n{0}'.format('\n'.join(incorrect_configs))
def validate_vpool_sanity(expected_settings):
    """
    Check if all requirements are met for a healthy vPool
    Validates model attributes, Storage Driver configuration, running services, arakoon
    sections, directories/files and partition roles for every Storage Driver of the vPool
    :param expected_settings: Parameters used to create a vPool, which will be verified;
                              maps StorageRouter -> settings dict (all entries share the generic settings)
    :type expected_settings: dict
    :return: None
    :raises AssertionError, ValueError: When any of the sanity checks fails
    """
    if not isinstance(expected_settings, dict) or len(expected_settings) == 0:
        raise ValueError("Cannot validate vpool when no settings are passed")

    # Generic settings are taken from an arbitrary entry; all entries are assumed to share them
    generic_settings = expected_settings.values()[0]
    vpool_name = generic_settings["vpool_name"]
    mountpoint = "/mnt/{0}".format(vpool_name)
    backend_type = generic_settings["type"]
    rdma_enabled = (
        generic_settings["config_params"]["dtl_transport"] == StorageDriverClient.FRAMEWORK_DTL_TRANSPORT_RSOCKET
    )

    vpool = GeneralVPool.get_vpool_by_name(vpool_name=vpool_name)
    assert vpool is not None, "Could not find vPool with name {0}".format(vpool_name)
    vpool_config = GeneralVPool.get_configuration(vpool)

    # Verify some basic vPool attributes
    assert vpool.name == vpool_name, "Expected name {0} for vPool".format(vpool_name)
    assert vpool.status == VPool.STATUSES.RUNNING, "vPool does not have RUNNING status"
    assert vpool.rdma_enabled == rdma_enabled, "RDMA enabled setting is incorrect"
    assert set(expected_settings.keys()) == set(
        [sd.storagerouter for sd in vpool.storagedrivers]
    ), "vPool storagerouters don't match the expected Storage Routers"

    # Verify vPool Storage Driver configuration: actual config must exactly match the expected
    # config_params (minus a few derived keys); leftovers in either direction are errors
    expected_vpool_config = copy.deepcopy(generic_settings["config_params"])
    for key, value in vpool_config.iteritems():
        # These keys are derived/managed by the framework and not part of the input settings
        if key == "dtl_enabled" or key == "tlog_multiplier" or key == "dtl_config_mode":
            continue
        if key not in expected_vpool_config:
            raise ValueError("Expected settings does not contain key {0}".format(key))
        if value != expected_vpool_config[key]:
            raise ValueError(
                "vPool does not have expected configuration {0} for key {1}".format(expected_vpool_config[key], key)
            )
        expected_vpool_config.pop(key)
    if len(expected_vpool_config) > 0:
        raise ValueError(
            "Actual vPool configuration does not contain keys: {0}".format(", ".join(expected_vpool_config.keys()))
        )

    # Prepare some fields to check
    config = generic_settings["config_params"]
    dtl_mode = config["dtl_mode"]
    sco_size = config["sco_size"]
    cluster_size = config["cluster_size"]
    write_buffer = config["write_buffer"]
    dtl_transport = config["dtl_transport"]
    # @TODO: Add more validations for other expected settings (instead of None)
    # None values below mean "not validated (yet)"
    expected_config = {
        "backend_connection_manager": {
            "backend_interface_retries_on_error": 5,
            "backend_interface_retry_interval_secs": 1,
            "backend_interface_retry_backoff_multiplier": 2.0,
        },
        "content_addressed_cache": {
            "clustercache_mount_points": None,
            "read_cache_serialization_path": u"/var/rsp/{0}".format(vpool.name),
        },
        "distributed_lock_store": {
            "dls_arakoon_cluster_id": None,
            "dls_arakoon_cluster_nodes": None,
            "dls_type": u"Arakoon",
        },
        "distributed_transaction_log": {"dtl_path": None, "dtl_transport": dtl_transport.upper()},
        "event_publisher": {"events_amqp_routing_key": u"volumerouter", "events_amqp_uris": None},
        "file_driver": {"fd_cache_path": None, "fd_extent_cache_capacity": u"1024", "fd_namespace": None},
        "filesystem": {
            "fs_dtl_config_mode": u"Automatic",
            "fs_dtl_mode": u"{0}".format(StorageDriverClient.VPOOL_DTL_MODE_MAP[dtl_mode]),
            "fs_enable_shm_interface": 1,
            "fs_file_event_rules": None,
            "fs_metadata_backend_arakoon_cluster_nodes": None,
            "fs_metadata_backend_mds_nodes": None,
            "fs_metadata_backend_type": u"MDS",
            "fs_raw_disk_suffix": None,
            "fs_virtual_disk_format": None,
        },
        "metadata_server": {"mds_nodes": None},
        "scocache": {"backoff_gap": u"2GB", "scocache_mount_points": None, "trigger_gap": u"1GB"},
        "threadpool_component": {"num_threads": 16},
        "volume_manager": {
            "clean_interval": 1,
            "default_cluster_size": 1024 * cluster_size,
            "dtl_throttle_usecs": 4000,
            "metadata_path": None,
            "non_disposable_scos_factor": float(write_buffer)
            / StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size]
            / sco_size,
            "number_of_scos_in_tlog": StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size],
            "tlog_path": None,
        },
        "volume_registry": {"vregistry_arakoon_cluster_id": u"voldrv", "vregistry_arakoon_cluster_nodes": None},
        "volume_router": {
            "vrouter_backend_sync_timeout_ms": 5000,
            "vrouter_file_read_threshold": 1024,
            "vrouter_file_write_threshold": 1024,
            "vrouter_id": None,
            "vrouter_max_workers": 16,
            "vrouter_migrate_timeout_ms": 5000,
            "vrouter_min_workers": 4,
            "vrouter_redirect_timeout_ms": u"5000",
            "vrouter_routing_retries": 10,
            "vrouter_sco_multiplier": 1024,
            "vrouter_volume_read_threshold": 1024,
            "vrouter_volume_write_threshold": 1024,
        },
        "volume_router_cluster": {"vrouter_cluster_id": None},
    }
    vpool_services = {
        "all": [
            "ovs-watcher-volumedriver",
            "ovs-dtl_{0}".format(vpool.name),
            "ovs-volumedriver_{0}".format(vpool.name),
            "ovs-volumerouter-consumer",
        ],
        "extra": [],
        "master": ["ovs-arakoon-voldrv"],
    }
    # Partition (role, sub-role) pairs that must be claimed by at least one Storage Driver;
    # entries are removed as they are found, leftovers at the end trigger a ValueError
    sd_partitions = {"DB": ["MD", "MDS", "TLOG"], "WRITE": ["FD", "DTL", "SCO"]}

    assert Configuration.exists("/ovs/arakoon/voldrv/config", raw=True), "Volumedriver arakoon does not exist"

    # Do some verifications for all SDs
    storage_ip = None  # All Storage Drivers of the vPool are expected to share the same storage IP
    voldrv_config = GeneralArakoon.get_config("voldrv")
    all_files = GeneralVPool.get_related_files(vpool=vpool)
    all_directories = GeneralVPool.get_related_directories(vpool=vpool)
    for storagedriver in vpool.storagedrivers:
        storagerouter = storagedriver.storagerouter
        root_client = SSHClient(storagerouter, username="******")

        # Check vPool config
        assert Configuration.exists(
            "/ovs/vpools/{0}/hosts/{1}/config".format(vpool.guid, storagedriver.storagedriver_id), raw=True
        ), "vPool config not found in configuration"
        # @todo: replace next lines with implementation defined in: http://jira.openvstorage.com/browse/OVS-4577
        # current_config_sections = set([item for item in Configuration.list('/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, storagedriver.storagedriver_id))])
        # assert not current_config_sections.difference(set(expected_config.keys())), 'New section appeared in the storage driver config in configuration'
        # assert not set(expected_config.keys()).difference(current_config_sections), 'Config section expected for storage driver, but not found in configuration'
        #
        # for key, values in expected_config.iteritems():
        #     current_config = Configuration.get('/ovs/vpools/{0}/hosts/{1}/config/{2}'.format(vpool.guid, storagedriver.storagedriver_id, key))
        #     assert set(current_config.keys()).union(set(values.keys())) == set(values.keys()), 'Not all expected keys match for key "{0}" on Storage Driver {1}'.format(key, storagedriver.name)
        #
        #     for sub_key, value in current_config.iteritems():
        #         expected_value = values[sub_key]
        #         if expected_value is None:
        #             continue
        #         assert value == expected_value, 'Key: {0} - Sub key: {1} - Value: {2} - Expected value: {3}'.format(key, sub_key, value, expected_value)

        # Check services: MASTER nodes run the 'master' services on top of the common set,
        # but ovs-arakoon-voldrv is only expected when the Storage Driver has the DB role
        if storagerouter.node_type == "MASTER":
            for service_name in vpool_services["all"] + vpool_services["master"]:
                if (
                    service_name == "ovs-arakoon-voldrv"
                    and GeneralStorageDriver.has_role(storagedriver, "DB") is False
                ):
                    continue
                exitcode, output = ServiceManager.get_service_status(name=service_name, client=root_client)
                if exitcode is not True:
                    raise ValueError(
                        "Service {0} is not running on node {1} - {2}".format(
                            service_name, storagerouter.ip, output
                        )
                    )
        else:
            for service_name in vpool_services["all"] + vpool_services["extra"]:
                exitcode, output = ServiceManager.get_service_status(name=service_name, client=root_client)
                if exitcode is not True:
                    raise ValueError(
                        "Service {0} is not running on node {1} - {2}".format(
                            service_name, storagerouter.ip, output
                        )
                    )

        # Check arakoon config
        if not voldrv_config.has_section(storagerouter.machine_id):
            raise ValueError("Voldrv arakoon cluster does not have section {0}".format(storagerouter.machine_id))

        # Basic SD checks
        assert (
            storagedriver.cluster_ip == storagerouter.ip
        ), "Incorrect cluster IP. Expected: {0} - Actual: {1}".format(storagerouter.ip, storagedriver.cluster_ip)
        assert storagedriver.mountpoint == "/mnt/{0}".format(
            vpool.name
        ), "Incorrect mountpoint. Expected: {0} - Actual: {1}".format(mountpoint, storagedriver.mountpoint)
        if storage_ip is not None:
            assert (
                storagedriver.storage_ip == storage_ip
            ), "Incorrect storage IP. Expected: {0} - Actual: {1}".format(storage_ip, storagedriver.storage_ip)
        storage_ip = storagedriver.storage_ip

        # Check required directories and files
        if storagerouter.guid not in all_directories:
            raise ValueError("Could not find directory information for Storage Router {0}".format(storagerouter.ip))
        if storagerouter.guid not in all_files:
            raise ValueError("Could not find file information for Storage Router {0}".format(storagerouter.ip))

        for directory in all_directories[storagerouter.guid]:
            if root_client.dir_exists(directory) is False:
                raise ValueError(
                    "Directory {0} does not exist on Storage Router {1}".format(directory, storagerouter.ip)
                )
        for file_name in all_files[storagerouter.guid]:
            if root_client.file_exists(file_name) is False:
                raise ValueError(
                    "File {0} does not exist on Storage Router {1}".format(file_name, storagerouter.ip)
                )

        # @TODO: check roles and sub_roles for all storagedrivers and not just once
        for partition in storagedriver.partitions:
            if partition.role in sd_partitions and partition.sub_role in sd_partitions[partition.role]:
                sd_partitions[partition.role].remove(partition.sub_role)
            elif (
                partition.role in sd_partitions
                and partition.sub_role is None
                and len(sd_partitions[partition.role])
            ):
                sd_partitions[partition.role].remove("None")

        # Verify vPool writeable
        # NOTE(review): on VMWARE the vPool must be mounted first; the write test itself
        # creates, writes and deletes a small volume through the current node's root client
        if GeneralHypervisor.get_hypervisor_type() == "VMWARE":
            GeneralVPool.mount_vpool(vpool=vpool, root_client=root_client)

        vdisk = GeneralVDisk.create_volume(size=10, vpool=vpool, root_client=root_client)
        GeneralVDisk.write_to_volume(
            vdisk=vdisk, vpool=vpool, root_client=root_client, count=10, bs="1M", input_type="random"
        )
        GeneralVDisk.delete_volume(vdisk=vdisk, vpool=vpool, root_client=root_client)

    # Any (role, sub-role) pair not claimed by a single Storage Driver is an error
    for role, sub_roles in sd_partitions.iteritems():
        for sub_role in sub_roles:
            raise ValueError(
                "Not a single Storage Driver found with partition role {0} and sub-role {1}".format(role, sub_role)
            )
def remove_osd(node_guid, osd_id, expected_safety):
    """
    Removes an OSD
    Validates the expected safety (unless explicitly skipped), purges the OSD from the
    ALBA backend, deletes it on the node and cleans up configuration management and model
    :param node_guid: Guid of the node to remove an OSD from
    :type node_guid: str
    :param osd_id: ID of the OSD to remove
    :type osd_id: str
    :param expected_safety: Expected safety after having removed the OSD (None skips the safety check)
    :type expected_safety: dict or None
    :return: Aliases of the disk on which the OSD was removed
    :rtype: list
    """
    # Resolve the node and the modelled OSD
    node = AlbaNode(node_guid)
    AlbaNodeController._logger.debug('Removing OSD {0} at node {1}'.format(osd_id, node.ip))
    osd = AlbaOSDList.get_by_osd_id(osd_id)
    alba_backend = osd.alba_backend

    # Safety validation: abort when the removal would impact safety in an unexpected way
    if expected_safety is None:
        AlbaNodeController._logger.warning(
            'Skipping safety check for OSD {0} on backend {1} - this is dangerous'.format(osd_id, alba_backend.guid))
    else:
        final_safety = AlbaController.calculate_safety(alba_backend_guid=alba_backend.guid,
                                                       removal_osd_ids=[osd_id])
        lost = final_safety['lost']
        critical = final_safety['critical']
        safety_impacted = critical != 0 or lost != 0
        safety_differs = critical != expected_safety['critical'] or lost != expected_safety['lost']
        if safety_impacted and safety_differs:
            raise RuntimeError(
                'Cannot remove OSD {0} as the current safety is not as expected ({1} vs {2})'.format(
                    osd_id, final_safety, expected_safety))
        AlbaNodeController._logger.debug('Safety OK for OSD {0} on backend {1}'.format(osd_id, alba_backend.guid))

    # Purge the OSD from the ALBA backend
    AlbaNodeController._logger.debug('Purging OSD {0} on backend {1}'.format(osd_id, alba_backend.guid))
    AlbaController.remove_units(alba_backend_guid=alba_backend.guid, osd_ids=[osd_id])

    # Delete the OSD on the node itself
    removal_result = node.client.delete_osd(slot_id=osd.slot_id, osd_id=osd_id)
    if removal_result['_success'] is False:
        raise RuntimeError('Error removing OSD: {0}'.format(removal_result['_error']))

    # Clean configuration management and model - Well, just try it at least
    if Configuration.exists(ASD_CONFIG.format(osd_id), raw=True):
        Configuration.delete(ASD_CONFIG_DIR.format(osd_id), raw=True)

    osd.delete()
    node.invalidate_dynamics()
    if alba_backend is not None:
        alba_backend.invalidate_dynamics()
        alba_backend.backend.invalidate_dynamics()
    if node.storagerouter is not None:
        try:
            DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
        except UnableToConnectException:
            AlbaNodeController._logger.warning(
                'Skipping disk sync since StorageRouter {0} is offline'.format(node.storagerouter.name))

    return [osd.slot_id]
def get(self, request, *args, **kwargs):
    """
    Fetches metadata
    Builds a dict with cluster, plugin, registration and authentication information for the
    calling client. Returns a (HttpResponse, data-dict) tuple in every branch; presumably a
    response decorator unpacks and serializes this pair - TODO confirm against the caller.
    :param request: Django request object (request.META is inspected for the Bearer token)
    :return: Tuple of (HttpResponse, dict with metadata)
    """
    _ = args, kwargs
    # Baseline response: unauthenticated defaults, filled in progressively below
    data = {'authenticated': False,
            'authentication_state': None,
            'authentication_metadata': {},
            'username': None,
            'userguid': None,
            'roles': [],
            'identification': {},
            'storagerouter_ips': [sr.ip for sr in StorageRouterList.get_storagerouters()],
            'versions': list(settings.VERSION),
            'plugins': {},
            'registration': {'registered': False, 'remaining': None}}
    try:
        # Gather plugin metadata
        plugins = {}
        # - Backends. BackendType plugins must set the has_plugin flag on True
        for backend_type in BackendTypeList.get_backend_types():
            if backend_type.has_plugin is True:
                if backend_type.code not in plugins:
                    plugins[backend_type.code] = []
                plugins[backend_type.code] += ['backend', 'gui']
        # - Generic plugins, as added to the configuration file(s)
        generic_plugins = Configuration.get('ovs.plugins.generic')
        for plugin_name in generic_plugins:
            if plugin_name not in plugins:
                plugins[plugin_name] = []
            plugins[plugin_name] += ['gui']
        data['plugins'] = plugins

        # Fill identification
        data['identification'] = {'cluster_id': Configuration.get('ovs.support.cid')}

        # Registration data: when not registered, compute the remaining trial time (in days)
        # from the earliest install time across all Storage Routers
        registered = Configuration.get('ovs.core.registered')
        data['registration']['registered'] = registered
        if registered is False:
            cluster_install_time = None
            for storagerouter in StorageRouterList.get_storagerouters():
                client = SSHClient(storagerouter)
                install_time = client.config_read('ovs.core.install_time')
                if cluster_install_time is None or (install_time is not None and install_time < cluster_install_time):
                    cluster_install_time = install_time
            if cluster_install_time is not None:
                # NOTE(review): despite the name, timeout_days holds seconds (30 days worth);
                # the division below converts the remainder back to days
                timeout_days = 30 * 24 * 60 * 60
                data['registration']['remaining'] = (timeout_days - time.time() + cluster_install_time) / 24 / 60 / 60

        # Get authentication metadata (OAuth2 settings, when configured)
        authentication_metadata = {'ip': System.get_my_storagerouter().ip}
        for key in ['mode', 'authorize_uri', 'client_id', 'scope']:
            if Configuration.exists('ovs.webapps.oauth2.{0}'.format(key)):
                authentication_metadata[key] = Configuration.get('ovs.webapps.oauth2.{0}'.format(key))
        data['authentication_metadata'] = authentication_metadata

        # Gather authorization metadata: validate the Bearer token, bailing out early with
        # the appropriate authentication_state on each failure
        if 'HTTP_AUTHORIZATION' not in request.META:
            return HttpResponse, dict(data.items() + {'authentication_state': 'unauthenticated'}.items())
        authorization_type, access_token = request.META['HTTP_AUTHORIZATION'].split(' ')
        if authorization_type != 'Bearer':
            return HttpResponse, dict(data.items() + {'authentication_state': 'invalid_authorization_type'}.items())
        tokens = BearerTokenList.get_by_access_token(access_token)
        if len(tokens) != 1:
            return HttpResponse, dict(data.items() + {'authentication_state': 'invalid_token'}.items())
        token = tokens[0]
        if token.expiration < time.time():
            # Expired token: remove its role junctions and the token itself
            for junction in token.roles.itersafe():
                junction.delete()
            token.delete()
            return HttpResponse, dict(data.items() + {'authentication_state': 'token_expired'}.items())

        # Gather user metadata
        user = token.client.user
        if not user.is_active:
            return HttpResponse, dict(data.items() + {'authentication_state': 'inactive_user'}.items())
        roles = [j.role.code for j in token.roles]

        return HttpResponse, dict(data.items() + {'authenticated': True,
                                                  'authentication_state': 'authenticated',
                                                  'username': user.username,
                                                  'userguid': user.guid,
                                                  'roles': roles,
                                                  'plugins': plugins}.items())
    except Exception as ex:
        # Any unexpected failure still yields a valid metadata response
        logger.exception('Unexpected exception: {0}'.format(ex))
        return HttpResponse, dict(data.items() + {'authentication_state': 'unexpected_exception'}.items())
def get_update_information_alba_plugin(information):
    """
    Called when the 'Update' button in the GUI is pressed
    This call collects additional information about the packages which can be updated
    Eg:
        * Downtime for Arakoons
        * Downtime for StorageDrivers
        * Prerequisites that haven't been met
        * Services which will be stopped during update
        * Services which will be restarted after update
    :param information: Dict to fill in (mutated in place and also returned); keyed on
                        'framework' and 'alba', each entry holding packages/downtime/
                        prerequisites/services_stop_start/services_post_update
    :return: The updated information dict
    """
    # Verify arakoon info for the framework clusters (config cluster 'cacc' and 'ovsdb')
    arakoon_ovs_info = {'down': False, 'name': None, 'internal': False}
    arakoon_cacc_info = {'down': False, 'name': None, 'internal': False}
    for cluster in ['cacc', 'ovsdb']:
        cluster_name = ArakoonClusterConfig.get_cluster_name(cluster)
        if cluster_name is None:
            continue
        if cluster == 'cacc':
            # The config cluster lives on the filesystem and must be read via the local node IP
            arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name, filesystem=True, ip=System.get_my_storagerouter().ip)
        else:
            arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
        if arakoon_metadata['internal'] is True:
            config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=(cluster == 'cacc'))
            config.load_config(System.get_my_storagerouter().ip if cluster == 'cacc' else None)
            if cluster == 'ovsdb':
                # Fewer than 3 nodes means the ovsdb cluster goes down during a rolling update
                arakoon_ovs_info['down'] = len(config.nodes) < 3
                arakoon_ovs_info['name'] = arakoon_metadata['cluster_name']
                arakoon_ovs_info['internal'] = True
            else:
                arakoon_cacc_info['name'] = arakoon_metadata['cluster_name']
                arakoon_cacc_info['internal'] = True

    # Verify StorageRouter downtime: unreachable nodes become a 'node_down' prerequisite
    fwk_prerequisites = []
    all_storagerouters = StorageRouterList.get_storagerouters()
    for storagerouter in all_storagerouters:
        try:
            SSHClient(endpoint=storagerouter, username='******')
        except UnableToConnectException:
            fwk_prerequisites.append(['node_down', storagerouter.name])

    # Verify ALBA node responsiveness
    alba_prerequisites = []
    for alba_node in AlbaNodeList.get_albanodes():
        try:
            alba_node.client.get_metadata()
        except Exception:
            alba_prerequisites.append(['alba_node_unresponsive', alba_node.ip])

    for key in ['framework', 'alba']:
        if key not in information:
            information[key] = {'packages': {},
                                'downtime': [],
                                'prerequisites': fwk_prerequisites if key == 'framework' else alba_prerequisites,
                                'services_stop_start': set(),
                                'services_post_update': set()}

        for storagerouter in StorageRouterList.get_storagerouters():
            if key not in storagerouter.package_information:
                continue

            # Retrieve Arakoon issues for the backend clusters (ABM/NSM) on this node
            arakoon_downtime = []
            arakoon_services = []
            for service in storagerouter.services:
                if service.type.name not in [ServiceType.SERVICE_TYPES.ALBA_MGR, ServiceType.SERVICE_TYPES.NS_MGR]:
                    continue
                if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
                    cluster_name = AlbaController.get_abm_cluster_name(alba_backend=service.abm_service.alba_backend)
                else:
                    cluster_name = AlbaController.get_nsm_cluster_name(alba_backend=service.nsm_service.alba_backend, number=service.nsm_service.number)
                if Configuration.exists('/ovs/arakoon/{0}/config'.format(cluster_name), raw=True) is False:
                    continue
                arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
                if arakoon_metadata['internal'] is True:
                    arakoon_services.append('ovs-{0}'.format(service.name))
                    config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=False)
                    config.load_config()
                    # A cluster with fewer than 3 nodes cannot stay available during a rolling restart
                    if len(config.nodes) < 3:
                        if service.type.name == ServiceType.SERVICE_TYPES.NS_MGR:
                            arakoon_downtime.append(['backend', service.nsm_service.alba_backend.name])
                        else:
                            arakoon_downtime.append(['backend', service.abm_service.alba_backend.name])

            for package_name, package_info in storagerouter.package_information[key].iteritems():
                if package_name not in AlbaUpdateController.alba_plugin_packages:
                    continue  # Only gather information for the core packages
                information[key]['services_post_update'].update(package_info.pop('services_to_restart'))
                if package_name not in information[key]['packages']:
                    information[key]['packages'][package_name] = {}
                information[key]['packages'][package_name].update(package_info)

                # Per-package downtime / service impact
                if package_name == 'openvstorage-backend':
                    if ['gui', None] not in information[key]['downtime']:
                        information[key]['downtime'].append(['gui', None])
                    if ['api', None] not in information[key]['downtime']:
                        information[key]['downtime'].append(['api', None])
                    information[key]['services_stop_start'].update({'watcher-framework', 'memcached'})
                elif package_name == 'alba':
                    for down in arakoon_downtime:
                        if down not in information[key]['downtime']:
                            information[key]['downtime'].append(down)
                    information[key]['services_post_update'].update(arakoon_services)
                elif package_name == 'arakoon':
                    if key == 'framework':
                        framework_arakoons = set()
                        if arakoon_ovs_info['internal'] is True:
                            framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_ovs_info['name']))
                        if arakoon_cacc_info['internal'] is True:
                            framework_arakoons.add('ovs-arakoon-{0}'.format(arakoon_cacc_info['name']))

                        information[key]['services_post_update'].update(framework_arakoons)
                        if arakoon_ovs_info['down'] is True and ['ovsdb', None] not in information[key]['downtime']:
                            information[key]['downtime'].append(['ovsdb', None])
                    else:
                        for down in arakoon_downtime:
                            if down not in information[key]['downtime']:
                                information[key]['downtime'].append(down)
                        information[key]['services_post_update'].update(arakoon_services)

        # Merge in package information reported by the ALBA nodes themselves
        for alba_node in AlbaNodeList.get_albanodes():
            for package_name, package_info in alba_node.package_information.get(key, {}).iteritems():
                if package_name not in AlbaUpdateController.sdm_packages:
                    continue  # Only gather information for the SDM packages
                information[key]['services_post_update'].update(package_info.pop('services_to_restart'))
                if package_name not in information[key]['packages']:
                    information[key]['packages'][package_name] = {}
                information[key]['packages'][package_name].update(package_info)
    return information
def _deploy_stack_and_scrub(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    Deploys (or re-uses) an ALBA proxy on the scrub Storage Router, spawns worker threads
    that consume the queue, and tears the proxy down again afterwards
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information:
                       `scrub_path` with the path where to scrub
                       `storage_router` with the StorageRouter that needs to do the work
                       `partition_guid` with the guid of the partition used for scrubbing
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (by reference)
    :type error_messages: list
    :return: None
    :rtype: NoneType
    """
    # The first StorageDriver's config is used as template for the proxy's backend config below
    if len(vpool.storagedrivers) == 0 or not vpool.storagedrivers[0].storagedriver_id:
        error_messages.append('vPool {0} does not have any valid StorageDrivers configured'.format(vpool.name))
        return

    service_manager = ServiceFactory.get_manager()
    client = None
    lock_time = 5 * 60  # Max seconds to wait for the cluster-wide proxy deploy/remove mutex
    storagerouter = scrub_info['storage_router']
    partition_guid = scrub_info['partition_guid']
    # Service/directory/config names are unique per (vPool, StorageRouter, partition) combination
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_{2}_scrub'.format(vpool.name, storagerouter.name, partition_guid)
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, partition_guid)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, partition_guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, partition_guid)

    # Deploy a proxy
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
            if service_manager.has_service(name=alba_proxy_service, client=client) is True and service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                # An active proxy from a previous run can simply be re-used
                GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                # Reserve a free port (guarded per StorageRouter) and create a fresh scrub proxy config
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                with volatile_mutex('deploy_proxy_for_scrub_{0}'.format(storagerouter.guid), wait=30):
                    port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path(alba_proxy_service),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                service_manager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                service_manager.start_service(name=alba_proxy_service, client=client)
                GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point the backend connection(s) at the freshly deployed local proxy
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            if backend_config.get('backend_type') != 'MULTI':
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
            else:
                # MULTI: every nested dict is a separate connection entry that must be redirected
                for value in backend_config.itervalues():
                    if isinstance(value, dict):
                        value['alba_connection_host'] = '127.0.0.1'
                        value['alba_connection_port'] = scrub_config['port']
            # Copy backend connection manager information in separate key
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        # Deployment failed: record the error and best-effort clean up the partial proxy/config
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        GenericController._logger.exception(message)
        if client is not None and service_manager.has_service(name=alba_proxy_service, client=client) is True:
            if service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                service_manager.stop_service(name=alba_proxy_service, client=client)
            service_manager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    # Execute the actual scrubbing
    threads = []
    threads_key = '/ovs/framework/hosts/{0}/config|scrub_stack_threads'.format(storagerouter.machine_id)
    # Configurable thread count per StorageRouter, defaulting to 2
    amount_threads = Configuration.get(key=threads_key) if Configuration.exists(key=threads_key) else 2
    if not isinstance(amount_threads, int):
        error_messages.append('Amount of threads to spawn must be an integer for StorageRouter with ID {0}'.format(storagerouter.machine_id))
        return

    amount_threads = max(amount_threads, 1)  # Make sure amount_threads is at least 1
    amount_threads = min(min(queue.qsize(), amount_threads), 20)  # Make sure amount threads is max 20
    GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Spawning {2} threads for proxy service {3}'.format(vpool.name, storagerouter.name, amount_threads, alba_proxy_service))
    for index in range(amount_threads):
        thread = Thread(name='execute_scrub_{0}_{1}_{2}'.format(vpool.guid, partition_guid, index),
                        target=GenericController._execute_scrub,
                        args=(queue, vpool, scrub_info, scrub_directory, error_messages))
        thread.start()
        threads.append(thread)
    # Wait until the queue is fully drained by all workers
    for thread in threads:
        thread.join()

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if service_manager.has_service(alba_proxy_service, client=client):
                service_manager.stop_service(alba_proxy_service, client=client)
                service_manager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        GenericController._logger.exception(message)
def cluster_registry_checkup():
    """
    Verify whether changes have occurred in the cluster registry for each vPool.

    For every vPool the freshly calculated node configs of its StorageDrivers are
    compared against what the cluster registry currently holds. When a difference
    is found, the registry is rewritten and every reachable StorageDriver is asked
    to reload its config.

    :return: Information whether changes occurred, per vPool guid:
             {'changes': bool, 'success': bool, 'error': str (only on failure)}
    :rtype: dict
    """
    changed_vpools = {}
    for vpool in VPoolList.get_vpools():
        changed_vpools[vpool.guid] = {'changes': False,
                                      'success': True}
        try:
            StorageDriverController._logger.info('Validating cluster registry settings for Vpool {0}'.format(vpool.guid))

            current_configs = vpool.clusterregistry_client.get_node_configs()
            changes = len(current_configs) == 0  # An empty registry always counts as a change
            node_configs = []
            for sd in vpool.storagedrivers:
                # Force recalculation of the node config before comparing
                sd.invalidate_dynamics(['cluster_node_config'])
                new_config = sd.cluster_node_config
                node_configs.append(ClusterNodeConfig(**new_config))
                if changes is False:
                    current_node_configs = [config for config in current_configs if config.vrouter_id == sd.storagedriver_id]
                    if len(current_node_configs) == 1:
                        current_node_config = current_node_configs[0]
                        # Any differing attribute marks the whole registry as outdated
                        for key in new_config:
                            if getattr(current_node_config, key) != new_config[key]:
                                changes = True
                                break
            changed_vpools[vpool.guid]['changes'] = changes

            if changes is True:
                StorageDriverController._logger.info('Cluster registry settings for Vpool {0} needs to be updated'.format(vpool.guid))
                available_storagedrivers = []
                for sd in vpool.storagedrivers:
                    storagerouter = sd.storagerouter
                    try:
                        SSHClient(storagerouter, username='******')  # Connectivity probe; raises UnableToConnectException when the host is down
                        with remote(storagerouter.ip, [LocalStorageRouterClient]) as rem:
                            sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, sd.storagedriver_id)
                            if Configuration.exists(sd_key) is True:
                                path = Configuration.get_configuration_path(sd_key)
                                lsrc = rem.LocalStorageRouterClient(path)
                                lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                                available_storagedrivers.append(sd)
                    except UnableToConnectException:
                        StorageDriverController._logger.warning('StorageRouter {0} not available.'.format(storagerouter.name))
                    except Exception as ex:
                        if 'ClusterNotReachableException' in str(ex):
                            StorageDriverController._logger.warning('StorageDriver {0} on StorageRouter {1} not available.'.format(
                                sd.guid, storagerouter.name
                            ))
                        else:
                            StorageDriverController._logger.exception('Got exception when validating StorageDriver {0} on StorageRouter {1}.'.format(
                                sd.guid, storagerouter.name
                            ))

                StorageDriverController._logger.info('Updating cluster node configs for VPool {0}'.format(vpool.guid))
                vpool.clusterregistry_client.set_node_configs(node_configs)
                # Only reachable StorageDrivers are told to reload; unreachable ones pick the new registry up later
                for sd in available_storagedrivers:
                    StorageDriverController._logger.info('Trigger config reload for StorageDriver {0}'.format(sd.guid))
                    vpool.storagedriver_client.update_cluster_node_configs(str(sd.storagedriver_id), req_timeout_secs=10)
                StorageDriverController._logger.info('Updating cluster node configs for Vpool {0} completed'.format(vpool.guid))
            else:
                StorageDriverController._logger.info('Cluster registry settings for Vpool {0} is up to date'.format(vpool.guid))
        except Exception as ex:
            # Failure on one vPool must not abort the checkup of the others
            StorageDriverController._logger.exception('Got exception when validating cluster registry settings for Vpool {0}.'.format(vpool.name))
            changed_vpools[vpool.guid]['success'] = False
            changed_vpools[vpool.guid]['error'] = ex.message  # Python 2 exception attribute
    return changed_vpools
def remove_asd(node_guid, asd_id, expected_safety):
    """
    Removes an ASD from an ALBA node.

    Flow: locate the modeled OSD, verify the removal safety against the expected
    safety (unless explicitly skipped), purge the OSD from the backend, delete the
    ASD on the node itself (when reachable), clean up its configuration entry and
    model object, and finally resync the disks of the owning StorageRouter.

    :param node_guid: Guid of the node to remove an ASD from
    :type node_guid: str
    :param asd_id: ID of the ASD to remove
    :type asd_id: str
    :param expected_safety: Expected safety after having removed the ASD, or None to skip the safety check
    :type expected_safety: dict or None
    :return: Aliases of the disk on which the ASD was removed (empty when the node was unreachable)
    :rtype: list
    :raises RuntimeError: When the safety check fails, the node-side delete fails,
                          or the hosting disk cannot be found for a known partition
    """
    node = AlbaNode(node_guid)
    AlbaNodeController._logger.debug('Removing ASD {0} at node {1}'.format(asd_id, node.ip))

    # Locate the modeled OSD (and through it the backend) matching this ASD id
    model_osd = None
    for disk in node.disks:
        for asd in disk.osds:
            if asd.osd_id == asd_id:
                model_osd = asd
                break
        if model_osd is not None:
            break
    if model_osd is not None:
        alba_backend = model_osd.alba_backend
    else:
        alba_backend = None

    # Ask the node which partition hosts the ASD (best effort - the node may be down)
    asds = {}
    try:
        asds = node.client.get_asds()
    except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate ASD'.format(node.guid))
    partition_alias = None
    for alias, asd_ids in asds.iteritems():
        if asd_id in asd_ids:
            partition_alias = alias
            break

    if alba_backend is not None:
        if expected_safety is None:
            AlbaNodeController._logger.warning('Skipping safety check for ASD {0} on backend {1} - this is dangerous'.format(asd_id, alba_backend.guid))
        else:
            # Refuse removal when safety degraded beyond what the caller agreed to
            final_safety = AlbaController.calculate_safety(alba_backend_guid=alba_backend.guid,
                                                           removal_osd_ids=[asd_id])
            safety_lost = final_safety['lost']
            safety_crit = final_safety['critical']
            if (safety_crit != 0 or safety_lost != 0) and (safety_crit != expected_safety['critical'] or safety_lost != expected_safety['lost']):
                raise RuntimeError('Cannot remove ASD {0} as the current safety is not as expected ({1} vs {2})'.format(asd_id, final_safety, expected_safety))
            AlbaNodeController._logger.debug('Safety OK for ASD {0} on backend {1}'.format(asd_id, alba_backend.guid))
        AlbaNodeController._logger.debug('Purging ASD {0} on backend {1}'.format(asd_id, alba_backend.guid))
        AlbaController.remove_units(alba_backend_guid=alba_backend.guid, osd_ids=[asd_id])
    else:
        AlbaNodeController._logger.warning('Could not match ASD {0} to any backend. Cannot purge'.format(asd_id))

    disk_data = None
    if partition_alias is not None:
        AlbaNodeController._logger.debug('Removing ASD {0} from disk {1}'.format(asd_id, partition_alias))
        for device_info in node.client.get_disks().itervalues():
            if partition_alias in device_info['partition_aliases']:
                disk_data = device_info
                result = node.client.delete_asd(disk_id=device_info['aliases'][0].split('/')[-1],
                                                asd_id=asd_id)
                if result['_success'] is False:
                    raise RuntimeError('Error removing ASD: {0}'.format(result['_error']))
        # Fixed: previously 'disk_data == {}', which could never be true since
        # disk_data is either None or a populated device-info dict, leaving this
        # sanity check dead code
        if disk_data is None:
            raise RuntimeError('Failed to find disk for partition with alias {0}'.format(partition_alias))
    else:
        # Fixed: the original message had no placeholder, so the asd_id passed to
        # .format() was silently dropped from the log line
        AlbaNodeController._logger.warning('Could not remove ASD {0} from remote node (node down)'.format(asd_id))

    # NOTE(review): existence is checked on ASD_CONFIG but ASD_CONFIG_DIR is deleted;
    # presumably the dir contains the config key - confirm the two templates align
    if Configuration.exists(AlbaNodeController.ASD_CONFIG.format(asd_id), raw=True):
        Configuration.delete(AlbaNodeController.ASD_CONFIG_DIR.format(asd_id), raw=True)

    if model_osd is not None:
        model_osd.delete()
    if alba_backend is not None:
        alba_backend.invalidate_dynamics()
        alba_backend.backend.invalidate_dynamics()
    if node.storagerouter is not None:
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
    return [] if disk_data is None else disk_data.get('aliases', [])
def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed.
    This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...
    * Successfully finishing a piece of migration code, should create an entry in /ovs/framework/migration in case it should not be executed again
    *     Eg: /ovs/framework/migration|stats_monkey_integration: True
    """
    MigrationController._logger.info('Preparing out of band migrations...')

    from ovs.dal.lists.servicetypelist import ServiceTypeList
    from ovs.dal.lists.storagedriverlist import StorageDriverList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.vpoollist import VPoolList
    from ovs.extensions.db.arakooninstaller import ArakoonInstaller
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient
    from ovs_extensions.generic.toolbox import ExtensionsToolbox
    from ovs.extensions.migration.migration.ovsmigrator import ExtensionMigrator
    from ovs.extensions.packages.packagefactory import PackageFactory
    from ovs_extensions.services.interfaces.systemd import Systemd
    from ovs.extensions.services.servicefactory import ServiceFactory
    from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
    from ovs.lib.helpers.storagedriver.installer import StorageDriverInstaller

    MigrationController._logger.info('Start out of band migrations...')
    service_manager = ServiceFactory.get_manager()

    # Build one root SSH client per StorageRouter; every migration section below reuses this map
    sr_client_map = {}
    for storagerouter in StorageRouterList.get_storagerouters():
        sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter.ip,  # Is triggered during post-update code too during which the ovs-watcher-framework service is still down and thus not refreshing the heartbeat --> use IP i/o StorageRouter
                                                      username='******')

    #########################################################
    # Addition of 'ExecReload' for AlbaProxy SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagedriver in StorageDriverList.get_storagedrivers():
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                service = alba_proxy.service
                service_name = 'ovs-{0}'.format(service.name)
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue  # Unit file already contains the directive
                try:
                    service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    ##################################################################
    # Adjustment of open file descriptors for Arakoon services to 8192
    changed_clients = set()
    for storagerouter in StorageRouterList.get_storagerouters():
        root_client = sr_client_map[storagerouter.guid]
        for service_name in service_manager.list_services(client=root_client):
            if not service_name.startswith('ovs-arakoon-'):
                continue
            # Unit-file location and marker differ between systemd and upstart
            if ServiceFactory.get_service_type() == 'systemd':
                path = '/lib/systemd/system/{0}.service'.format(service_name)
                check = 'LimitNOFILE=8192'
            else:
                path = '/etc/init/{0}.conf'.format(service_name)
                check = 'limit nofile 8192 8192'
            if not root_client.file_exists(path):
                continue
            if check in root_client.file_read(path):
                continue
            try:
                service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                changed_clients.add(root_client)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='arakoon',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
            except:
                MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
    for root_client in changed_clients:
        root_client.run(['systemctl', 'daemon-reload'])

    #############################
    # Migrate to multiple proxies
    for storagedriver in StorageDriverList.get_storagedrivers():
        vpool = storagedriver.vpool
        root_client = sr_client_map[storagedriver.storagerouter_guid]
        for alba_proxy in storagedriver.alba_proxies:
            # Rename alba_proxy service in model: 'albaproxy_<vpool>' becomes 'albaproxy_<vpool>_0'
            service = alba_proxy.service
            old_service_name = 'albaproxy_{0}'.format(vpool.name)
            new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
            if old_service_name != service.name:
                continue  # Already migrated
            service.name = new_service_name
            service.save()

            if not service_manager.has_service(name=old_service_name, client=root_client):
                continue
            old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
            if not Configuration.exists(key=old_configuration_key):
                continue

            # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='alba',
                                                old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, old_service_name),
                                                new_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, new_service_name))

            # Register new service and remove old service
            service_manager.add_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                        client=root_client,
                                        params=Configuration.get(old_configuration_key),
                                        target_name='ovs-{0}'.format(new_service_name))

            # Update scrub proxy config
            proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
            proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
            if proxy_config is not None:
                fragment_cache = proxy_config.get(StorageDriverConfiguration.CACHE_FRAGMENT, ['none', {}])
                if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                    # The scrub proxy inherits the fragment cache settings, but with reads disabled
                    fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                    fragment_cache_scrub_info[1]['cache_on_read'] = False
                    proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                    proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                    if proxy_scrub_config is not None and proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] == ['none']:
                        proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] = fragment_cache_scrub_info
                        Configuration.set(key=proxy_scrub_config_key, value=proxy_scrub_config)

        # Update 'backend_connection_manager' section
        changes = False
        storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
        if 'backend_connection_manager' not in storagedriver_config.configuration:
            continue

        current_config = storagedriver_config.configuration['backend_connection_manager']
        if current_config.get('backend_type') != 'MULTI':
            # Convert the single-proxy config into a MULTI config: one numbered sub-section per proxy
            changes = True
            backend_connection_manager = {'backend_type': 'MULTI'}
            for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                # 'backend_interface' keys live at the top level of the MULTI config, not per proxy
                # noinspection PyUnresolvedReferences
                for key, value in backend_connection_manager[str(index)].items():
                    if key.startswith('backend_interface'):
                        backend_connection_manager[key] = value
                        # noinspection PyUnresolvedReferences
                        del backend_connection_manager[str(index)][key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    backend_connection_manager[key] = value
        else:
            # Already MULTI: only hoist stray 'backend_interface' keys and add missing retry defaults
            backend_connection_manager = current_config
            for value in backend_connection_manager.values():
                if isinstance(value, dict):
                    for key, val in value.items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = val
                            changes = True
                            del value[key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    changes = True
                    backend_connection_manager[key] = value

        if changes is True:
            storagedriver_config.clear_backend_connection_manager()
            storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
            storagedriver_config.save(root_client)

            # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='volumedriver',
                                                old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
    if service_manager.__class__ == Systemd:
        root_client.run(['systemctl', 'daemon-reload'])

    ########################################
    # Update metadata_store_bits information
    vpools = VPoolList.get_vpools()
    for vpool in vpools:
        bits = None
        for storagedriver in vpool.storagedrivers:
            key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
            if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                if bits is None:
                    # Extract the value once from the service file; default to 5 when absent or non-numeric
                    entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                        client=sr_client_map[storagedriver.storagerouter_guid],
                                                                        entries=['METADATASTORE_BITS='])
                    if len(entries) == 1:
                        bits = entries[0].split('=')[-1]
                        bits = int(bits) if bits.isdigit() else 5
                if bits is not None:
                    try:
                        content = Configuration.get(key=key)
                        content['METADATASTORE_BITS'] = bits
                        Configuration.set(key=key, value=content)
                    except:
                        MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))
        if bits is not None:
            vpool.metadata_store_bits = bits
            vpool.save()

    #####################################
    # Update the vPool metadata structure
    def _update_metadata_structure(metadata):
        # Convert the old flat vPool metadata layout into the new 'caching_info' layout.
        # Works on a deep copy; pops keys from it to mutate the structure in place.
        metadata = copy.deepcopy(metadata)
        cache_structure = {'read': False,
                           'write': False,
                           'is_backend': False,
                           'quota': None,
                           'backend_info': {'name': None,  # Will be filled in when is_backend is true
                                            'backend_guid': None,
                                            'alba_backend_guid': None,
                                            'policies': None,
                                            'preset': None,
                                            'arakoon_config': None,
                                            'connection_info': {'client_id': None,
                                                                'client_secret': None,
                                                                'host': None,
                                                                'port': None,
                                                                'local': None}}
                           }
        structure_map = {StorageDriverConfiguration.CACHE_BLOCK: {'read': 'block_cache_on_read',
                                                                  'write': 'block_cache_on_write',
                                                                  'quota': 'quota_bc',
                                                                  'backend_prefix': 'backend_bc_{0}'},
                         StorageDriverConfiguration.CACHE_FRAGMENT: {'read': 'fragment_cache_on_read',
                                                                     'write': 'fragment_cache_on_write',
                                                                     'quota': 'quota_fc',
                                                                     'backend_prefix': 'backend_aa_{0}'}}
        if 'arakoon_config' in metadata['backend']:  # Arakoon config should be placed under the backend info
            metadata['backend']['backend_info']['arakoon_config'] = metadata['backend'].pop('arakoon_config')
        if 'connection_info' in metadata['backend']:  # Connection info should be placed under the backend info
            metadata['backend']['backend_info']['connection_info'] = metadata['backend'].pop('connection_info')
        if 'caching_info' not in metadata:  # Caching info is the new key
            would_be_caching_info = {}
            metadata['caching_info'] = would_be_caching_info
            # Extract all caching data for every storagerouter
            current_caching_info = metadata['backend'].pop('caching_info')  # Pop to mutate metadata
            for storagerouter_guid in current_caching_info.iterkeys():
                current_cache_data = current_caching_info[storagerouter_guid]
                storagerouter_caching_info = {}
                would_be_caching_info[storagerouter_guid] = storagerouter_caching_info
                for cache_type, cache_type_mapping in structure_map.iteritems():
                    new_cache_structure = copy.deepcopy(cache_structure)
                    storagerouter_caching_info[cache_type] = new_cache_structure
                    for new_structure_key, old_structure_key in cache_type_mapping.iteritems():
                        if new_structure_key == 'backend_prefix':
                            # Get possible backend related info
                            metadata_key = old_structure_key.format(storagerouter_guid)
                            if metadata_key not in metadata:
                                continue
                            backend_data = metadata.pop(metadata_key)  # Pop to mutate metadata
                            new_cache_structure['is_backend'] = True
                            # Copy over the old data
                            new_cache_structure['backend_info']['arakoon_config'] = backend_data['arakoon_config']
                            new_cache_structure['backend_info'].update(backend_data['backend_info'])
                            new_cache_structure['backend_info']['connection_info'].update(backend_data['connection_info'])
                        else:
                            new_cache_structure[new_structure_key] = current_cache_data.get(old_structure_key)
        return metadata

    vpools = VPoolList.get_vpools()
    for vpool in vpools:
        try:
            new_metadata = _update_metadata_structure(vpool.metadata)
            vpool.metadata = new_metadata
            vpool.save()
        except KeyError:
            MigrationController._logger.exception('Exceptions occurred when updating the metadata for vPool {0}'.format(vpool.name))

    ##############################################
    # Always use indent=4 during Configuration set
    def _resave_all_config_entries(config_path='/ovs'):
        """
        Recursive function which checks every config management key whether it is a directory or not.
        If not a directory, we retrieve the config and just save it again using the new indentation logic
        """
        for item in Configuration.list(config_path):
            new_path = config_path + '/' + item
            print new_path  # NOTE(review): leftover debug print statement (Python 2) - consider removing
            if Configuration.dir_exists(new_path) is True:
                _resave_all_config_entries(config_path=new_path)
            else:
                try:
                    _config = Configuration.get(new_path)
                    Configuration.set(new_path, _config)
                except:
                    # Values that are not valid JSON are re-saved raw
                    _config = Configuration.get(new_path, raw=True)
                    Configuration.set(new_path, _config, raw=True)
    if ExtensionMigrator.THIS_VERSION <= 13:  # There is no way of checking whether this new indentation logic has been applied, so we only perform this for version 13 and lower
        MigrationController._logger.info('Re-saving every configuration setting with new indentation rules')
        _resave_all_config_entries()

    ############################
    # Update some default values
    def _update_manifest_cache_size(_proxy_config_key):
        # Bump the manifest cache size (global and per cache type) of a proxy config to 500MiB.
        # Returns True when the configuration was actually changed.
        updated = False
        manifest_cache_size = 500 * 1024 * 1024
        if Configuration.exists(key=_proxy_config_key):
            _proxy_config = Configuration.get(key=_proxy_config_key)
            for cache_type in [StorageDriverConfiguration.CACHE_BLOCK, StorageDriverConfiguration.CACHE_FRAGMENT]:
                if cache_type in _proxy_config and _proxy_config[cache_type][0] == 'alba':
                    if _proxy_config[cache_type][1]['manifest_cache_size'] != manifest_cache_size:
                        updated = True
                        _proxy_config[cache_type][1]['manifest_cache_size'] = manifest_cache_size
            if _proxy_config['manifest_cache_size'] != manifest_cache_size:
                updated = True
                _proxy_config['manifest_cache_size'] = manifest_cache_size
            if updated is True:
                Configuration.set(key=_proxy_config_key, value=_proxy_config)
        return updated

    for storagedriver in StorageDriverList.get_storagedrivers():
        try:
            vpool = storagedriver.vpool
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            _update_manifest_cache_size('/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))  # Generic scrub proxy is deployed every time scrubbing kicks in, so no need to restart these services
            for alba_proxy in storagedriver.alba_proxies:
                if _update_manifest_cache_size('/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)) is True:
                    # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='alba',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, alba_proxy.service.name))

            # Update 'backend_connection_manager' section
            changes = False
            storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
            if 'backend_connection_manager' not in storagedriver_config.configuration:
                continue

            current_config = storagedriver_config.configuration['backend_connection_manager']
            # Numbered keys are the per-proxy sub-sections of a MULTI config
            for key, value in current_config.iteritems():
                if key.isdigit() is True:
                    if value.get('alba_connection_asd_connection_pool_capacity') != 10:
                        changes = True
                        value['alba_connection_asd_connection_pool_capacity'] = 10
                    if value.get('alba_connection_timeout') != 30:
                        changes = True
                        value['alba_connection_timeout'] = 30
                    if value.get('alba_connection_rora_manifest_cache_capacity') != 25000:
                        changes = True
                        value['alba_connection_rora_manifest_cache_capacity'] = 25000

            if changes is True:
                storagedriver_config.clear_backend_connection_manager()
                storagedriver_config.configure_backend_connection_manager(**current_config)
                storagedriver_config.save(root_client)

                # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='volumedriver',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
        except Exception:
            MigrationController._logger.exception('Updating default configuration values failed for StorageDriver {0}'.format(storagedriver.storagedriver_id))

    ####################################################
    # Adding proxy fail fast as env variable for proxies
    changed_clients = set()
    for storagerouter in StorageRouterList.get_storagerouters():
        root_client = sr_client_map[storagerouter.guid]
        for service_name in service_manager.list_services(client=root_client):
            if not service_name.startswith('ovs-albaproxy_'):
                continue
            if ServiceFactory.get_service_type() == 'systemd':
                path = '/lib/systemd/system/{0}.service'.format(service_name)
                check = 'Environment=ALBA_FAIL_FAST=true'
            else:
                path = '/etc/init/{0}.conf'.format(service_name)
                check = 'env ALBA_FAIL_FAST=true'
            if not root_client.file_exists(path):
                continue
            if check in root_client.file_read(path):
                continue
            try:
                service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                changed_clients.add(root_client)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='alba',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
            except:
                MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
    for root_client in changed_clients:
        root_client.run(['systemctl', 'daemon-reload'])

    ######################################
    # Integration of stats monkey (2.10.2)
    if Configuration.get(key='/ovs/framework/migration|stats_monkey_integration', default=False) is False:
        try:
            # Get content of old key into new key
            old_stats_monkey_key = '/statsmonkey/statsmonkey'
            if Configuration.exists(key=old_stats_monkey_key) is True:
                Configuration.set(key='/ovs/framework/monitoring/stats_monkey', value=Configuration.get(key=old_stats_monkey_key))
                Configuration.delete(key=old_stats_monkey_key)

            # Make sure to disable the stats monkey by default or take over the current schedule if it was configured manually before
            celery_key = '/ovs/framework/scheduling/celery'
            current_value = None
            scheduling_config = Configuration.get(key=celery_key, default={})
            if 'statsmonkey.run_all_stats' in scheduling_config:  # Old celery task name of the stats monkey
                current_value = scheduling_config.pop('statsmonkey.run_all_stats')
            scheduling_config['ovs.stats_monkey.run_all'] = current_value
            scheduling_config['alba.stats_monkey.run_all'] = current_value
            Configuration.set(key=celery_key, value=scheduling_config)

            support_key = '/ovs/framework/support'
            support_config = Configuration.get(key=support_key)
            # Rename the old support flags to their new names
            support_config['support_agent'] = support_config.pop('enabled', True)
            support_config['remote_access'] = support_config.pop('enablesupport', False)
            Configuration.set(key=support_key, value=support_config)

            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|stats_monkey_integration', value=True)
        except Exception:
            MigrationController._logger.exception('Integration of stats monkey failed')

    ######################################################
    # Write away cluster ID to a file for back-up purposes
    try:
        cluster_id = Configuration.get(key='/ovs/framework/cluster_id', default=None)
        with open(Configuration.CONFIG_STORE_LOCATION, 'r') as config_file:
            config = json.load(config_file)
        if cluster_id is not None and config.get('cluster_id', None) is None:
            config['cluster_id'] = cluster_id
            with open(Configuration.CONFIG_STORE_LOCATION, 'w') as config_file:
                json.dump(config, config_file, indent=4)
    except Exception:
        MigrationController._logger.exception('Writing cluster id to a file failed.')

    #########################################################
    # Additional string formatting in Arakoon services (2.11)
    try:
        if Configuration.get(key='/ovs/framework/migration|arakoon_service_update', default=False) is False:
            arakoon_service_names = [ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name) for cluster_name in Configuration.list(key='ovs/arakoon')]
            for storagerouter in StorageRouterList.get_masters():
                for service_name in arakoon_service_names:
                    config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                    if Configuration.exists(key=config_key):
                        config = Configuration.get(key=config_key)
                        config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                        config['ARAKOON_PKG_NAME'] = PackageFactory.PKG_ARAKOON
                        config['ARAKOON_VERSION_CMD'] = PackageFactory.VERSION_CMD_ARAKOON
                        Configuration.set(key=config_key, value=config)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|arakoon_service_update', value=True)
    except Exception:
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    ############################################################
    # Additional string formatting in ALBA proxy services (2.11)
    changed_clients = set()
    try:
        if Configuration.get(key='/ovs/framework/migration|alba_proxy_service_update', default=False) is False:
            alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
            for service in ServiceTypeList.get_by_name('AlbaProxy').services:
                root_client = sr_client_map[service.storagerouter_guid]
                config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(service.storagerouter.machine_id, service.name)
                if Configuration.exists(key=config_key):
                    config = Configuration.get(key=config_key)
                    config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                    config['ALBA_PKG_NAME'] = alba_pkg_name
                    config['ALBA_VERSION_CMD'] = alba_version_cmd
                    Configuration.set(key=config_key, value=config)
                    service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                                       client=root_client,
                                                       target_name='ovs-{0}'.format(service.name))
                    changed_clients.add(root_client)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|alba_proxy_service_update', value=True)
    except Exception:
        # NOTE(review): message says 'Arakoon' but this section handles ALBA proxy services - likely a copy-paste slip
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    ############################################################
    # Additional string formatting in DTL/VOLDRV services (2.11)
    try:
        if Configuration.get(key='/ovs/framework/migration|voldrv_service_update', default=False) is False:
            sd_pkg_name, sd_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
            for vpool in VPoolList.get_vpools():
                for storagedriver in vpool.storagedrivers:
                    root_client = sr_client_map[storagedriver.storagerouter_guid]
                    for entry in ['dtl', 'volumedriver']:
                        service_name = '{0}_{1}'.format(entry, vpool.name)
                        service_template = StorageDriverInstaller.SERVICE_TEMPLATE_DTL if entry == 'dtl' else StorageDriverInstaller.SERVICE_TEMPLATE_SD
                        config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagedriver.storagerouter.machine_id, service_name)
                        if Configuration.exists(key=config_key):
                            config = Configuration.get(key=config_key)
                            config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                            config['VOLDRV_PKG_NAME'] = sd_pkg_name
                            config['VOLDRV_VERSION_CMD'] = sd_version_cmd
                            Configuration.set(key=config_key, value=config)
                            service_manager.regenerate_service(name=service_template,
                                                               client=root_client,
                                                               target_name='ovs-{0}'.format(service_name))
                            changed_clients.add(root_client)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|voldrv_service_update', value=True)
    except Exception:
        # NOTE(review): message says 'Arakoon' but this section handles DTL/VOLDRV services - likely a copy-paste slip
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    #######################################################
    # Storing actual package name in version files (2.11.0) (https://github.com/openvstorage/framework/issues/1876)
    if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file', default=False) is False:
        try:
            voldrv_pkg_name, _ = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
            for storagerouter in StorageRouterList.get_storagerouters():
                root_client = sr_client_map.get(storagerouter.guid)
                if root_client is None:
                    continue
                for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                    if not file_name.endswith('.version'):
                        continue
                    file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                    contents = root_client.file_read(filename=file_path)
                    regenerate = False
                    # Replace the generic 'volumedriver-server' token with the actual installed package name
                    if voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER:
                        if 'volumedriver-server' in contents:
                            regenerate = True
                            contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER)
                            root_client.file_write(filename=file_path, contents=contents)
                    elif voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER_EE:
                        if 'volumedriver-server' in contents or PackageFactory.PKG_VOLDRV_SERVER in contents:
                            regenerate = True
                            contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER_EE)
                            contents = contents.replace(PackageFactory.PKG_VOLDRV_SERVER, PackageFactory.PKG_VOLDRV_SERVER_EE)
                            root_client.file_write(filename=file_path, contents=contents)
                    if regenerate is True:
                        service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_DTL if file_name.startswith('dtl') else StorageDriverInstaller.SERVICE_TEMPLATE_SD,
                                                           client=root_client,
                                                           target_name='ovs-{0}'.format(file_name.split('.')[0]))  # Leave out .version
                        changed_clients.add(root_client)
            Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file', value=True)
        except Exception:
            MigrationController._logger.exception('Updating actual package name for version files failed')

    # Reload systemd on every client touched by the three sections above
    for root_client in changed_clients:
        try:
            root_client.run(['systemctl', 'daemon-reload'])
        except Exception:
            MigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

    #########################################################
    # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for AlbaProxy SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagedriver in StorageDriverList.get_storagedrivers():
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                service = alba_proxy.service
                service_name = 'ovs-{0}'.format(service.name)
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    #########################################################
    # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for Arakoon SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-arakoon-'):
                    continue
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    MigrationController._logger.info('Finished out of band migrations')
def validate_alba_backend_sanity_without_claimed_disks(alba_backend):
    """
    Validate whether the ALBA backend is configured correctly.

    Checks, in order: model attributes of the backend, the amount of ABM/NSM
    services versus the MASTER StorageRouters with a DB role, the configuration
    management tree layout for the backend and the ASD nodes, the Arakoon
    config keys, the amount of maintenance agents and the deployed Arakoon
    services (including a master election check).

    :param alba_backend: ALBA backend to validate
    :type alba_backend: ovs.dal.hybrids.albabackend.AlbaBackend (presumably - TODO confirm)
    :return: None
    :raises AssertionError: on the first validation that fails
    """
    # Attribute validation
    assert alba_backend.available is True, \
        'ALBA backend {0} is not available'.format(alba_backend.backend.name)
    assert len(alba_backend.presets) >= 1, \
        'No preset found for ALBA backend {0}'.format(alba_backend.backend.name)
    # Exactly one preset must be flagged as the default
    assert len([default for default in alba_backend.presets if default['is_default'] is True]) == 1, \
        'Could not find default preset for backend {0}'.format(alba_backend.backend.name)
    assert alba_backend.backend.backend_type.code == 'alba', \
        'Backend type for ALBA backend is {0}'.format(alba_backend.backend.backend_type.code)
    assert alba_backend.backend.status == 'RUNNING', \
        'Status for ALBA backend is {0}'.format(alba_backend.backend.status)

    # Validate ABM and NSM services: one of each is expected per MASTER node carrying the DB role
    storagerouters = GeneralStorageRouter.get_storage_routers()
    storagerouters_with_db_role = [sr for sr in storagerouters
                                   if GeneralStorageRouter.has_roles(storagerouter=sr, roles='DB') is True
                                   and sr.node_type == 'MASTER']
    assert len(alba_backend.abm_services) == len(storagerouters_with_db_role), \
        'Not enough ABM services found'
    assert len(alba_backend.nsm_services) == len(storagerouters_with_db_role), \
        'Not enough NSM services found'

    # Validate ALBA backend configuration structure
    alba_backend_key = '/ovs/alba/backends'
    assert Configuration.dir_exists(key=alba_backend_key) is True, \
        'Configuration does not contain key {0}'.format(alba_backend_key)

    actual_config_keys = [key for key in Configuration.list(alba_backend_key)]
    expected_config_keys = ['global_gui_error_interval', alba_backend.guid, 'default_nsm_hosts']
    optional_config_keys = ['verification_factor']
    # expected_keys_amount counts the non-guid mandatory keys plus whichever optional keys are present;
    # guid-shaped keys (one per backend) are excluded so unrelated backends don't fail the count below
    expected_keys_amount = 0
    for optional_key in optional_config_keys:
        if optional_key in actual_config_keys:
            expected_keys_amount += 1
    for expected_key in expected_config_keys:
        if not re.match(Toolbox.regex_guid, expected_key):
            expected_keys_amount += 1
        assert expected_key in actual_config_keys, \
            'Key {0} was not found in tree {1}'.format(expected_key, alba_backend_key)
    for actual_key in list(actual_config_keys):
        if re.match(Toolbox.regex_guid, actual_key):
            actual_config_keys.remove(actual_key)  # Remove all alba backend keys
    assert len(actual_config_keys) == expected_keys_amount, \
        'Another key was added to the {0} tree'.format(alba_backend_key)

    # The backend's own subtree must contain exactly a 'maintenance' branch
    this_alba_backend_key = '{0}/{1}'.format(alba_backend_key, alba_backend.guid)
    actual_keys = [key for key in Configuration.list(this_alba_backend_key)]
    expected_keys = ['maintenance']
    assert actual_keys == expected_keys, \
        'Actual keys: {0} - Expected keys: {1}'.format(actual_keys, expected_keys)

    maintenance_key = '{0}/maintenance'.format(this_alba_backend_key)
    actual_keys = [key for key in Configuration.list(maintenance_key)]
    expected_keys = ['nr_of_agents', 'config']
    assert set(actual_keys) == set(expected_keys), \
        'Actual keys: {0} - Expected keys: {1}'.format(actual_keys, expected_keys)
    # @TODO: Add validation for config values

    # Validate ASD node configuration structure: 1 config entry per modeled ALBA node
    alba_nodes = GeneralAlba.get_alba_nodes()
    assert len(alba_nodes) > 0, \
        'Could not find any ALBA nodes in the model'
    alba_node_key = '/ovs/alba/asdnodes'
    actual_keys = [key for key in Configuration.list(alba_node_key)]
    assert len(alba_nodes) == len(actual_keys), \
        'Amount of ALBA nodes in model: {0} >< amount of ALBA nodes in configuration: {1}.'.format(len(alba_nodes), len(actual_keys))
    for alba_node in alba_nodes:
        assert alba_node.node_id in actual_keys, \
            'ALBA node with ID {0} not present in configuration'.format(alba_node.node_id)
        actual_asdnode_keys = [key for key in Configuration.list('{0}/{1}'.format(alba_node_key, alba_node.node_id))]
        expected_asdnode_keys = ['config', 'services']
        assert actual_asdnode_keys == expected_asdnode_keys, \
            'Actual keys: {0} - Expected keys: {1}'.format(actual_asdnode_keys, expected_asdnode_keys)
        actual_config_keys = [key for key in Configuration.list('{0}/{1}/config'.format(alba_node_key, alba_node.node_id))]
        expected_config_keys = ['main', 'network']
        assert set(actual_config_keys) == set(expected_config_keys), \
            'Actual keys: {0} - Expected keys: {1}'.format(actual_config_keys, expected_config_keys)
        # @TODO: Add validation for main and network values

    # Validate Arakoon configuration structure
    # NOTE(review): .replace() is applied to the formatted key, not the service name,
    # so 'arakoon-' is stripped from the full path - confirm this matches the intended key layout
    arakoon_abm_key = '/ovs/arakoon/{0}/config'.format(alba_backend.abm_services[0].service.name).replace('arakoon-', '')
    arakoon_nsm_key = '/ovs/arakoon/{0}/config'.format(alba_backend.nsm_services[0].service.name).replace('arakoon-', '')
    assert Configuration.exists(key=arakoon_abm_key, raw=True) is True, \
        'Configuration key {0} does not exist'.format(arakoon_abm_key)
    assert Configuration.exists(key=arakoon_nsm_key, raw=True) is True, \
        'Configuration key {0} does not exist'.format(arakoon_nsm_key)
    # @TODO: Add validation for config values

    # Validate maintenance agents: flatten the per-node service lists and count them
    actual_amount_agents = len([service for node_services in [alba_node.client.list_maintenance_services() for alba_node in alba_nodes]
                                for service in node_services])
    expected_amount_agents = 1
    assert actual_amount_agents == expected_amount_agents, \
        'Amount of maintenance agents is incorrect. Found {0} - Expected {1}'.format(actual_amount_agents, expected_amount_agents)

    # Validate arakoon services: ABM and NSM must be deployed and running on every DB-role master
    machine_ids = [sr.machine_id for sr in storagerouters_with_db_role]
    abm_service_name = alba_backend.abm_services[0].service.name
    nsm_service_name = alba_backend.nsm_services[0].service.name
    for storagerouter in storagerouters_with_db_role:
        root_client = SSHClient(endpoint=storagerouter, username='******')
        for service_name in [abm_service_name, nsm_service_name]:
            assert GeneralService.has_service(name=service_name, client=root_client) is True, \
                'Service {0} not deployed on Storage Router {1}'.format(service_name, storagerouter.name)
            # NOTE(review): despite the name, 'exitcode' is compared against True - presumably
            # get_service_status() returns a (running, output) tuple; confirm against GeneralService
            exitcode, output = GeneralService.get_service_status(name=service_name, client=root_client)
            assert exitcode is True, \
                'Service {0} not running on Storage Router {1} - {2}'.format(service_name, storagerouter.name, output)
    # The elected Arakoon master of the ABM cluster must be one of the DB-role masters
    out, err, _ = General.execute_command('arakoon --who-master -config {0}'.format(
        Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(abm_service_name.replace('arakoon-', '')))))
    assert out.strip() in machine_ids, \
        'Arakoon master is {0}, but should be 1 of "{1}"'.format(out.strip(), ', '.join(machine_ids))
def shrink_vpool(cls, storagedriver_guid, offline_storage_router_guids=None):
    """
    Removes a StorageDriver (if it's the last StorageDriver for a vPool, the vPool is removed as well)

    :param storagedriver_guid: Guid of the StorageDriver to remove
    :type storagedriver_guid: str
    :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                         WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS.
                                         Defaults to an empty list when not provided.
    :type offline_storage_router_guids: list
    :return: None
    :rtype: NoneType
    """
    # TODO: Add logging
    # TODO: Unit test individual pieces of code
    # Fix: the original signature used a mutable default argument (=list());
    # use None as sentinel and build a fresh list per call instead
    if offline_storage_router_guids is None:
        offline_storage_router_guids = []

    # Validations
    storagedriver = StorageDriver(storagedriver_guid)
    storagerouter = storagedriver.storagerouter
    cls._logger.info('StorageDriver {0} - Deleting StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
    vp_installer.validate(storagedriver=storagedriver)

    sd_installer = StorageDriverInstaller(vp_installer=vp_installer, storagedriver=storagedriver)

    cls._logger.info('StorageDriver {0} - Checking availability of related StorageRouters'.format(storagedriver.guid, storagedriver.name))
    sr_client_map = SSHClient.get_clients(endpoints=[sd.storagerouter for sd in vp_installer.vpool.storagedrivers],
                                          user_names=['root'])
    sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(storagerouter, {}).get('root'),
                                          storagerouter=storagerouter,
                                          vp_installer=vp_installer,
                                          sd_installer=sd_installer)

    # The set of unreachable StorageRouters must match exactly what the caller declared offline
    offline_srs = sr_client_map.pop('offline')
    if sorted([sr.guid for sr in offline_srs]) != sorted(offline_storage_router_guids):
        raise RuntimeError('Not all StorageRouters are reachable')

    if storagerouter not in offline_srs:
        # Refuse to delete while any process keeps the vPool mount point open
        mtpt_pids = sr_installer.root_client.run("lsof -t +D '/mnt/{0}' || true".format(vp_installer.name.replace(r"'", r"'\''")),
                                                 allow_insecure=True).splitlines()
        if len(mtpt_pids) > 0:
            raise RuntimeError('vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'.format(', '.join(mtpt_pids)))

    # Retrieve reachable StorageDrivers
    reachable_storagedrivers = []
    for sd in vp_installer.vpool.storagedrivers:
        if sd.storagerouter not in sr_client_map:
            # StorageRouter is offline
            continue
        sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vp_installer.vpool.guid, sd.storagedriver_id)
        if Configuration.exists(sd_key) is True:
            path = Configuration.get_configuration_path(sd_key)
            with remote(sd.storagerouter.ip, [LocalStorageRouterClient]) as rem:
                try:
                    lsrc = rem.LocalStorageRouterClient(path)
                    lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                    cls._logger.info('StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'.format(storagedriver.guid, sd.name, sd.storagerouter.ip))
                    reachable_storagedrivers.append(sd)
                except Exception as exception:
                    # Only swallow connection failures; anything else is unexpected
                    if not is_connection_failure(exception):
                        raise
    if len(reachable_storagedrivers) == 0:
        raise RuntimeError('Could not find any responsive node in the cluster')

    # Start removal
    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
    else:
        vp_installer.update_status(status=VPool.STATUSES.DELETING)

    # Clean up stale vDisks
    cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(storagedriver.guid))
    VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

    # Reconfigure the MDSes: move MDS masters away from the StorageRouter being removed
    cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(storagedriver.guid))
    for vdisk_guid in storagerouter.vdisks_guids:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk_guid,
                                               excluded_storagerouter_guids=[storagerouter.guid] + offline_storage_router_guids)
        except Exception:
            cls._logger.exception('StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'.format(storagedriver.guid, vdisk_guid))

    # Validate that all MDSes on current StorageRouter have been moved away
    # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
    vdisks = []
    for mds in vp_installer.mds_services:
        for junction in mds.vdisks:
            vdisk = junction.vdisk
            if vdisk in vdisks:
                continue
            vdisks.append(vdisk)
            cls._logger.critical('StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'.format(storagedriver.guid, vdisk.guid, vdisk.name))
    if len(vdisks) > 0:
        # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        raise RuntimeError('Not all MDS Services have been successfully migrated away')

    # Start with actual removal
    # NOTE(review): starting from False, '&=' can never turn errors_found True from the
    # helpers' return values - presumably '|=' (or inverted success flags) was intended.
    # Left unchanged pending confirmation of the helpers' return conventions.
    errors_found = False
    if storagerouter not in offline_srs:
        errors_found &= sd_installer.stop_services()

    errors_found &= vp_installer.configure_cluster_registry(exclude=[storagedriver], apply_on=reachable_storagedrivers)
    errors_found &= vp_installer.update_node_distance_map()
    errors_found &= vp_installer.remove_mds_services()
    errors_found &= sd_installer.clean_config_management()
    errors_found &= sd_installer.clean_model()

    if storagerouter not in offline_srs:
        errors_found &= sd_installer.clean_directories(mountpoints=StorageRouterController.get_mountpoints(client=sr_installer.root_client))
        try:
            DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)
        except Exception:
            cls._logger.exception('StorageDriver {0} - Synchronizing disks with reality failed'.format(storagedriver.guid))
            errors_found = True

    if vp_installer.storagedriver_amount > 1:
        # Update the vPool metadata and run DTL checkup
        vp_installer.vpool.metadata['caching_info'].pop(sr_installer.storagerouter.guid, None)
        vp_installer.vpool.save()
        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
        except Exception:
            cls._logger.exception('StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'.format(storagedriver.guid, vp_installer.name, vp_installer.vpool.guid))
    else:
        cls._logger.info('StorageDriver {0} - Removing vPool from model'.format(storagedriver.guid))
        # Clean up model
        try:
            vp_installer.vpool.delete()
        except Exception:
            errors_found = True
            cls._logger.exception('StorageDriver {0} - Cleaning up vPool from the model failed'.format(storagedriver.guid))
        Configuration.delete('/ovs/vpools/{0}'.format(vp_installer.vpool.guid))

    cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(storagedriver.guid))
    try:
        MDSServiceController.mds_checkup()
    except Exception:
        cls._logger.exception('StorageDriver {0} - MDS checkup failed'.format(storagedriver.guid))

    # Update vPool status
    if errors_found is True:
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise RuntimeError('1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information')

    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('StorageDriver {0} - Deleted StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    # When no vPools are left, an internally managed voldrv Arakoon cluster has no purpose anymore
    if len(VPoolList.get_vpools()) == 0:
        cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
        if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)['internal'] is True:
            cls._logger.debug('StorageDriver {0} - Removing Arakoon cluster {1}'.format(storagedriver.guid, cluster_name))
            try:
                installer = ArakoonInstaller(cluster_name=cluster_name)
                installer.load()
                installer.delete_cluster()
            except Exception:
                cls._logger.exception('StorageDriver {0} - Delete voldrv Arakoon cluster failed'.format(storagedriver.guid))
            service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
            for service in list(service_type.services):
                if service.name == service_name:
                    service.delete()

    # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
    if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
        try:
            if cls._service_manager.has_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client):
                cls._service_manager.stop_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
                cls._service_manager.remove_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
        except Exception:
            cls._logger.exception('StorageDriver {0} - {1} service deletion failed'.format(storagedriver.guid, ServiceFactory.SERVICE_WATCHER_VOLDRV))
def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed.
    This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...

    :return: None
    """
    MigrationController._logger.info('Preparing out of band migrations...')

    # Imports are deferred so the module can be loaded without pulling in the full DAL/extension stack
    from ovs.dal.lists.storagedriverlist import StorageDriverList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.vpoollist import VPoolList
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient
    from ovs_extensions.generic.toolbox import ExtensionsToolbox
    from ovs_extensions.services.interfaces.systemd import Systemd
    from ovs.extensions.services.servicefactory import ServiceFactory
    from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
    from ovs.lib.generic import GenericController

    MigrationController._logger.info('Start out of band migrations...')
    service_manager = ServiceFactory.get_manager()

    # Build one root SSH client per StorageRouter, reused by all migration steps below
    sr_client_map = {}
    for storagerouter in StorageRouterList.get_storagerouters():
        sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter,
                                                      username='******')

    #########################################################
    # Addition of 'ExecReload' for AlbaProxy SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagedriver in StorageDriverList.get_storagedrivers():
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                service = alba_proxy.service
                service_name = 'ovs-{0}'.format(service.name)
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                # Skip services whose unit file already carries an ExecReload directive
                if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    ##################################################################
    # Adjustment of open file descriptors for Arakoon services to 8192
    changed_clients = set()
    for storagerouter in StorageRouterList.get_storagerouters():
        root_client = sr_client_map[storagerouter.guid]
        for service_name in service_manager.list_services(client=root_client):
            if not service_name.startswith('ovs-arakoon-'):
                continue
            # Unit file location and limit syntax differ between systemd and upstart
            if ServiceFactory.get_service_type() == 'systemd':
                path = '/lib/systemd/system/{0}.service'.format(service_name)
                check = 'LimitNOFILE=8192'
            else:
                path = '/etc/init/{0}.conf'.format(service_name)
                check = 'limit nofile 8192 8192'
            if not root_client.file_exists(path):
                continue
            if check in root_client.file_read(path):
                continue
            try:
                service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                changed_clients.add(root_client)
                ExtensionsToolbox.edit_version_file(client=root_client, package_name='arakoon', old_service_name=service_name)
            except:
                MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
    for root_client in changed_clients:
        root_client.run(['systemctl', 'daemon-reload'])

    #############################
    # Migrate to multiple proxies
    for storagedriver in StorageDriverList.get_storagedrivers():
        vpool = storagedriver.vpool
        root_client = sr_client_map[storagedriver.storagerouter_guid]
        for alba_proxy in storagedriver.alba_proxies:
            # Rename alba_proxy service in model: 'albaproxy_<vpool>' becomes 'albaproxy_<vpool>_0'
            service = alba_proxy.service
            old_service_name = 'albaproxy_{0}'.format(vpool.name)
            new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
            if old_service_name != service.name:
                continue
            service.name = new_service_name
            service.save()
            if not service_manager.has_service(name=old_service_name, client=root_client):
                continue
            old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
            if not Configuration.exists(key=old_configuration_key):
                continue
            # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='alba',
                                                old_service_name=old_service_name,
                                                new_service_name=new_service_name)
            # Register new service and remove old service
            # NOTE(review): only the add_service call is visible here although the comment
            # mentions removal of the old service - confirm against upstream history
            service_manager.add_service(name='ovs-albaproxy',
                                        client=root_client,
                                        params=Configuration.get(old_configuration_key),
                                        target_name='ovs-{0}'.format(new_service_name))
            # Update scrub proxy config
            proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
            proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
            if proxy_config is not None:
                fragment_cache = proxy_config.get('fragment_cache', ['none', {}])
                if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                    fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                    fragment_cache_scrub_info[1]['cache_on_read'] = False
                    proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                    proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                    if proxy_scrub_config is not None and proxy_scrub_config['fragment_cache'] == ['none']:
                        proxy_scrub_config['fragment_cache'] = fragment_cache_scrub_info
                        Configuration.set(proxy_scrub_config_key,
                                          json.dumps(proxy_scrub_config, indent=4),
                                          raw=True)

        # Update 'backend_connection_manager' section
        changes = False
        storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.guid, storagedriver.storagedriver_id)
        storagedriver_config.load()
        if 'backend_connection_manager' not in storagedriver_config.configuration:
            continue

        current_config = storagedriver_config.configuration['backend_connection_manager']
        if current_config.get('backend_type') != 'MULTI':
            # Convert a single-proxy config into a MULTI config with one numbered sub-section per proxy
            changes = True
            backend_connection_manager = {'backend_type': 'MULTI'}
            for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                # Hoist all 'backend_interface*' keys out of the numbered section to the top level
                # noinspection PyUnresolvedReferences
                for key, value in backend_connection_manager[str(index)].items():
                    if key.startswith('backend_interface'):
                        backend_connection_manager[key] = value
                        # noinspection PyUnresolvedReferences
                        del backend_connection_manager[str(index)][key]
            # Py2-only iteritems(); fill in retry defaults that are missing
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    backend_connection_manager[key] = value
        else:
            # Already MULTI: only hoist 'backend_interface*' keys and fill in missing retry defaults
            backend_connection_manager = current_config
            for value in backend_connection_manager.values():
                if isinstance(value, dict):
                    for key, val in value.items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = val
                            changes = True
                            del value[key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    changes = True
                    backend_connection_manager[key] = value

        if changes is True:
            storagedriver_config.clear_backend_connection_manager()
            storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
            storagedriver_config.save(root_client)
            # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='volumedriver',
                                                old_service_name='volumedriver_{0}'.format(vpool.name))
            if service_manager.ImplementationClass == Systemd:
                root_client.run(['systemctl', 'daemon-reload'])

    ########################################
    # Update metadata_store_bits information
    for vpool in VPoolList.get_vpools():
        bits = None
        for storagedriver in vpool.storagedrivers:
            key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
            if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                # Extract the bits value once (from the first service file encountered) and reuse for all hosts
                if bits is None:
                    entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                        client=sr_client_map[storagedriver.storagerouter_guid],
                                                                        entries=['METADATASTORE_BITS='])
                    if len(entries) == 1:
                        bits = entries[0].split('=')[-1]
                        bits = int(bits) if bits.isdigit() else 5  # 5 is the fallback when the entry is not numeric
                if bits is not None:
                    try:
                        content = Configuration.get(key=key)
                        content['METADATASTORE_BITS'] = bits
                        Configuration.set(key=key, value=content)
                    except:
                        MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))
        if bits is not None:
            vpool.metadata_store_bits = bits
            vpool.save()

    MigrationController._logger.info('Finished out of band migrations')
    GenericController.refresh_package_information()
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info

    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information:
                       `scrub_path` with the path where to scrub
                       `storage_router` with the StorageRouter that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled
    :type error_messages: list
    :return: a list of error messages
    :rtype: list
    """
    def _verify_mds_config(current_vdisk):
        # Refresh the vDisk's dynamic 'info' property and return its MDS backend config
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60  # Max seconds to wait for the proxy deploy/teardown file mutex
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy (or re-use a running one); serialized cluster-wide via a file mutex
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                # Pick a free port from the host's storagedriver port range for the new proxy
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Scrub config
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}

                # Backend config
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point the backend config at the locally deployed proxy
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        # Best-effort cleanup of a half-deployed proxy
        # NOTE(review): execution continues into the scrub loop below even after a failed deploy;
        # per-vDisk scrubbing will then fail and be reported individually - confirm this is intended
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                vdisk_guid = queue.get(False)  # Non-blocking; raises Empty when the queue is drained
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_directory,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    # Per-vDisk failure: record and carry on with the rest of the queue
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)
    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again, under the same cluster-wide mutex as deployment
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from any version to any version, running all migrations required
    If previous_version is for example 0 and this script is at version 3 it will execute two steps:
        - 1 > 2
        - 2 > 3
    :param previous_version: The previous version from which to start the migration.
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    """
    # NOTE(review): relies on a module-level 'logger' that is not visible in this
    # chunk - confirm it is bound at import time.
    working_version = previous_version

    # Version 1 introduced:
    # - Flexible SSD layout
    if working_version < 1:
        try:
            from ovs.extensions.generic.configuration import Configuration
            if Configuration.exists('ovs.arakoon'):
                Configuration.delete('ovs.arakoon', remove_root=True)
            Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        except:
            logger.exception('Error migrating to version 1')
        # The version is bumped even when the step failed: the error is logged
        # and the migration deliberately continues with the next step.
        working_version = 1

    # Version 2 introduced:
    # - Registration
    if working_version < 2:
        try:
            import time
            from ovs.extensions.generic.configuration import Configuration
            if not Configuration.exists('ovs.core.registered'):
                Configuration.set('ovs.core.registered', False)
                Configuration.set('ovs.core.install_time', time.time())
        except:
            logger.exception('Error migrating to version 2')
        working_version = 2

    # Version 3 introduced:
    # - New arakoon clients
    if working_version < 3:
        try:
            from ovs.extensions.db.arakoon import ArakoonInstaller
            # Reload so the freshly installed module code is used, then import
            # the ArakoonInstaller class from it.
            reload(ArakoonInstaller)
            from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller
            from ovs.extensions.generic.sshclient import SSHClient
            from ovs.extensions.generic.configuration import Configuration
            if master_ips is not None:
                for ip in master_ips:
                    client = SSHClient(ip)
                    if client.dir_exists(ArakoonInstaller.ARAKOON_CONFIG_DIR):
                        for cluster_name in client.dir_list(ArakoonInstaller.ARAKOON_CONFIG_DIR):
                            try:
                                ArakoonInstaller.deploy_cluster(cluster_name, ip)
                            except:
                                # Best-effort: a cluster that fails to deploy is skipped silently
                                pass
            if Configuration.exists('ovs.core.storage.persistent'):
                Configuration.set('ovs.core.storage.persistent', 'pyrakoon')
        except:
            logger.exception('Error migrating to version 3')
        working_version = 3

    # Version 4 introduced:
    # - Etcd
    if working_version < 4:
        try:
            import os
            import json
            from ConfigParser import RawConfigParser
            from ovs.extensions.db.etcd import installer
            reload(installer)  # make sure the freshly installed installer code is used
            from ovs.extensions.db.etcd.installer import EtcdInstaller
            from ovs.extensions.db.etcd.configuration import EtcdConfiguration
            from ovs.extensions.generic.system import System
            host_id = System.get_my_machine_id()
            etcd_migrate = False
            if EtcdInstaller.has_cluster('127.0.0.1', 'config'):
                # A local etcd 'config' cluster already exists: only migrate data
                etcd_migrate = True
            else:
                if master_ips is not None and extra_ips is not None:
                    # Look for an existing 'config' cluster on any known node
                    cluster_ip = None
                    for ip in master_ips + extra_ips:
                        if EtcdInstaller.has_cluster(ip, 'config'):
                            cluster_ip = ip
                            break
                    node_ip = None
                    path = '/opt/OpenvStorage/config/ovs.json'
                    if os.path.exists(path):
                        with open(path) as config_file:
                            config = json.load(config_file)
                            node_ip = config['grid']['ip']
                    if node_ip is not None:
                        if cluster_ip is None:
                            # No cluster anywhere yet: bootstrap one on this node
                            EtcdInstaller.create_cluster('config', node_ip)
                            EtcdConfiguration.initialize()
                            EtcdConfiguration.initialize_host(host_id)
                        else:
                            # Join the existing cluster
                            EtcdInstaller.extend_cluster(cluster_ip, node_ip, 'config')
                            EtcdConfiguration.initialize_host(host_id)
                        etcd_migrate = True
            if etcd_migrate is True:
                # Migrating configuration files
                path = '/opt/OpenvStorage/config/ovs.json'
                if os.path.exists(path):
                    with open(path) as config_file:
                        config = json.load(config_file)
                        EtcdConfiguration.set('/ovs/framework/cluster_id', config['support']['cid'])
                        if not EtcdConfiguration.exists('/ovs/framework/install_time'):
                            EtcdConfiguration.set('/ovs/framework/install_time', config['core']['install_time'])
                        else:
                            # Keep the oldest known install time
                            EtcdConfiguration.set('/ovs/framework/install_time', min(EtcdConfiguration.get('/ovs/framework/install_time'), config['core']['install_time']))
                        EtcdConfiguration.set('/ovs/framework/registered', config['core']['registered'])
                        EtcdConfiguration.set('/ovs/framework/plugins/installed', config['plugins'])
                        EtcdConfiguration.set('/ovs/framework/stores', config['core']['storage'])
                        EtcdConfiguration.set('/ovs/framework/paths', {'cfgdir': config['core']['cfgdir'], 'basedir': config['core']['basedir'], 'ovsdb': config['core']['ovsdb']})
                        EtcdConfiguration.set('/ovs/framework/support', {'enablesupport': config['support']['enablesupport'], 'enabled': config['support']['enabled'], 'interval': config['support']['interval']})
                        EtcdConfiguration.set('/ovs/framework/storagedriver', {'mds_safety': config['storagedriver']['mds']['safety'], 'mds_tlogs': config['storagedriver']['mds']['tlogs'], 'mds_maxload': config['storagedriver']['mds']['maxload']})
                        EtcdConfiguration.set('/ovs/framework/webapps', {'html_endpoint': config['webapps']['html_endpoint'], 'oauth2': config['webapps']['oauth2']})
                        EtcdConfiguration.set('/ovs/framework/messagequeue', {'endpoints': [], 'protocol': config['core']['broker']['protocol'], 'user': config['core']['broker']['login'], 'port': config['core']['broker']['port'], 'password': config['core']['broker']['password'], 'queues': config['core']['broker']['queues']})
                        # '{{0}}' leaves a literal '{0}' placeholder so the
                        # host_key.format('/...') calls below can fill in the sub-key
                        host_key = '/ovs/framework/hosts/{0}{{0}}'.format(host_id)
                        EtcdConfiguration.set(host_key.format('/storagedriver'), {'rsp': config['storagedriver']['rsp'], 'vmware_mode': config['storagedriver']['vmware_mode']})
                        EtcdConfiguration.set(host_key.format('/ports'), config['ports'])
                        EtcdConfiguration.set(host_key.format('/setupcompleted'), config['core']['setupcompleted'])
                        EtcdConfiguration.set(host_key.format('/versions'), config['core'].get('versions', {}))
                        EtcdConfiguration.set(host_key.format('/type'), config['core']['nodetype'])
                        EtcdConfiguration.set(host_key.format('/ip'), config['grid']['ip'])
                # Migrate the memcache client config file into etcd, then remove it
                path = '{0}/memcacheclient.cfg'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir'))
                if os.path.exists(path):
                    config = RawConfigParser()
                    config.read(path)
                    nodes = [config.get(node.strip(), 'location').strip() for node in config.get('main', 'nodes').split(',')]
                    EtcdConfiguration.set('/ovs/framework/memcache|endpoints', nodes)
                    os.remove(path)
                # Same for the rabbitmq client config file
                path = '{0}/rabbitmqclient.cfg'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir'))
                if os.path.exists(path):
                    config = RawConfigParser()
                    config.read(path)
                    nodes = [config.get(node.strip(), 'location').strip() for node in config.get('main', 'nodes').split(',')]
                    EtcdConfiguration.set('/ovs/framework/messagequeue|endpoints', nodes)
                    os.remove(path)
                # Migrate arakoon configuration files
                from ovs.extensions.db.arakoon import ArakoonInstaller
                reload(ArakoonInstaller)
                from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller, ArakoonClusterConfig
                from ovs.extensions.generic.sshclient import SSHClient
                if master_ips is not None:
                    config_dir = '/opt/OpenvStorage/config/arakoon/'
                    for ip in master_ips:
                        client = SSHClient(ip)
                        if client.dir_exists(config_dir):
                            for cluster_name in client.dir_list(config_dir):
                                try:
                                    with open('{0}/{1}/{1}.cfg'.format(config_dir, cluster_name)) as config_file:
                                        EtcdConfiguration.set(ArakoonClusterConfig.ETCD_CONFIG_KEY.format(cluster_name), config_file.read(), raw=True)
                                    ArakoonInstaller.deploy_cluster(cluster_name, ip)
                                except:
                                    logger.exception('Error migrating {0} on {1}'.format(cluster_name, ip))
                            client.dir_delete(config_dir)
        except:
            logger.exception('Error migrating to version 4')
        working_version = 4

    return working_version
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from any version to any version, running all migrations required
    If previous_version is for example 0 and this script is at version 3 it will execute two steps:
        - 1 > 2
        - 2 > 3
    :param previous_version: The previous version from which to start the migration.
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    """
    # NOTE(review): LogHandler is expected to be imported at module level - not
    # visible in this chunk.
    logger = LogHandler.get('extensions', name='migration')
    working_version = previous_version

    # Version 1 introduced:
    # - Flexible SSD layout
    if working_version < 1:
        try:
            from ovs.extensions.generic.configuration import Configuration
            if Configuration.exists('ovs.arakoon'):
                Configuration.delete('ovs.arakoon', remove_root=True)
            Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        except:
            logger.exception('Error migrating to version 1')
        # The version is bumped even when the step failed: errors are logged
        # and the migration deliberately continues.
        working_version = 1

    # Version 2 introduced:
    # - Registration
    if working_version < 2:
        try:
            import time
            from ovs.extensions.generic.configuration import Configuration
            if not Configuration.exists('ovs.core.registered'):
                Configuration.set('ovs.core.registered', False)
                Configuration.set('ovs.core.install_time', time.time())
        except:
            logger.exception('Error migrating to version 2')
        working_version = 2

    # Version 3 introduced:
    # - New arakoon clients
    if working_version < 3:
        try:
            from ovs.extensions.db.arakoon import ArakoonInstaller
            # Reload so the freshly installed module code is used, then import
            # the ArakoonInstaller class from it.
            reload(ArakoonInstaller)
            from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller
            from ovs.extensions.generic.sshclient import SSHClient
            from ovs.extensions.generic.configuration import Configuration
            if master_ips is not None:
                for ip in master_ips:
                    client = SSHClient(ip)
                    if client.dir_exists(ArakoonInstaller.ARAKOON_CONFIG_DIR):
                        for cluster_name in client.dir_list(ArakoonInstaller.ARAKOON_CONFIG_DIR):
                            try:
                                ArakoonInstaller.deploy_cluster(cluster_name, ip)
                            except:
                                # Best-effort: a cluster that fails to deploy is skipped silently
                                pass
            if Configuration.exists('ovs.core.storage.persistent'):
                Configuration.set('ovs.core.storage.persistent', 'pyrakoon')
        except:
            logger.exception('Error migrating to version 3')
        working_version = 3

    # Version 4 introduced:
    # - Etcd
    if working_version < 4:
        try:
            import os
            import json
            from ConfigParser import RawConfigParser
            from ovs.extensions.db.etcd import installer
            reload(installer)  # make sure the freshly installed installer code is used
            from ovs.extensions.db.etcd.installer import EtcdInstaller
            from ovs.extensions.db.etcd.configuration import EtcdConfiguration
            from ovs.extensions.generic.system import System
            host_id = System.get_my_machine_id()
            etcd_migrate = False
            if EtcdInstaller.has_cluster('127.0.0.1', 'config'):
                # A local etcd 'config' cluster already exists: only migrate data
                etcd_migrate = True
            else:
                if master_ips is not None and extra_ips is not None:
                    # Look for an existing 'config' cluster on any known node
                    cluster_ip = None
                    for ip in master_ips + extra_ips:
                        if EtcdInstaller.has_cluster(ip, 'config'):
                            cluster_ip = ip
                            break
                    node_ip = None
                    path = '/opt/OpenvStorage/config/ovs.json'
                    if os.path.exists(path):
                        with open(path) as config_file:
                            config = json.load(config_file)
                            node_ip = config['grid']['ip']
                    if node_ip is not None:
                        if cluster_ip is None:
                            # No cluster anywhere yet: bootstrap one on this node
                            EtcdInstaller.create_cluster('config', node_ip)
                            EtcdConfiguration.initialize()
                            EtcdConfiguration.initialize_host(host_id)
                        else:
                            # Join the existing cluster
                            EtcdInstaller.extend_cluster(cluster_ip, node_ip, 'config')
                            EtcdConfiguration.initialize_host(host_id)
                        etcd_migrate = True
            if etcd_migrate is True:
                # Migrating configuration files
                path = '/opt/OpenvStorage/config/ovs.json'
                if os.path.exists(path):
                    with open(path) as config_file:
                        config = json.load(config_file)
                        EtcdConfiguration.set('/ovs/framework/cluster_id', config['support']['cid'])
                        if not EtcdConfiguration.exists('/ovs/framework/install_time'):
                            EtcdConfiguration.set('/ovs/framework/install_time', config['core']['install_time'])
                        else:
                            # Keep the oldest known install time
                            EtcdConfiguration.set('/ovs/framework/install_time', min(EtcdConfiguration.get('/ovs/framework/install_time'), config['core']['install_time']))
                        EtcdConfiguration.set('/ovs/framework/registered', config['core']['registered'])
                        EtcdConfiguration.set('/ovs/framework/plugins/installed', config['plugins'])
                        EtcdConfiguration.set('/ovs/framework/stores', config['core']['storage'])
                        EtcdConfiguration.set('/ovs/framework/paths', {'cfgdir': config['core']['cfgdir'], 'basedir': config['core']['basedir'], 'ovsdb': config['core']['ovsdb']})
                        EtcdConfiguration.set('/ovs/framework/support', {'enablesupport': config['support']['enablesupport'], 'enabled': config['support']['enabled'], 'interval': config['support']['interval']})
                        EtcdConfiguration.set('/ovs/framework/storagedriver', {'mds_safety': config['storagedriver']['mds']['safety'], 'mds_tlogs': config['storagedriver']['mds']['tlogs'], 'mds_maxload': config['storagedriver']['mds']['maxload']})
                        EtcdConfiguration.set('/ovs/framework/webapps', {'html_endpoint': config['webapps']['html_endpoint'], 'oauth2': config['webapps']['oauth2']})
                        EtcdConfiguration.set('/ovs/framework/messagequeue', {'endpoints': [], 'protocol': config['core']['broker']['protocol'], 'user': config['core']['broker']['login'], 'port': config['core']['broker']['port'], 'password': config['core']['broker']['password'], 'queues': config['core']['broker']['queues']})
                        # '{{0}}' leaves a literal '{0}' placeholder so the
                        # host_key.format('/...') calls below can fill in the sub-key
                        host_key = '/ovs/framework/hosts/{0}{{0}}'.format(host_id)
                        EtcdConfiguration.set(host_key.format('/storagedriver'), {'rsp': config['storagedriver']['rsp'], 'vmware_mode': config['storagedriver']['vmware_mode']})
                        EtcdConfiguration.set(host_key.format('/ports'), config['ports'])
                        EtcdConfiguration.set(host_key.format('/setupcompleted'), config['core']['setupcompleted'])
                        EtcdConfiguration.set(host_key.format('/versions'), config['core'].get('versions', {}))
                        EtcdConfiguration.set(host_key.format('/type'), config['core']['nodetype'])
                        EtcdConfiguration.set(host_key.format('/ip'), config['grid']['ip'])
                # Migrate the memcache client config file into etcd, then remove it
                path = '{0}/memcacheclient.cfg'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir'))
                if os.path.exists(path):
                    config = RawConfigParser()
                    config.read(path)
                    nodes = [config.get(node.strip(), 'location').strip() for node in config.get('main', 'nodes').split(',')]
                    EtcdConfiguration.set('/ovs/framework/memcache|endpoints', nodes)
                    os.remove(path)
                # Same for the rabbitmq client config file
                path = '{0}/rabbitmqclient.cfg'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir'))
                if os.path.exists(path):
                    config = RawConfigParser()
                    config.read(path)
                    nodes = [config.get(node.strip(), 'location').strip() for node in config.get('main', 'nodes').split(',')]
                    EtcdConfiguration.set('/ovs/framework/messagequeue|endpoints', nodes)
                    os.remove(path)
                # Migrate arakoon configuration files
                from ovs.extensions.db.arakoon import ArakoonInstaller
                reload(ArakoonInstaller)
                from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller, ArakoonClusterConfig
                from ovs.extensions.generic.sshclient import SSHClient
                if master_ips is not None:
                    config_dir = '/opt/OpenvStorage/config/arakoon/'
                    for ip in master_ips:
                        client = SSHClient(ip)
                        if client.dir_exists(config_dir):
                            for cluster_name in client.dir_list(config_dir):
                                try:
                                    with open('{0}/{1}/{1}.cfg'.format(config_dir, cluster_name)) as config_file:
                                        EtcdConfiguration.set(ArakoonClusterConfig.ETCD_CONFIG_KEY.format(cluster_name), config_file.read(), raw=True)
                                    ArakoonInstaller.deploy_cluster(cluster_name, ip)
                                except:
                                    logger.exception('Error migrating {0} on {1}'.format(cluster_name, ip))
                            client.dir_delete(config_dir)
        except:
            logger.exception('Error migrating to version 4')
        working_version = 4

    return working_version
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter
                       that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled
    :type error_messages: list
    :return: a list of error messages
    :rtype: list
    """
    def _verify_mds_config(current_vdisk):
        # Re-fetch the vDisk's dynamic info and return its MDS backend config;
        # raises when the configuration cannot be loaded.
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60  # max seconds to wait for the proxy deploy/remove lock
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                # A running proxy from a previous run can be re-used as-is
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Scrub config
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}
                # Backend config
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                # Start from the vPool's generic scrub config and pin a free local port
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point the backend config at the local scrub proxy just deployed
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        # Best-effort rollback of a partially deployed proxy
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                vdisk_guid = queue.get(False)  # non-blocking; raises Empty when drained
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            # Handover did not make the master local: skip this vDisk
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_directory,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    # Per-vDisk failures are collected and the queue processing continues
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)
    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from any version to any version, running all migrations required
    If previous_version is for example 0 and this script is at version 3 it will execute two steps:
        - 1 > 2
        - 2 > 3
    :param previous_version: The previous version from which to start the migration.
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    """
    working_version = previous_version

    # Version 1 introduced:
    # - Flexible SSD layout
    if working_version < 1:
        from ovs.extensions.generic.configuration import Configuration
        if Configuration.exists('ovs.arakoon'):
            Configuration.delete('ovs.arakoon', remove_root=True)
        Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        working_version = 1

    # Version 2 introduced:
    # - Registration
    if working_version < 2:
        import time
        from ovs.extensions.generic.configuration import Configuration
        if not Configuration.exists('ovs.core.registered'):
            Configuration.set('ovs.core.registered', False)
            Configuration.set('ovs.core.install_time', time.time())
        working_version = 2

    # Version 3 introduced:
    # - New arakoon clients
    if working_version < 3:
        # NOTE(review): unlike the sibling migrate() variants, this one imports the
        # ArakoonInstaller *module* (twice - the second import is redundant after
        # reload) and therefore accesses the class as ArakoonInstaller.ArakoonInstaller.
        from ovs.extensions.db.arakoon import ArakoonInstaller
        reload(ArakoonInstaller)
        from ovs.extensions.db.arakoon import ArakoonInstaller
        from ovs.extensions.generic.sshclient import SSHClient
        from ovs.extensions.generic.configuration import Configuration
        if master_ips is not None:
            for ip in master_ips:
                client = SSHClient(ip)
                if client.dir_exists(ArakoonInstaller.ArakoonInstaller.ARAKOON_CONFIG_DIR):
                    for cluster_name in client.dir_list(ArakoonInstaller.ArakoonInstaller.ARAKOON_CONFIG_DIR):
                        try:
                            ArakoonInstaller.ArakoonInstaller.deploy_cluster(cluster_name, ip)
                        except:
                            # Best-effort: a cluster that fails to deploy is skipped silently
                            pass
        if Configuration.exists('ovs.core.storage.persistent'):
            Configuration.set('ovs.core.storage.persistent', 'pyrakoon')
        working_version = 3

    return working_version
def execute_update(components):
    """
    Update the specified components on all StorageRouters
    This is called upon by 'at'
    :param components: Components to update (e.g. 'framework', 'storagedriver')
    :type components: list
    :return: None
    """
    filemutex = file_mutex('system_update', wait=2)
    ssh_clients = []
    services_stop_start = set()
    try:
        filemutex.acquire()
        UpdateController._logger.debug('+++ Starting update +++')

        from ovs.dal.lists.storagerouterlist import StorageRouterList

        # Create SSHClients to all nodes
        UpdateController._logger.debug('Generating SSH client connections for each storage router')
        storage_routers = StorageRouterList.get_storagerouters()
        master_ips = []
        extra_ips = []
        for sr in storage_routers:
            try:
                ssh_clients.append(SSHClient(sr.ip, username='******'))
                if sr.node_type == 'MASTER':
                    master_ips.append(sr.ip)
                elif sr.node_type == 'EXTRA':
                    extra_ips.append(sr.ip)
            except UnableToConnectException:
                raise Exception('Update is only allowed on systems where all nodes are online and fully functional')

        # Create locks
        for client in ssh_clients:
            UpdateController._logger.debug('{0}: Creating lock files'.format(client.ip))
            client.run(['touch', UpdateController._update_file])  # Prevents manual install or update individual packages
            client.run(['touch', UpdateController._update_ongoing_file])

        # Check requirements
        packages_to_update = {}
        services_post_update = set()
        update_information = UpdateController.get_update_information_all()
        for component, component_info in update_information.iteritems():
            if component in components:
                UpdateController._logger.debug('Verifying update information for component: {0}'.format(component.upper()))
                Toolbox.verify_required_params(actual_params=component_info,
                                               required_params={'downtime': (list, None),
                                                                'packages': (dict, None),
                                                                'prerequisites': (list, None),
                                                                'services_stop_start': (set, None),
                                                                'services_post_update': (set, None)})
                if len(component_info['prerequisites']) > 0:
                    raise Exception('Update is only allowed when all prerequisites have been met')

                packages_to_update.update(component_info['packages'])
                services_stop_start.update(component_info['services_stop_start'])
                services_post_update.update(component_info['services_post_update'])
        if len(packages_to_update) > 0:
            UpdateController._logger.debug('Packages to be updated: {0}'.format(', '.join(sorted(packages_to_update.keys()))))
        if len(services_stop_start) > 0:
            UpdateController._logger.debug('Services to stop before package update: {0}'.format(', '.join(sorted(services_stop_start))))
        if len(services_post_update) > 0:
            UpdateController._logger.debug('Services which will be restarted after update: {0}'.format(', '.join(sorted(services_post_update))))

        # Stop services
        if UpdateController.change_services_state(services=services_stop_start, ssh_clients=ssh_clients, action='stop') is False:
            raise Exception('Stopping all services on every node failed, cannot continue')

        # Install packages
        # First install packages on all StorageRouters individually
        if packages_to_update:
            failures = False
            for client in ssh_clients:
                UpdateController._logger.debug('{0}: Installing packages'.format(client.ip))
                for function in Toolbox.fetch_hooks('update', 'package_install_multi'):
                    try:
                        function(client=client, package_info=packages_to_update, components=components)
                    except Exception as ex:
                        UpdateController._logger.error('{0}: Package installation hook {1} failed with error: {2}'.format(client.ip, function.__name__, ex))
                        failures = True

            if set(components).difference({'framework', 'storagedriver'}):
                # Second install packages on all ALBA nodes
                for function in Toolbox.fetch_hooks('update', 'package_install_single'):
                    try:
                        function(package_info=packages_to_update, components=components)
                    except Exception as ex:
                        UpdateController._logger.exception('Package installation hook {0} failed with error: {1}'.format(function.__name__, ex))
                        failures = True

            if failures is True:
                raise Exception('Installing the packages failed on 1 or more nodes')

        # Remove update file
        for client in ssh_clients:
            client.file_delete(UpdateController._update_file)

        # Migrate code
        if 'framework' in components:
            failures = []
            for client in ssh_clients:
                UpdateController._logger.debug('{0}: Verifying extensions code migration is required'.format(client.ip))
                try:
                    key = '/ovs/framework/hosts/{0}/versions'.format(System.get_my_machine_id(client=client))
                    old_versions = Configuration.get(key) if Configuration.exists(key) else {}
                    try:
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    except EOFError as eof:
                        # The remote connection can drop once during migration; retry a single time
                        UpdateController._logger.warning('{0}: EOFError during code migration, retrying {1}'.format(client.ip, eof))
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    new_versions = Configuration.get(key) if Configuration.exists(key) else {}
                    if old_versions != new_versions:
                        UpdateController._logger.debug('{0}: Finished extensions code migration. Old versions: {1} --> New versions: {2}'.format(client.ip, old_versions, new_versions))
                except Exception as ex:
                    failures.append('{0}: {1}'.format(client.ip, str(ex)))
            if len(failures) > 0:
                raise Exception('Failed to run the extensions migrate code on all nodes. Errors found:\n\n{0}'.format('\n\n'.join(failures)))

        # Start memcached
        if 'memcached' in services_stop_start:
            services_stop_start.remove('memcached')
            UpdateController._logger.debug('Starting memcached')
            UpdateController.change_services_state(services=['memcached'], ssh_clients=ssh_clients, action='start')

        # Migrate model
        if 'framework' in components:
            UpdateController._logger.debug('Verifying DAL code migration is required')
            old_versions = PersistentFactory.get_client().get('ovs_model_version') if PersistentFactory.get_client().exists('ovs_model_version') else {}

            from ovs.dal.helpers import Migration
            with remote(ssh_clients[0].ip, [Migration]) as rem:
                rem.Migration.migrate()

            new_versions = PersistentFactory.get_client().get('ovs_model_version') if PersistentFactory.get_client().exists('ovs_model_version') else {}
            if old_versions != new_versions:
                UpdateController._logger.debug('Finished DAL code migration. Old versions: {0} --> New versions: {1}'.format(old_versions, new_versions))

        # Post update actions
        for client in ssh_clients:
            UpdateController._logger.debug('{0}: Executing post-update actions'.format(client.ip))
            for function in Toolbox.fetch_hooks('update', 'post_update_multi'):
                try:
                    function(client=client, components=components)
                except Exception as ex:
                    UpdateController._logger.exception('{0}: Post update hook {1} failed with error: {2}'.format(client.ip, function.__name__, ex))

        for function in Toolbox.fetch_hooks('update', 'post_update_single'):
            try:
                function(components=components)
            except Exception as ex:
                UpdateController._logger.exception('Post update hook {0} failed with error: {1}'.format(function.__name__, ex))

        # Start services
        UpdateController.change_services_state(services=services_stop_start, ssh_clients=ssh_clients, action='start')

        UpdateController._refresh_package_information()
        UpdateController._logger.debug('+++ Finished updating +++')
    except NoLockAvailableException:
        UpdateController._logger.debug('Another update is currently in progress!')
    except Exception as ex:
        UpdateController._logger.exception('Error during update: {0}'.format(ex))
        if len(ssh_clients) > 0:
            # Best-effort recovery: restart the stopped services before bailing out
            UpdateController.change_services_state(services=services_stop_start, ssh_clients=ssh_clients, action='start')
            UpdateController._refresh_package_information()
            UpdateController._logger.error('Failed to update. Please check all the logs for more information')
    finally:
        filemutex.release()
        # Always clean up the lock files, even when the update failed
        for ssh_client in ssh_clients:
            for file_name in [UpdateController._update_file, UpdateController._update_ongoing_file]:
                try:
                    if ssh_client.file_exists(file_name):
                        ssh_client.file_delete(file_name)
                except Exception:  # was a bare 'except:' which would also swallow SystemExit/KeyboardInterrupt
                    # Fixed format string: original read '[0}' so the client IP was never interpolated
                    UpdateController._logger.warning('{0}: Failed to remove lock file {1}'.format(ssh_client.ip, file_name))
def cluster_registry_checkup():
    """
    Verify whether changes have occurred in the cluster registry for each vPool
    and, when they have, push the refreshed node configs and ask each reachable
    StorageDriver to reload its configuration.
    :return: Per-vPool guid: {'changes': bool, 'success': bool, 'error': str (only on failure)}
    :rtype: dict
    """
    changed_vpools = {}
    for vpool in VPoolList.get_vpools():
        # Optimistic defaults; flipped below when differences or errors are found
        changed_vpools[vpool.guid] = {'changes': False, 'success': True}
        try:
            StorageDriverController._logger.info('Validating cluster registry settings for Vpool {0}'.format(vpool.guid))

            # An empty registry always counts as "changed" so it gets populated
            current_configs = vpool.clusterregistry_client.get_node_configs()
            changes = len(current_configs) == 0
            node_configs = []
            for sd in vpool.storagedrivers:
                # Force re-evaluation so we compare against the freshly computed config
                sd.invalidate_dynamics(['cluster_node_config'])
                new_config = sd.cluster_node_config
                node_configs.append(ClusterNodeConfig(**new_config))
                if changes is False:
                    # Compare the registry entry for this StorageDriver field by field;
                    # a single differing key marks the whole vPool as changed
                    current_node_configs = [config for config in current_configs if config.vrouter_id == sd.storagedriver_id]
                    if len(current_node_configs) == 1:
                        current_node_config = current_node_configs[0]
                        for key in new_config:
                            if getattr(current_node_config, key) != new_config[key]:
                                changes = True
                                break
            changed_vpools[vpool.guid]['changes'] = changes

            if changes is True:
                StorageDriverController._logger.info('Cluster registry settings for Vpool {0} needs to be updated'.format(vpool.guid))
                # Determine which StorageDrivers are actually reachable and responsive,
                # so the config-reload trigger below is only sent to live volumedrivers
                available_storagedrivers = []
                for sd in vpool.storagedrivers:
                    storagerouter = sd.storagerouter
                    try:
                        SSHClient(storagerouter, username='******')  # Connectivity probe only; client instance is discarded
                    except UnableToConnectException:
                        StorageDriverController._logger.warning('StorageRouter {0} not available.'.format(storagerouter.name))
                        continue

                    with remote(storagerouter.ip, [LocalStorageRouterClient]) as rem:
                        sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, sd.storagedriver_id)
                        if Configuration.exists(sd_key) is True:
                            path = Configuration.get_configuration_path(sd_key)
                            try:
                                lsrc = rem.LocalStorageRouterClient(path)
                                lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                                available_storagedrivers.append(sd)
                            except Exception as ex:
                                if 'ClusterNotReachableException' in str(ex):
                                    StorageDriverController._logger.warning('StorageDriver {0} on StorageRouter {1} not available.'.format(sd.guid, storagerouter.name))
                                else:
                                    StorageDriverController._logger.exception('Got exception when validating StorageDriver {0} on StorageRouter {1}.'.format(sd.guid, storagerouter.name))

                # Push the full (rebuilt) node config list, then trigger a reload
                # on each responsive StorageDriver
                StorageDriverController._logger.info('Updating cluster node configs for VPool {0}'.format(vpool.guid))
                vpool.clusterregistry_client.set_node_configs(node_configs)
                for sd in available_storagedrivers:
                    StorageDriverController._logger.info('Trigger config reload for StorageDriver {0}'.format(sd.guid))
                    vpool.storagedriver_client.update_cluster_node_configs(str(sd.storagedriver_id), req_timeout_secs=10)
                StorageDriverController._logger.info('Updating cluster node configs for Vpool {0} completed'.format(vpool.guid))
            else:
                StorageDriverController._logger.info('Cluster registry settings for Vpool {0} is up to date'.format(vpool.guid))
        except Exception as ex:
            # A failure for one vPool is recorded and must not block checkup of the others
            StorageDriverController._logger.exception('Got exception when validating cluster registry settings for Vpool {0}.'.format(vpool.name))
            changed_vpools[vpool.guid]['success'] = False
            changed_vpools[vpool.guid]['error'] = ex.message  # Python 2 exception attribute, used file-wide
    return changed_vpools
def stop_services(self):
    """
    Stop and remove all services related to the StorageDriver.
    When this is the last StorageDriver of the vPool, the proxies are (re)started
    once more so the volumedriver filesystem can be destroyed and the node configs
    erased from the cluster registry, before the proxy services themselves are
    stopped and removed.
    :raises RuntimeError: If no StorageRouterInstaller instance is available
    :return: A boolean indicating whether something went wrong
    :rtype: bool
    """
    if self.sr_installer is None:
        raise RuntimeError('No StorageRouterInstaller instance found')

    root_client = self.sr_installer.root_client
    errors_found = False

    # Stop and remove the volumedriver and DTL services
    for service in [self.sd_service, self.dtl_service]:
        try:
            if self.service_manager.has_service(name=service, client=root_client):
                self._logger.debug('StorageDriver {0} - Stopping service {1}'.format(self.storagedriver.guid, service))
                self.service_manager.stop_service(name=service, client=root_client)
                self._logger.debug('StorageDriver {0} - Removing service {1}'.format(self.storagedriver.guid, service))
                self.service_manager.remove_service(name=service, client=root_client)
        except Exception:
            self._logger.exception('StorageDriver {0} - Disabling/stopping service {1} failed'.format(self.storagedriver.guid, service))
            errors_found = True

    sd_config_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(self.vp_installer.vpool.guid, self.storagedriver.storagedriver_id)
    if self.vp_installer.storagedriver_amount <= 1 and Configuration.exists(sd_config_key):
        # Last StorageDriver of the vPool: the backing filesystem must be destroyed,
        # which requires the ALBA proxies to be up and responsive
        try:
            for proxy in self.storagedriver.alba_proxies:
                if self.service_manager.has_service(name=proxy.service.name, client=root_client):
                    self._logger.debug('StorageDriver {0} - Starting proxy {1}'.format(self.storagedriver.guid, proxy.service.name))
                    self.service_manager.start_service(name=proxy.service.name, client=root_client)
                    # Poll the proxy with an increasing back-off (1s, 2s, ... up to 10 tries)
                    tries = 10
                    running = False
                    port = proxy.service.ports[0]
                    while running is False and tries > 0:
                        self._logger.debug('StorageDriver {0} - Waiting for the proxy {1} to start up'.format(self.storagedriver.guid, proxy.service.name))
                        tries -= 1
                        time.sleep(10 - tries)
                        try:
                            # A successful statistics call proves the proxy is serving requests
                            root_client.run(['alba', 'proxy-statistics', '--host', self.storagedriver.storage_ip, '--port', str(port)])
                            running = True
                        except CalledProcessError as ex:
                            self._logger.error('StorageDriver {0} - Fetching alba proxy-statistics failed with error (but ignoring): {1}'.format(self.storagedriver.guid, ex))
                    if running is False:
                        raise RuntimeError('Alba proxy {0} failed to start'.format(proxy.service.name))
                    # Fixed: placeholder was '{0}' twice, which logged the guid instead of the proxy name
                    self._logger.debug('StorageDriver {0} - Alba proxy {1} running'.format(self.storagedriver.guid, proxy.service.name))

            self._logger.debug('StorageDriver {0} - Destroying filesystem and erasing node configs'.format(self.storagedriver.guid))
            with remote(root_client.ip, [LocalStorageRouterClient], username='******') as rem:
                path = Configuration.get_configuration_path(sd_config_key)
                storagedriver_client = rem.LocalStorageRouterClient(path)
                try:
                    storagedriver_client.destroy_filesystem()
                except RuntimeError as rte:
                    # If backend has already been deleted, we cannot delete the filesystem anymore --> storage leak!!!
                    if 'MasterLookupResult.Error' not in rte.message:
                        raise
            self.vp_installer.vpool.clusterregistry_client.erase_node_configs()
        except RuntimeError:
            self._logger.exception('StorageDriver {0} - Destroying filesystem and erasing node configs failed'.format(self.storagedriver.guid))
            errors_found = True

    # Stop and remove the ALBA proxy services (always, regardless of filesystem destruction)
    for proxy in self.storagedriver.alba_proxies:
        service_name = proxy.service.name
        try:
            if self.service_manager.has_service(name=service_name, client=root_client):
                self._logger.debug('StorageDriver {0} - Stopping service {1}'.format(self.storagedriver.guid, service_name))
                self.service_manager.stop_service(name=service_name, client=root_client)
                self._logger.debug('StorageDriver {0} - Removing service {1}'.format(self.storagedriver.guid, service_name))
                self.service_manager.remove_service(name=service_name, client=root_client)
        except Exception:
            self._logger.exception('StorageDriver {0} - Disabling/stopping service {1} failed'.format(self.storagedriver.guid, service_name))
            errors_found = True
    return errors_found