def _revert_vpool_status(vpool, status=VPool.STATUSES.RUNNING, storagedriver=None, client=None, dirs_created=None):
    """
    Remove the vPool being created or revert the vPool being extended
    :param vpool: vPool to revert
    :param status: Status to put the vPool in
    :param storagedriver: StorageDriver whose model objects (partitions, proxies) should be removed
    :param client: SSHClient used to remove the directories created during the failed action
    :param dirs_created: Directories created during the failed action which should be cleaned up again
    :return: None
    :rtype: NoneType
    """
    vpool.status = status
    vpool.save()
    if status == VPool.STATUSES.RUNNING:
        # Fix: 'dirs_created' defaults to None, so the original 'len(dirs_created) > 0'
        # raised a TypeError when no directories were passed. A truthiness check covers
        # both None and the empty list.
        if dirs_created:
            try:
                client.dir_delete(directories=dirs_created)
            except Exception:
                StorageRouterController._logger.warning('Failed to clean up following directories: {0}'.format(', '.join(dirs_created)))

        # Remove the StorageDriver and all of its related model objects
        if storagedriver is not None:
            for sdp in storagedriver.partitions:
                sdp.delete()
            for proxy in storagedriver.alba_proxies:
                proxy.delete()
            storagedriver.delete()

        # When this was the last StorageDriver, the vPool itself (and its configuration
        # management entries) must be removed too
        if len(vpool.storagedrivers) == 0:
            vpool.delete()
            if Configuration.dir_exists(key='/ovs/vpools/{0}'.format(vpool.guid)):
                Configuration.delete(key='/ovs/vpools/{0}'.format(vpool.guid))
def remove_node(node_guid):
    """
    Removes an ALBA node
    :param node_guid: Guid of the ALBA node to remove
    :type node_guid: str
    :return: None
    """
    alba_node = AlbaNode(node_guid)

    # First remove every OSD and every disk known for this node from the backend/model
    for node_disk in alba_node.disks:
        for osd in node_disk.osds:
            AlbaNodeController.remove_asd(node_guid=osd.alba_disk.alba_node_guid,
                                          asd_id=osd.osd_id,
                                          expected_safety=None)
        AlbaNodeController.remove_disk(node_guid=node_disk.alba_node_guid,
                                       device_alias=node_disk.aliases[0])

    # Best-effort removal of the maintenance services running on the node itself
    try:
        for service_name in alba_node.client.list_maintenance_services():
            alba_node.client.remove_maintenance_service(service_name)
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.exception('Could not connect to node {0} to retrieve the maintenance services'.format(alba_node.guid))
    except InvalidCredentialsError:
        AlbaNodeController._logger.warning('Failed to retrieve the maintenance services for ALBA node {0}'.format(alba_node.node_id))

    # Clean the configuration management entries and remove the node from the model
    config_key = '/ovs/alba/asdnodes/{0}'.format(alba_node.node_id)
    if Configuration.dir_exists(config_key):
        Configuration.delete(config_key)
    alba_node.delete()
def migrate(previous_version):
    """
    Migrates from any version to any version, running all migrations required
    If previous_version is for example 0 and this script is at verison 3 it will execute two steps:
      - 1 > 2
      - 2 > 3
    @param previous_version: The previous version from which to start the migration.
    """
    working_version = previous_version

    if working_version < 1:
        # Version 1 introduced:
        # - Flexible SSD layout
        from ovs.extensions.generic.configuration import Configuration
        if Configuration.exists('ovs.arakoon'):
            Configuration.delete('ovs.arakoon', remove_root=True)
        Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        working_version = 1

    if working_version < 2:
        # Version 2 introduced:
        # - Registration
        import time
        from ovs.extensions.generic.configuration import Configuration
        if not Configuration.exists('ovs.core.registered'):
            Configuration.set('ovs.core.registered', False)
            Configuration.set('ovs.core.install_time', time.time())
        working_version = 2

    return working_version
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from any version to any version, running all migrations required
    If previous_version is for example 0 and this script is at verison 3 it will execute two steps:
      - 1 > 2
      - 2 > 3
    :param previous_version: The previous version from which to start the migration.
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    """
    working_version = previous_version

    if working_version < 1:
        # Version 1 introduced:
        # - Flexible SSD layout
        from ovs.extensions.generic.configuration import Configuration
        if Configuration.exists('ovs.arakoon'):
            Configuration.delete('ovs.arakoon', remove_root=True)
        Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        working_version = 1

    if working_version < 2:
        # Version 2 introduced:
        # - Registration
        import time
        from ovs.extensions.generic.configuration import Configuration
        if not Configuration.exists('ovs.core.registered'):
            Configuration.set('ovs.core.registered', False)
            Configuration.set('ovs.core.install_time', time.time())
        working_version = 2

    if working_version < 3:
        # Version 3 introduced:
        # - New arakoon clients
        from ovs.extensions.db.arakoon import ArakoonInstaller
        reload(ArakoonInstaller)
        from ovs.extensions.db.arakoon import ArakoonInstaller
        from ovs.extensions.generic.sshclient import SSHClient
        from ovs.extensions.generic.configuration import Configuration
        if master_ips is not None:
            # Re-deploy every arakoon cluster found on each master (best effort)
            for ip in master_ips:
                client = SSHClient(ip)
                if client.dir_exists(ArakoonInstaller.ArakoonInstaller.ARAKOON_CONFIG_DIR):
                    for cluster_name in client.dir_list(ArakoonInstaller.ArakoonInstaller.ARAKOON_CONFIG_DIR):
                        try:
                            ArakoonInstaller.ArakoonInstaller.deploy_cluster(cluster_name, ip)
                        except:
                            pass
        if Configuration.exists('ovs.core.storage.persistent'):
            Configuration.set('ovs.core.storage.persistent', 'pyrakoon')
        working_version = 3

    return working_version
def delete_config(cluster_name):
    """
    Remove the configuration entry for arakoon cluster_name
    :param cluster_name: Name of the arakoon cluster
    :return: None
    """
    key = GeneralArakoon.CONFIG_KEY.format(cluster_name)
    if not Configuration.exists(key, raw=True):
        return
    # The whole parent directory of the key is removed, not just the key itself
    Configuration.delete(os.path.dirname(key))
def test_delete(self):
    """
    Validates that keys can no longer be retrieved once they have been deleted
    """
    set_data = [(int, '/fooint', 1, False),
                (basestring, '/foostr', 'foo', True),
                (dict, '/foodict', {'foo': 'bar'}, False)]
    get_data = [(int, 'fooint', 1, False),
                (basestring, 'foostr', 'foo', True),
                (dict, 'foodict', {'foo': 'bar'}, False)]
    self._assert_set_get(set_data, get_data)

    # Delete two of the keys and verify that fetching them now raises
    for delete_key, fetch_key in [('/fooint', 'fooint'),
                                  ('/foostr', 'foostr')]:
        Configuration.delete(delete_key)
        with self.assertRaises(ConfigurationNotFoundException):
            Configuration.get(fetch_key)
def unregister_service(node_name, service_name):
    """
    Un-register the metadata of a service from the configuration management
    :param node_name: Name of the node on which to un-register the service
    :type node_name: str
    :param service_name: Name of the service to clean from the configuration management
    :type service_name: str
    :return: None
    """
    # Services are stored in the configuration management without their 'ovs-' prefix
    stripped_name = Toolbox.remove_prefix(service_name, 'ovs-')
    Configuration.delete(key='/ovs/framework/hosts/{0}/services/{1}'.format(node_name, stripped_name))
def delete_config(self, ip=None):
    """
    Deletes a configuration file
    :param ip: IP of the host holding the file (only used for filesystem-based configs)
    :return: None
    """
    if self.filesystem is False:
        # Config lives in the configuration management store
        if Configuration.exists(self.config_path, raw=True):
            Configuration.delete(self.config_path, raw=True)
    else:
        # Config is a plain file on the (possibly remote) filesystem
        self._load_client(ip).file_delete(self.config_path)
def delete_config(self, ip=None):
    """
    Deletes a configuration file
    :param ip: IP of the host holding the file (only used for filesystem-based configs)
    :return: None
    """
    if self.filesystem is False:
        # Config lives in the configuration management store
        if Configuration.exists(self.config_path, raw=True):
            Configuration.delete(self.config_path, raw=True)
    else:
        # Config is a plain file on the (possibly remote) filesystem
        self._load_client(ip).file_delete(self.config_path)
def clean_config_management(self):
    """
    Remove the configuration management entries related to a StorageDriver removal
    :return: A boolean indicating whether something went wrong
    :rtype: bool
    """
    vpool_guid = self.vp_installer.vpool.guid
    try:
        # Remove every proxy config tree, then the host entry of this StorageDriver
        for proxy in self.storagedriver.alba_proxies:
            Configuration.delete('/ovs/vpools/{0}/proxies/{1}'.format(vpool_guid, proxy.guid))
        Configuration.delete('/ovs/vpools/{0}/hosts/{1}'.format(vpool_guid, self.storagedriver.storagedriver_id))
        return False
    except Exception:
        self._logger.exception('Cleaning configuration management failed')
        return True
def teardown():
    """
    Teardown for Arakoon package, will be executed when all started tests in this package have ended
    Removal actions of possible things left over after the test-run
    :return: None
    """
    # Make sure the scheduled-tasks service runs again on every master and
    # remove any leftover test artefacts from its filesystem
    for master in GeneralStorageRouter.get_masters():
        client = SSHClient(master, username='******')
        if GeneralService.get_service_status(name='ovs-scheduled-tasks', client=client) is False:
            GeneralService.start_service(name='ovs-scheduled-tasks', client=client)
        for location in TEST_CLEANUP:
            client.run(['rm', '-rf', location])

    # Remove the configuration management keys created during the test-run
    for key in KEY_CLEANUP:
        full_key = '{0}/{1}'.format(GeneralArakoon.CONFIG_ROOT, key)
        if Configuration.exists(full_key, raw=True):
            Configuration.delete(full_key)
def mark_storagerouter_reachable_for_ha(cls, storagerouter):
    # type: (StorageRouter) -> None
    """
    Update the node distance map to add the storagerouter back into the HA pool
    :param storagerouter: Storagerouter to put back into the distance map
    :type storagerouter: StorageRouter
    :return: None
    """
    cls.logger.info("Marking Storagerouter {} as available for HA".format(storagerouter.name))
    # Dropping the key re-enables this StorageRouter in the distance map
    Configuration.delete(os.path.join(VPOOL_UPDATE_KEY, storagerouter.guid))
    # Trigger a complete reload of node distance maps
    StorageDriverController.cluster_registry_checkup()
    # Wait a few moment for the edge to catch up all the configs
    sync_delay = cls.get_edge_sync_time()
    cls.logger.info("Waiting {} to sync up all edge clients".format(sync_delay))
    time.sleep(sync_delay)
def revert_vpool(self, status):
    """
    Remove the vPool being created or revert the vPool being extended
    :param status: Status to put the vPool in
    :type status: ovs.dal.hybrids.vpool.VPool.STATUSES
    :return: None
    :rtype: NoneType
    """
    self.vpool.status = status
    self.vpool.save()

    if status == VPool.STATUSES.RUNNING:
        # Clean up the directories which were created on the StorageRouter
        if self.sr_installer is not None:
            try:
                self.sr_installer.root_client.dir_delete(directories=self.sr_installer.created_dirs)
            except Exception:
                self._logger.warning('Failed to clean up following directories: {0}'.format(', '.join(self.sr_installer.created_dirs)))

        # Remove the StorageDriver and its related model objects
        storagedriver = None if self.sd_installer is None else self.sd_installer.storagedriver
        if storagedriver is not None:
            for partition in storagedriver.partitions:
                partition.delete()
            for proxy in storagedriver.alba_proxies:
                proxy.delete()
            storagedriver.delete()

        # Last StorageDriver gone -> remove the vPool and its configuration entries
        if len(self.vpool.storagedrivers) == 0:
            self.vpool.delete()
            if Configuration.dir_exists(key='/ovs/vpools/{0}'.format(self.vpool.guid)):
                Configuration.delete(key='/ovs/vpools/{0}'.format(self.vpool.guid))
    elif status == VPool.STATUSES.FAILURE:
        # In case of failure status the cluster registry settings have already been adapted, so revert
        self.configure_cluster_registry(exclude=[self.sd_installer.storagedriver])
def remove_asd(node_guid, asd_id, expected_safety):
    """
    Removes an ASD
    :param node_guid: Guid of the node to remove an ASD from
    :type node_guid: str
    :param asd_id: ID of the ASD to remove
    :type asd_id: str
    :param expected_safety: Expected safety after having removed the ASD
    :type expected_safety: dict or None
    :return: Aliases of the disk on which the ASD was removed
    :rtype: list
    """
    node = AlbaNode(node_guid)
    AlbaNodeController._logger.debug('Removing ASD {0} at node {1}'.format(asd_id, node.ip))

    # Locate the OSD in the model (it might already be gone from the model)
    model_osd = None
    for disk in node.disks:
        for asd in disk.osds:
            if asd.osd_id == asd_id:
                model_osd = asd
                break
        if model_osd is not None:
            break
    if model_osd is not None:
        alba_backend = model_osd.alba_backend
    else:
        alba_backend = None

    # Ask the node which partition hosts the ASD (best effort - node might be down)
    asds = {}
    try:
        asds = node.client.get_asds()
    except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate ASD'.format(node.guid))
    partition_alias = None
    for alias, asd_ids in asds.iteritems():
        if asd_id in asd_ids:
            partition_alias = alias
            break

    if alba_backend is not None:
        if expected_safety is None:
            AlbaNodeController._logger.warning('Skipping safety check for ASD {0} on backend {1} - this is dangerous'.format(asd_id, alba_backend.guid))
        else:
            # Verify the safety after removal matches what the caller expects before purging
            final_safety = AlbaController.calculate_safety(alba_backend_guid=alba_backend.guid, removal_osd_ids=[asd_id])
            safety_lost = final_safety['lost']
            safety_crit = final_safety['critical']
            if (safety_crit != 0 or safety_lost != 0) and (safety_crit != expected_safety['critical'] or safety_lost != expected_safety['lost']):
                raise RuntimeError('Cannot remove ASD {0} as the current safety is not as expected ({1} vs {2})'.format(asd_id, final_safety, expected_safety))
            AlbaNodeController._logger.debug('Safety OK for ASD {0} on backend {1}'.format(asd_id, alba_backend.guid))
        AlbaNodeController._logger.debug('Purging ASD {0} on backend {1}'.format(asd_id, alba_backend.guid))
        AlbaController.remove_units(alba_backend_guid=alba_backend.guid, osd_ids=[asd_id])
    else:
        AlbaNodeController._logger.warning('Could not match ASD {0} to any backend. Cannot purge'.format(asd_id))

    # Physically remove the ASD from the disk on the node itself
    disk_data = None
    if partition_alias is not None:
        AlbaNodeController._logger.debug('Removing ASD {0} from disk {1}'.format(asd_id, partition_alias))
        for device_info in node.client.get_disks().itervalues():
            if partition_alias in device_info['partition_aliases']:
                disk_data = device_info
                result = node.client.delete_asd(disk_id=device_info['aliases'][0].split('/')[-1], asd_id=asd_id)
                if result['_success'] is False:
                    raise RuntimeError('Error removing ASD: {0}'.format(result['_error']))
        # Fix: the original compared 'disk_data == {}', which can never be true since
        # disk_data is initialized to None, leaving this error unreachable
        if disk_data is None:
            raise RuntimeError('Failed to find disk for partition with alias {0}'.format(partition_alias))
    else:
        # Fix: the original message had no '{0}' placeholder, so .format(asd_id) was a no-op
        AlbaNodeController._logger.warning('Could not remove ASD {0} from remote node (node down)'.format(asd_id))

    # Clean the configuration management and the model
    if Configuration.exists(AlbaNodeController.ASD_CONFIG.format(asd_id), raw=True):
        Configuration.delete(AlbaNodeController.ASD_CONFIG_DIR.format(asd_id), raw=True)
    if model_osd is not None:
        model_osd.delete()
    if alba_backend is not None:
        alba_backend.invalidate_dynamics()
        alba_backend.backend.invalidate_dynamics()
    if node.storagerouter is not None:
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
    return [] if disk_data is None else disk_data.get('aliases', [])
def remove_node(node_ip, silent=None):
    """
    Remove the node with specified IP from the cluster
    :param node_ip: IP of the node to remove
    :type node_ip: str
    :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
    :type silent: str
    :return: None
    """
    # Local imports to avoid circular dependencies at module load time
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.lib.storagedriver import StorageDriverController
    from ovs.lib.vpool import VPoolController

    Toolbox.log(logger=NodeRemovalController._logger, messages='Remove node', boxed=True)
    Toolbox.log(logger=NodeRemovalController._logger,
                messages='WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n')
    service_manager = ServiceFactory.get_manager()

    ###############
    # VALIDATIONS #
    ###############
    try:
        node_ip = node_ip.strip()
        # NOTE(review): this isinstance check comes AFTER .strip(), so a non-string
        # would already have raised AttributeError on the previous line - confirm intent
        if not isinstance(node_ip, str):
            raise ValueError('Node IP must be a string')
        if not re.match(SSHClient.IP_REGEX, node_ip):
            raise ValueError('Invalid IP {0} specified'.format(node_ip))

        storage_router_all = sorted(StorageRouterList.get_storagerouters(), key=lambda k: k.name)
        storage_router_masters = StorageRouterList.get_masters()
        storage_router_all_ips = set([storage_router.ip for storage_router in storage_router_all])
        storage_router_master_ips = set([storage_router.ip for storage_router in storage_router_masters])
        storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)
        offline_reasons = {}
        # Refuse removals which would leave the cluster in an impossible state
        if node_ip not in storage_router_all_ips:
            raise ValueError('Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}'.format('\n - '.join(storage_router_all_ips), node_ip))
        if len(storage_router_all_ips) == 1:
            raise RuntimeError("Removing the only node is not possible")
        if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
            raise RuntimeError("Removing the only master node is not possible")
        if System.get_my_storagerouter() == storage_router_to_remove:
            raise RuntimeError('The node to be removed cannot be identical to the node on which the removal is initiated')

        # Build an IP -> SSHClient map of reachable nodes and remember which are offline
        Toolbox.log(logger=NodeRemovalController._logger, messages='Creating SSH connections to remaining master nodes')
        master_ip = None
        ip_client_map = {}
        storage_routers_offline = []
        storage_router_to_remove_online = True
        for storage_router in storage_router_all:
            try:
                client = SSHClient(storage_router, username='******', timeout=10)
            except (UnableToConnectException, NotAuthenticatedException, TimeOutException) as ex:
                if isinstance(ex, UnableToConnectException):
                    msg = 'Unable to connect'
                elif isinstance(ex, NotAuthenticatedException):
                    msg = 'Could not authenticate'
                elif isinstance(ex, TimeOutException):
                    msg = 'Connection timed out'
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages=' * Node with IP {0:<15}- {1}'.format(storage_router.ip, msg))
                offline_reasons[storage_router.ip] = msg
                storage_routers_offline.append(storage_router)
                if storage_router == storage_router_to_remove:
                    storage_router_to_remove_online = False
                continue
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=' * Node with IP {0:<15}- Successfully connected'.format(storage_router.ip))
            ip_client_map[storage_router.ip] = client
            if storage_router != storage_router_to_remove and storage_router.node_type == 'MASTER':
                master_ip = storage_router.ip
        if len(ip_client_map) == 0 or master_ip is None:
            raise RuntimeError('Could not connect to any master node in the cluster')

        storage_router_to_remove.invalidate_dynamics('vdisks_guids')
        if len(storage_router_to_remove.vdisks_guids) > 0:  # vDisks are supposed to be moved away manually before removing a node
            raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

        # Verify the removal does not take out the last memcached / rabbitmq endpoint
        internal_memcached = Toolbox.is_service_internally_managed(service='memcached')
        internal_rabbit_mq = Toolbox.is_service_internally_managed(service='rabbitmq')
        memcached_endpoints = Configuration.get(key='/ovs/framework/memcache|endpoints')
        rabbit_mq_endpoints = Configuration.get(key='/ovs/framework/messagequeue|endpoints')
        copy_memcached_endpoints = list(memcached_endpoints)
        copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
        for endpoint in memcached_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_memcached_endpoints.remove(endpoint)
        for endpoint in rabbit_mq_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_rabbit_mq_endpoints.remove(endpoint)
        if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
            raise RuntimeError('Removal of provided nodes will result in a complete removal of the memcached service')
        if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
            raise RuntimeError('Removal of provided nodes will result in a complete removal of the messagequeue service')

        # Give plugins a chance to veto the removal
        Toolbox.run_hooks(component='noderemoval',
                          sub_component='validate_removal',
                          logger=NodeRemovalController._logger,
                          cluster_ip=storage_router_to_remove.ip)
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Removal has been aborted during the validation step. No changes have been applied.',
                    boxed=True,
                    loglevel='warning')
        sys.exit(1)
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel='exception')
        sys.exit(1)

    #################
    # CONFIRMATIONS #
    #################
    try:
        interactive = silent != '--force-yes'
        remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
        if interactive is True:
            if len(storage_routers_offline) > 0:
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='Certain nodes appear to be offline. These will not fully removed and will cause issues if they are not really offline.')
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='Offline nodes: {0}'.format(''.join(('\n * {0:<15}- {1}.'.format(ip, message) for ip, message in offline_reasons.iteritems()))))
                valid_node_info = Interactive.ask_yesno(message='Continue the removal with these being presumably offline?', default_value=False)
                if valid_node_info is False:
                    Toolbox.log(logger=NodeRemovalController._logger,
                                messages='Please validate the state of the nodes before removing.',
                                title=True)
                    sys.exit(1)
            proceed = Interactive.ask_yesno(message='Are you sure you want to remove node {0}?'.format(storage_router_to_remove.name), default_value=False)
            if proceed is False:
                Toolbox.log(logger=NodeRemovalController._logger, messages='Abort removal', title=True)
                sys.exit(1)
            # Interactive run: only ask about ASD manager removal when it is installed
            remove_asd_manager = True
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username='******')
                if service_manager.has_service(name='asd-manager', client=client):
                    remove_asd_manager = Interactive.ask_yesno(message='Do you also want to remove the ASD manager and related ASDs?', default_value=False)

        # Plugins may require extra confirmation before ASDs are removed
        if remove_asd_manager is True or storage_router_to_remove_online is False:
            for fct in Toolbox.fetch_hooks('noderemoval', 'validate_asd_removal'):
                validation_output = fct(storage_router_to_remove.ip)
                if validation_output['confirm'] is True:
                    if Interactive.ask_yesno(message=validation_output['question'], default_value=False) is False:
                        remove_asd_manager = False
                        break
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Removal has been aborted during the confirmation step. No changes have been applied.',
                    boxed=True,
                    loglevel='warning')
        sys.exit(1)
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel='exception')
        sys.exit(1)

    ###########
    # REMOVAL #
    ###########
    try:
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Starting removal of node {0} - {1}'.format(storage_router_to_remove.name, storage_router_to_remove.ip))
        if storage_router_to_remove_online is False:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=' Marking all Storage Drivers served by Storage Router {0} as offline'.format(storage_router_to_remove.ip))
            StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

        # Remove vPools
        # NOTE(review): this message contains no '{0}' placeholder, so the .format() call is a no-op
        Toolbox.log(logger=NodeRemovalController._logger, messages=' Removing vPools from node'.format(storage_router_to_remove.ip))
        storage_routers_offline_guids = [sr.guid for sr in storage_routers_offline if sr.guid != storage_router_to_remove.guid]
        for storage_driver in storage_router_to_remove.storagedrivers:
            Toolbox.log(logger=NodeRemovalController._logger, messages=' Removing vPool {0} from node'.format(storage_driver.vpool.name))
            VPoolController.shrink_vpool(storagedriver_guid=storage_driver.guid,
                                         offline_storage_router_guids=storage_routers_offline_guids)

        # Demote if MASTER
        if storage_router_to_remove.node_type == 'MASTER':
            NodeTypeController.demote_node(cluster_ip=storage_router_to_remove.ip,
                                           master_ip=master_ip,
                                           ip_client_map=ip_client_map,
                                           unique_id=storage_router_to_remove.machine_id,
                                           unconfigure_memcached=internal_memcached,
                                           unconfigure_rabbitmq=internal_rabbit_mq,
                                           offline_nodes=storage_routers_offline)

        # Stop / remove services
        Toolbox.log(logger=NodeRemovalController._logger, messages='Stopping and removing services')
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username='******')
            NodeRemovalController.remove_services(client=client,
                                                  node_type=storage_router_to_remove.node_type.lower(),
                                                  logger=NodeRemovalController._logger)
            service = 'watcher-config'
            if service_manager.has_service(service, client=client):
                Toolbox.log(logger=NodeRemovalController._logger, messages='Removing service {0}'.format(service))
                service_manager.stop_service(service, client=client)
                service_manager.remove_service(service, client=client)

        Toolbox.run_hooks(component='noderemoval',
                          sub_component='remove',
                          logger=NodeRemovalController._logger,
                          cluster_ip=storage_router_to_remove.ip,
                          complete_removal=remove_asd_manager)

        # Clean up model
        Toolbox.log(logger=NodeRemovalController._logger, messages='Removing node from model')
        for service in storage_router_to_remove.services:
            service.delete()
        for disk in storage_router_to_remove.disks:
            for partition in disk.partitions:
                partition.delete()
            disk.delete()
        for j_domain in storage_router_to_remove.domains:
            j_domain.delete()
        Configuration.delete('/ovs/framework/hosts/{0}'.format(storage_router_to_remove.machine_id))

        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map,
                                                                   offline_node_ips=[node.ip for node in storage_routers_offline],
                                                                   logger=NodeRemovalController._logger)

        # Remove the local config store files from the node itself when reachable
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username='******')
            client.file_delete(filenames=[CACC_LOCATION])
            client.file_delete(filenames=[CONFIG_STORE_LOCATION])
        storage_router_to_remove.delete()
        Toolbox.log(logger=NodeRemovalController._logger, messages='Successfully removed node\n')
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages=['An unexpected error occurred:', str(exception)],
                    boxed=True,
                    loglevel='exception')
        sys.exit(1)
    except KeyboardInterrupt:
        # NOTE(review): reachable because KeyboardInterrupt derives from BaseException,
        # not Exception, but listing it after the broad handler is unconventional
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                    boxed=True,
                    loglevel='error')
        sys.exit(1)

    # Finally remove the ASD manager itself via a remote call when requested
    if remove_asd_manager is True and storage_router_to_remove_online is True:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\nRemoving ASD Manager')
        with remote(storage_router_to_remove.ip, [os]) as rem:
            rem.os.system('asd-manager remove --force-yes')
    Toolbox.log(logger=NodeRemovalController._logger, messages='Remove nodes finished', title=True)
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from any version to any version, running all migrations required
    If previous_version is for example 0 and this script is at verison 3 it will execute two steps:
      - 1 > 2
      - 2 > 3
    :param previous_version: The previous version from which to start the migration.
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    """
    logger = LogHandler.get('extensions', name='migration')
    working_version = previous_version

    # Version 1 introduced:
    # - Flexible SSD layout
    if working_version < 1:
        try:
            from ovs.extensions.generic.configuration import Configuration
            if Configuration.exists('ovs.arakoon'):
                Configuration.delete('ovs.arakoon', remove_root=True)
            Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        except:
            # Migration steps are best effort: log and continue to the next version
            logger.exception('Error migrating to version 1')
        working_version = 1

    # Version 2 introduced:
    # - Registration
    if working_version < 2:
        try:
            import time
            from ovs.extensions.generic.configuration import Configuration
            if not Configuration.exists('ovs.core.registered'):
                Configuration.set('ovs.core.registered', False)
                Configuration.set('ovs.core.install_time', time.time())
        except:
            logger.exception('Error migrating to version 2')
        working_version = 2

    # Version 3 introduced:
    # - New arakoon clients
    if working_version < 3:
        try:
            # reload() ensures the freshest module code is used during the migration
            from ovs.extensions.db.arakoon import ArakoonInstaller
            reload(ArakoonInstaller)
            from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller
            from ovs.extensions.generic.sshclient import SSHClient
            from ovs.extensions.generic.configuration import Configuration
            if master_ips is not None:
                for ip in master_ips:
                    client = SSHClient(ip)
                    if client.dir_exists(ArakoonInstaller.ARAKOON_CONFIG_DIR):
                        for cluster_name in client.dir_list(ArakoonInstaller.ARAKOON_CONFIG_DIR):
                            try:
                                ArakoonInstaller.deploy_cluster(cluster_name, ip)
                            except:
                                pass
            if Configuration.exists('ovs.core.storage.persistent'):
                Configuration.set('ovs.core.storage.persistent', 'pyrakoon')
        except:
            logger.exception('Error migrating to version 3')
        working_version = 3

    # Version 4 introduced:
    # - Etcd
    if working_version < 4:
        try:
            import os
            import json
            from ConfigParser import RawConfigParser
            from ovs.extensions.db.etcd import installer
            reload(installer)
            from ovs.extensions.db.etcd.installer import EtcdInstaller
            from ovs.extensions.db.etcd.configuration import EtcdConfiguration
            from ovs.extensions.generic.system import System
            host_id = System.get_my_machine_id()
            etcd_migrate = False
            # Determine whether an etcd 'config' cluster already exists (locally or on
            # one of the given nodes); create or extend it when it does not
            if EtcdInstaller.has_cluster('127.0.0.1', 'config'):
                etcd_migrate = True
            else:
                if master_ips is not None and extra_ips is not None:
                    cluster_ip = None
                    for ip in master_ips + extra_ips:
                        if EtcdInstaller.has_cluster(ip, 'config'):
                            cluster_ip = ip
                            break
                    node_ip = None
                    path = '/opt/OpenvStorage/config/ovs.json'
                    if os.path.exists(path):
                        with open(path) as config_file:
                            config = json.load(config_file)
                            node_ip = config['grid']['ip']
                    if node_ip is not None:
                        if cluster_ip is None:
                            EtcdInstaller.create_cluster('config', node_ip)
                            EtcdConfiguration.initialize()
                            EtcdConfiguration.initialize_host(host_id)
                        else:
                            EtcdInstaller.extend_cluster(cluster_ip, node_ip, 'config')
                            EtcdConfiguration.initialize_host(host_id)
                        etcd_migrate = True
            if etcd_migrate is True:
                # Migrating configuration files
                # Move the contents of the legacy ovs.json into the etcd tree
                path = '/opt/OpenvStorage/config/ovs.json'
                if os.path.exists(path):
                    with open(path) as config_file:
                        config = json.load(config_file)
                        EtcdConfiguration.set('/ovs/framework/cluster_id', config['support']['cid'])
                        # Keep the earliest known install time when one was already registered
                        if not EtcdConfiguration.exists('/ovs/framework/install_time'):
                            EtcdConfiguration.set('/ovs/framework/install_time', config['core']['install_time'])
                        else:
                            EtcdConfiguration.set('/ovs/framework/install_time', min(EtcdConfiguration.get('/ovs/framework/install_time'), config['core']['install_time']))
                        EtcdConfiguration.set('/ovs/framework/registered', config['core']['registered'])
                        EtcdConfiguration.set('/ovs/framework/plugins/installed', config['plugins'])
                        EtcdConfiguration.set('/ovs/framework/stores', config['core']['storage'])
                        EtcdConfiguration.set('/ovs/framework/paths', {'cfgdir': config['core']['cfgdir'],
                                                                       'basedir': config['core']['basedir'],
                                                                       'ovsdb': config['core']['ovsdb']})
                        EtcdConfiguration.set('/ovs/framework/support', {'enablesupport': config['support']['enablesupport'],
                                                                         'enabled': config['support']['enabled'],
                                                                         'interval': config['support']['interval']})
                        EtcdConfiguration.set('/ovs/framework/storagedriver', {'mds_safety': config['storagedriver']['mds']['safety'],
                                                                               'mds_tlogs': config['storagedriver']['mds']['tlogs'],
                                                                               'mds_maxload': config['storagedriver']['mds']['maxload']})
                        EtcdConfiguration.set('/ovs/framework/webapps', {'html_endpoint': config['webapps']['html_endpoint'],
                                                                         'oauth2': config['webapps']['oauth2']})
                        EtcdConfiguration.set('/ovs/framework/messagequeue', {'endpoints': [],
                                                                              'protocol': config['core']['broker']['protocol'],
                                                                              'user': config['core']['broker']['login'],
                                                                              'port': config['core']['broker']['port'],
                                                                              'password': config['core']['broker']['password'],
                                                                              'queues': config['core']['broker']['queues']})
                        # '{{0}}' survives the first .format() so host_key stays a template
                        host_key = '/ovs/framework/hosts/{0}{{0}}'.format(host_id)
                        EtcdConfiguration.set(host_key.format('/storagedriver'), {'rsp': config['storagedriver']['rsp'],
                                                                                  'vmware_mode': config['storagedriver']['vmware_mode']})
                        EtcdConfiguration.set(host_key.format('/ports'), config['ports'])
                        EtcdConfiguration.set(host_key.format('/setupcompleted'), config['core']['setupcompleted'])
                        EtcdConfiguration.set(host_key.format('/versions'), config['core'].get('versions', {}))
                        EtcdConfiguration.set(host_key.format('/type'), config['core']['nodetype'])
                        EtcdConfiguration.set(host_key.format('/ip'), config['grid']['ip'])
                # Move the memcache client config into etcd and drop the legacy file
                path = '{0}/memcacheclient.cfg'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir'))
                if os.path.exists(path):
                    config = RawConfigParser()
                    config.read(path)
                    nodes = [config.get(node.strip(), 'location').strip() for node in config.get('main', 'nodes').split(',')]
                    EtcdConfiguration.set('/ovs/framework/memcache|endpoints', nodes)
                    os.remove(path)
                # Same for the rabbitmq client config
                path = '{0}/rabbitmqclient.cfg'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir'))
                if os.path.exists(path):
                    config = RawConfigParser()
                    config.read(path)
                    nodes = [config.get(node.strip(), 'location').strip() for node in config.get('main', 'nodes').split(',')]
                    EtcdConfiguration.set('/ovs/framework/messagequeue|endpoints', nodes)
                    os.remove(path)
                # Migrate arakoon configuration files
                from ovs.extensions.db.arakoon import ArakoonInstaller
                reload(ArakoonInstaller)
                from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller, ArakoonClusterConfig
                from ovs.extensions.generic.sshclient import SSHClient
                if master_ips is not None:
                    config_dir = '/opt/OpenvStorage/config/arakoon/'
                    for ip in master_ips:
                        client = SSHClient(ip)
                        if client.dir_exists(config_dir):
                            for cluster_name in client.dir_list(config_dir):
                                try:
                                    # Store the raw cluster config in etcd, then redeploy
                                    with open('{0}/{1}/{1}.cfg'.format(config_dir, cluster_name)) as config_file:
                                        EtcdConfiguration.set(ArakoonClusterConfig.ETCD_CONFIG_KEY.format(cluster_name),
                                                              config_file.read(),
                                                              raw=True)
                                    ArakoonInstaller.deploy_cluster(cluster_name, ip)
                                except:
                                    logger.exception('Error migrating {0} on {1}'.format(cluster_name, ip))
                            client.dir_delete(config_dir)
        except:
            logger.exception('Error migrating to version 4')
        working_version = 4

    return working_version
def shrink_vpool(cls, storagedriver_guid, offline_storage_router_guids=list()):
    # NOTE(review): mutable default argument `list()` is a known Python pitfall. It is never
    # mutated in this function (only read/concatenated), so behavior is fine today, but a
    # future in-place mutation would leak across calls. Consider `=None` + local default.
    """
    Removes a StorageDriver (if its the last StorageDriver for a vPool, the vPool is removed as well)
    :param storagedriver_guid: Guid of the StorageDriver to remove
    :type storagedriver_guid: str
    :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                         WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS
    :type offline_storage_router_guids: list
    :return: None
    :rtype: NoneType
    :raises RuntimeError: when offline StorageRouters don't match the given guids, when the mount
                          point is still in use, when no responsive node remains, when MDS services
                          could not be migrated away, or when any removal step reported errors
    """
    # TODO: Add logging
    # TODO: Unit test individual pieces of code
    # Validations
    storagedriver = StorageDriver(storagedriver_guid)
    storagerouter = storagedriver.storagerouter
    cls._logger.info('StorageDriver {0} - Deleting StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
    vp_installer.validate(storagedriver=storagedriver)

    sd_installer = StorageDriverInstaller(vp_installer=vp_installer, storagedriver=storagedriver)

    # NOTE(review): the second format argument is unused ('{1}' missing in the template) — harmless.
    cls._logger.info('StorageDriver {0} - Checking availability of related StorageRouters'.format(storagedriver.guid, storagedriver.name))
    # Map every StorageRouter hosting a StorageDriver of this vPool to a root SSH client;
    # unreachable ones end up under the 'offline' key.
    sr_client_map = SSHClient.get_clients(endpoints=[sd.storagerouter for sd in vp_installer.vpool.storagedrivers], user_names=['root'])
    sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(storagerouter, {}).get('root'),
                                          storagerouter=storagerouter,
                                          vp_installer=vp_installer,
                                          sd_installer=sd_installer)

    # The set of actually-unreachable StorageRouters must exactly match what the caller declared offline
    offline_srs = sr_client_map.pop('offline')
    if sorted([sr.guid for sr in offline_srs]) != sorted(offline_storage_router_guids):
        raise RuntimeError('Not all StorageRouters are reachable')

    if storagerouter not in offline_srs:
        # Refuse to remove the StorageDriver while any process still holds the vPool mount point open.
        # The single quote in the vPool name is shell-escaped; `|| true` keeps a clean exit when lsof finds nothing.
        mtpt_pids = sr_installer.root_client.run("lsof -t +D '/mnt/{0}' || true".format(vp_installer.name.replace(r"'", r"'\''")), allow_insecure=True).splitlines()
        if len(mtpt_pids) > 0:
            raise RuntimeError('vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'.format(', '.join(mtpt_pids)))

    # Retrieve reachable StorageDrivers
    reachable_storagedrivers = []
    for sd in vp_installer.vpool.storagedrivers:
        if sd.storagerouter not in sr_client_map:
            # StorageRouter is offline
            continue

        sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vp_installer.vpool.guid, sd.storagedriver_id)
        if Configuration.exists(sd_key) is True:
            path = Configuration.get_configuration_path(sd_key)
            with remote(sd.storagerouter.ip, [LocalStorageRouterClient]) as rem:
                try:
                    lsrc = rem.LocalStorageRouterClient(path)
                    lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                    cls._logger.info('StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'.format(storagedriver.guid, sd.name, sd.storagerouter.ip))
                    reachable_storagedrivers.append(sd)
                except Exception as exception:
                    # Connection failures just mean this StorageDriver is unresponsive; anything else is fatal
                    if not is_connection_failure(exception):
                        raise

    if len(reachable_storagedrivers) == 0:
        raise RuntimeError('Could not find any responsive node in the cluster')

    # Start removal: SHRINKING when other StorageDrivers remain, DELETING when this is the last one
    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
    else:
        vp_installer.update_status(status=VPool.STATUSES.DELETING)

    # Clean up stale vDisks
    cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(storagedriver.guid))
    VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

    # Reconfigure the MDSes: move MDS responsibility for every vDisk away from this StorageRouter
    cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(storagedriver.guid))
    for vdisk_guid in storagerouter.vdisks_guids:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk_guid, excluded_storagerouter_guids=[storagerouter.guid] + offline_storage_router_guids)
        except Exception:
            cls._logger.exception('StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'.format(storagedriver.guid, vdisk_guid))

    # Validate that all MDSes on current StorageRouter have been moved away
    # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
    vdisks = []
    for mds in vp_installer.mds_services:
        for junction in mds.vdisks:
            vdisk = junction.vdisk
            if vdisk in vdisks:
                continue
            vdisks.append(vdisk)
            cls._logger.critical('StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'.format(storagedriver.guid, vdisk.guid, vdisk.name))
    if len(vdisks) > 0:
        # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        raise RuntimeError('Not all MDS Services have been successfully migrated away')

    # Start with actual removal
    # NOTE(review): `errors_found` starts as False, so every `&=` below is a no-op
    # (False & x == False) — failures reported by these helpers can never flip it to True.
    # Presumably `|=` was intended (if the helpers return truthy on error) — confirm the
    # helpers' return convention before changing; only the two explicit
    # `errors_found = True` assignments further down currently have any effect.
    errors_found = False
    if storagerouter not in offline_srs:
        errors_found &= sd_installer.stop_services()

    errors_found &= vp_installer.configure_cluster_registry(exclude=[storagedriver], apply_on=reachable_storagedrivers)
    errors_found &= vp_installer.update_node_distance_map()
    errors_found &= vp_installer.remove_mds_services()
    errors_found &= sd_installer.clean_config_management()
    errors_found &= sd_installer.clean_model()

    if storagerouter not in offline_srs:
        errors_found &= sd_installer.clean_directories(mountpoints=StorageRouterController.get_mountpoints(client=sr_installer.root_client))

        try:
            DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)
        except Exception:
            cls._logger.exception('StorageDriver {0} - Synchronizing disks with reality failed'.format(storagedriver.guid))
            errors_found = True

    if vp_installer.storagedriver_amount > 1:
        # Update the vPool metadata and run DTL checkup
        vp_installer.vpool.metadata['caching_info'].pop(sr_installer.storagerouter.guid, None)
        vp_installer.vpool.save()

        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
        except Exception:
            cls._logger.exception('StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'.format(storagedriver.guid, vp_installer.name, vp_installer.vpool.guid))
    else:
        cls._logger.info('StorageDriver {0} - Removing vPool from model'.format(storagedriver.guid))
        # Clean up model
        try:
            vp_installer.vpool.delete()
        except Exception:
            errors_found = True
            cls._logger.exception('StorageDriver {0} - Cleaning up vPool from the model failed'.format(storagedriver.guid))
        Configuration.delete('/ovs/vpools/{0}'.format(vp_installer.vpool.guid))

    cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(storagedriver.guid))
    try:
        MDSServiceController.mds_checkup()
    except Exception:
        cls._logger.exception('StorageDriver {0} - MDS checkup failed'.format(storagedriver.guid))

    # Update vPool status
    if errors_found is True:
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise RuntimeError('1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information')

    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('StorageDriver {0} - Deleted StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    # Last vPool gone: tear down the internal voldrv Arakoon cluster and its model services
    if len(VPoolList.get_vpools()) == 0:
        cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
        if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)['internal'] is True:
            cls._logger.debug('StorageDriver {0} - Removing Arakoon cluster {1}'.format(storagedriver.guid, cluster_name))
            try:
                installer = ArakoonInstaller(cluster_name=cluster_name)
                installer.load()
                installer.delete_cluster()
            except Exception:
                cls._logger.exception('StorageDriver {0} - Delete voldrv Arakoon cluster failed'.format(storagedriver.guid))
            service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
            # list(...) copy: deleting while iterating the live relation would skip entries
            for service in list(service_type.services):
                if service.name == service_name:
                    service.delete()

    # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
    if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
        try:
            if cls._service_manager.has_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client):
                cls._service_manager.stop_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
                cls._service_manager.remove_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
        except Exception:
            cls._logger.exception('StorageDriver {0} - {1} service deletion failed'.format(storagedriver.guid, ServiceFactory.SERVICE_WATCHER_VOLDRV))
def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually
    executed. This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...

    Each migration below is one-shot: it is guarded by a boolean flag under
    /ovs/framework/migration| in the configuration management, which is only set to True
    once that migration completed without errors, so failed migrations are retried on the next run.
    :return: None
    """
    AlbaMigrationController._logger.info('Preparing out of band migrations...')

    # Imports are deferred on purpose: this runs "out of band" and some of these modules
    # require services that may not be available at module-load time.
    from ovs.dal.hybrids.diskpartition import DiskPartition
    from ovs.dal.lists.albabackendlist import AlbaBackendList
    from ovs.dal.lists.albanodelist import AlbaNodeList
    from ovs.dal.lists.albaosdlist import AlbaOSDList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
    from ovs.extensions.migration.migration.albamigrator import ExtensionMigrator
    from ovs.extensions.packages.albapackagefactory import PackageFactory
    from ovs.extensions.services.albaservicefactory import ServiceFactory
    from ovs.extensions.plugins.albacli import AlbaCLI, AlbaError
    from ovs.lib.alba import AlbaController
    from ovs.lib.disk import DiskController

    AlbaMigrationController._logger.info('Start out of band migrations...')

    #############################################
    # Introduction of IP:port combination on OSDs
    # First collect ips/port per OSD long_id from ALBA itself, then back-fill the model.
    osd_info_map = {}
    alba_backends = AlbaBackendList.get_albabackends()
    for alba_backend in alba_backends:
        AlbaMigrationController._logger.info('Verifying ALBA Backend {0}'.format(alba_backend.name))
        if alba_backend.abm_cluster is None:
            AlbaMigrationController._logger.warning('ALBA Backend {0} does not have an ABM cluster registered'.format(alba_backend.name))
            continue

        AlbaMigrationController._logger.debug('Retrieving configuration path for ALBA Backend {0}'.format(alba_backend.name))
        try:
            config = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
        except:
            # NOTE(review): bare except — catches BaseException; `except Exception:` would be safer
            AlbaMigrationController._logger.exception('Failed to retrieve the configuration path for ALBA Backend {0}'.format(alba_backend.name))
            continue

        AlbaMigrationController._logger.info('Retrieving OSD information for ALBA Backend {0}'.format(alba_backend.name))
        try:
            osd_info = AlbaCLI.run(command='list-all-osds', config=config)
        except (AlbaError, RuntimeError):
            AlbaMigrationController._logger.exception('Failed to retrieve OSD information for ALBA Backend {0}'.format(alba_backend.name))
            continue

        # NOTE(review): the loop variable shadows the list being iterated (`for osd_info in osd_info`).
        # Works in Python, but rebinding the name makes the original list unreachable afterwards —
        # a distinct name for the element would be clearer.
        for osd_info in osd_info:
            if osd_info.get('long_id'):
                osd_info_map[osd_info['long_id']] = {'ips': osd_info.get('ips', []),
                                                     'port': osd_info.get('port')}

    # Back-fill ips/port on modelled OSDs that do not have them yet (never overwrites existing values)
    for osd in AlbaOSDList.get_albaosds():
        if osd.osd_id not in osd_info_map:
            AlbaMigrationController._logger.warning('OSD with ID {0} is modelled but could not be found through ALBA'.format(osd.osd_id))
            continue

        ips = osd_info_map[osd.osd_id]['ips']
        port = osd_info_map[osd.osd_id]['port']
        changes = False
        if osd.ips is None:
            changes = True
            osd.ips = ips
        if osd.port is None:
            changes = True
            osd.port = port
        if changes is True:
            AlbaMigrationController._logger.info('Updating OSD with ID {0} with IPS {1} and port {2}'.format(osd.osd_id, ips, port))
            osd.save()

    ###################################################
    # Read preference for GLOBAL ALBA Backends (1.10.3) (https://github.com/openvstorage/framework-alba-plugin/issues/452)
    # Moves the per-backend maintenance config to a per-service config enriched with read preferences.
    if Configuration.get(key='/ovs/framework/migration|read_preference', default=False) is False:
        try:
            name_backend_map = dict((alba_backend.name, alba_backend) for alba_backend in alba_backends)
            for alba_node in AlbaNodeList.get_albanodes():
                AlbaMigrationController._logger.info('Processing maintenance services running on ALBA Node {0} with ID {1}'.format(alba_node.ip, alba_node.node_id))
                alba_node.invalidate_dynamics('maintenance_services')
                for alba_backend_name, services in alba_node.maintenance_services.iteritems():
                    if alba_backend_name not in name_backend_map:
                        AlbaMigrationController._logger.error('ALBA Node {0} has services for an ALBA Backend {1} which is not modelled'.format(alba_node.ip, alba_backend_name))
                        continue

                    alba_backend = name_backend_map[alba_backend_name]
                    AlbaMigrationController._logger.info('Processing {0} ALBA Backend {1} with GUID {2}'.format(alba_backend.scaling, alba_backend.name, alba_backend.guid))
                    if alba_backend.scaling == alba_backend.SCALINGS.LOCAL:
                        # LOCAL backends always prefer their own node
                        read_preferences = [alba_node.node_id]
                    else:
                        read_preferences = AlbaController.get_read_preferences_for_global_backend(alba_backend=alba_backend,
                                                                                                 alba_node_id=alba_node.node_id,
                                                                                                 read_preferences=[])

                    for service_name, _ in services:
                        AlbaMigrationController._logger.info('Processing service {0}'.format(service_name))
                        old_config_key = '/ovs/alba/backends/{0}/maintenance/config'.format(alba_backend.guid)
                        new_config_key = '/ovs/alba/backends/{0}/maintenance/{1}/config'.format(alba_backend.guid, service_name)
                        if Configuration.exists(key=old_config_key):
                            new_config = Configuration.get(key=old_config_key)
                            new_config['read_preference'] = read_preferences
                            Configuration.set(key=new_config_key, value=new_config)
            # Old backend-wide configs are only removed once every node/service has been processed
            for alba_backend in alba_backends:
                Configuration.delete(key='/ovs/alba/backends/{0}/maintenance/config'.format(alba_backend.guid))
            AlbaController.checkup_maintenance_agents.delay()

            Configuration.set(key='/ovs/framework/migration|read_preference', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating read preferences for ALBA Backends failed')

    #######################################################
    # Storing actual package name in version files (1.11.0) (https://github.com/openvstorage/framework/issues/1876)
    # Rewrites *.version files to reference the enterprise ALBA package name and updates the
    # EXTRA_VERSION_CMD in each affected service config.
    changed_clients = set()
    storagerouters = StorageRouterList.get_storagerouters()
    if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file_alba', default=False) is False:
        try:
            service_manager = ServiceFactory.get_manager()
            alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
            for storagerouter in storagerouters:
                try:
                    root_client = SSHClient(endpoint=storagerouter.ip, username='******')  # Use '.ip' instead of StorageRouter object because this code is executed during post-update at which point the heartbeat has not been updated for some time
                except UnableToConnectException:
                    AlbaMigrationController._logger.exception('Updating actual package name for version files failed on StorageRouter {0}'.format(storagerouter.ip))
                    continue

                for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                    if not file_name.endswith('.version'):
                        continue
                    file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                    contents = root_client.file_read(filename=file_path)
                    if alba_pkg_name == PackageFactory.PKG_ALBA_EE and '{0}='.format(PackageFactory.PKG_ALBA) in contents:
                        # Rewrite the version file in the RUN_FILE_DIR
                        contents = contents.replace(PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE)
                        root_client.file_write(filename=file_path, contents=contents)

                        # Regenerate the service and update the EXTRA_VERSION_CMD in the configuration management
                        service_name = file_name.split('.')[0]
                        service_config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                        if Configuration.exists(key=service_config_key):
                            service_config = Configuration.get(key=service_config_key)
                            if 'EXTRA_VERSION_CMD' in service_config:
                                service_config['EXTRA_VERSION_CMD'] = '{0}=`{1}`'.format(alba_pkg_name, alba_version_cmd)
                                Configuration.set(key=service_config_key, value=service_config)
                                service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name='ovs-{0}'.format(service_name))  # Leave out .version
                                changed_clients.add(root_client)
            Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file_alba', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating actual package name for version files failed')

    # Regenerated unit files require a daemon-reload on every touched node
    for root_client in changed_clients:
        try:
            root_client.run(['systemctl', 'daemon-reload'])
        except Exception:
            AlbaMigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

    ####################################
    # Fix for migration version (1.11.0)
    # Previous code could potentially store a higher version number in the config management than the actual version number
    if Configuration.get(key='/ovs/framework/migration|alba_migration_version_fix', default=False) is False:
        try:
            for storagerouter in storagerouters:
                config_key = '/ovs/framework/hosts/{0}/versions'.format(storagerouter.machine_id)
                if Configuration.exists(key=config_key):
                    versions = Configuration.get(key=config_key)
                    if versions.get(PackageFactory.COMP_MIGRATION_ALBA, 0) > ExtensionMigrator.THIS_VERSION:
                        # Clamp the stored version back down to the actual migrator version
                        versions[PackageFactory.COMP_MIGRATION_ALBA] = ExtensionMigrator.THIS_VERSION
                        Configuration.set(key=config_key, value=versions)
            Configuration.set(key='/ovs/framework/migration|alba_migration_version_fix', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating migration version failed')

    ####################################
    # Enable auto-cleanup
    migration_auto_cleanup_key = '/ovs/framework/migration|alba_auto_cleanup'
    if Configuration.get(key=migration_auto_cleanup_key, default=False) is False:
        try:
            for storagerouter in StorageRouterList.get_storagerouters():
                storagerouter.invalidate_dynamics('features')  # New feature was added
            errors = []
            for alba_backend in AlbaBackendList.get_albabackends():
                try:
                    AlbaController.set_auto_cleanup(alba_backend.guid)
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Failed to set the auto-cleanup for ALBA Backend {0}'.format(alba_backend.name))
                    errors.append(ex)
            # Flag is only set when every backend succeeded, so failures are retried next run
            if len(errors) == 0:
                Configuration.set(key=migration_auto_cleanup_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating auto cleanup failed')

    ####################################
    # Change cache eviction
    migration_random_eviction_key = '/ovs/framework/migration|alba_cache_eviction_random'
    if Configuration.get(key=migration_random_eviction_key, default=False) is False:
        try:
            errors = []
            for alba_backend in AlbaBackendList.get_albabackends():
                try:
                    AlbaController.set_cache_eviction(alba_backend.guid)
                except Exception as ex:
                    # NOTE(review): log messages in this section say 'auto-cleanup' but this
                    # migration changes cache eviction — looks like a copy/paste from the block above.
                    AlbaMigrationController._logger.exception('Failed to set the auto-cleanup for ALBA Backend {0}'.format(alba_backend.name))
                    errors.append(ex)
            if len(errors) == 0:
                Configuration.set(key=migration_random_eviction_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating auto cleanup failed')

    ###################################################
    # Sync all disks and apply the backend role. Backend role was removed with the AD (since 1.10)
    albanode_backend_role_sync_key = '/ovs/framework/migration|albanode_backend_role_sync'
    if not Configuration.get(key=albanode_backend_role_sync_key, default=False):
        try:
            errors = []
            for alba_node in AlbaNodeList.get_albanodes():
                try:
                    if not alba_node.storagerouter:
                        continue
                    stack = alba_node.client.get_stack()  # type: dict
                    for slot_id, slot_information in stack.iteritems():
                        osds = slot_information.get('osds', {})  # type: dict
                        slot_aliases = slot_information.get('aliases', [])  # type: list
                        if not osds:  # No osds means no partition was made
                            continue
                        # Sync to add all potential partitions that will need a backend role
                        DiskController.sync_with_reality(storagerouter_guid=alba_node.storagerouter_guid)
                        for disk in alba_node.storagerouter.disks:
                            if set(disk.aliases).intersection(set(slot_aliases)):
                                # NOTE(review): assumes the first partition is the relevant one — confirm
                                partition = disk.partitions[0]
                                if DiskPartition.ROLES.BACKEND not in partition.roles:
                                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                                    partition.save()
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Syncing for storagerouter/albanode {0} failed'.format(alba_node.storagerouter.ip))
                    errors.append(ex)
            if not errors:
                Configuration.set(key=albanode_backend_role_sync_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Syncing up the disks for backend roles failed')

    AlbaMigrationController._logger.info('Finished out of band migrations')
def remove_osd(node_guid, osd_id, expected_safety):
    """
    Removes an OSD from a node: verifies safety, purges the OSD from the ALBA backend,
    deletes it physically on the node and cleans up configuration management and the model.
    :param node_guid: Guid of the node to remove an OSD from
    :type node_guid: str
    :param osd_id: ID of the OSD to remove
    :type osd_id: str
    :param expected_safety: Expected safety after having removed the OSD (None skips the safety check)
    :type expected_safety: dict or None
    :return: List containing the slot ID the OSD was removed from
    :rtype: list
    :raises RuntimeError: when the safety differs from the expectation or the node reports a removal failure
    """
    logger = AlbaNodeController._logger

    # Resolve the node and the corresponding OSD in the model
    alba_node = AlbaNode(node_guid)
    logger.debug('Removing OSD {0} at node {1}'.format(osd_id, alba_node.ip))
    osd = AlbaOSDList.get_by_osd_id(osd_id)
    alba_backend = osd.alba_backend

    if expected_safety is None:
        logger.warning('Skipping safety check for OSD {0} on backend {1} - this is dangerous'.format(osd_id, alba_backend.guid))
    else:
        # Compare the safety after removal against what the caller expects; abort on any surprise
        final_safety = AlbaController.calculate_safety(alba_backend_guid=alba_backend.guid,
                                                       removal_osd_ids=[osd_id])
        safety_lost = final_safety['lost']
        safety_crit = final_safety['critical']
        if safety_crit != 0 or safety_lost != 0:
            if safety_crit != expected_safety['critical'] or safety_lost != expected_safety['lost']:
                raise RuntimeError('Cannot remove OSD {0} as the current safety is not as expected ({1} vs {2})'.format(osd_id, final_safety, expected_safety))
        logger.debug('Safety OK for OSD {0} on backend {1}'.format(osd_id, alba_backend.guid))

    # Purge the OSD from the ALBA backend
    logger.debug('Purging OSD {0} on backend {1}'.format(osd_id, alba_backend.guid))
    AlbaController.remove_units(alba_backend_guid=alba_backend.guid, osd_ids=[osd_id])

    # Ask the node itself to delete the OSD
    removal_result = alba_node.client.delete_osd(slot_id=osd.slot_id, osd_id=osd_id)
    if removal_result['_success'] is False:
        raise RuntimeError('Error removing OSD: {0}'.format(removal_result['_error']))

    # Clean configuration management and model - Well, just try it at least
    if Configuration.exists(ASD_CONFIG.format(osd_id), raw=True):
        Configuration.delete(ASD_CONFIG_DIR.format(osd_id), raw=True)

    removed_slot_id = osd.slot_id
    osd.delete()
    alba_node.invalidate_dynamics()
    if alba_backend is not None:
        alba_backend.invalidate_dynamics()
        alba_backend.backend.invalidate_dynamics()
    if alba_node.storagerouter is not None:
        try:
            DiskController.sync_with_reality(storagerouter_guid=alba_node.storagerouter_guid)
        except UnableToConnectException:
            logger.warning('Skipping disk sync since StorageRouter {0} is offline'.format(alba_node.storagerouter.name))

    return [removed_slot_id]
def remove_node(node_ip, silent=None):
    """
    Remove the node with specified IP from the cluster
    :param node_ip: IP of the node to remove
    :type node_ip: str
    :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
    :type silent: str
    :return: None

    Flow: validate input and cluster state -> (interactively) confirm -> remove vPools,
    demote masters, stop/remove services, clean configuration store and model.
    Exits the process via sys.exit(1) on validation failure, user abort, unexpected error
    or KeyboardInterrupt.
    """
    from ovs.lib.storagedriver import StorageDriverController
    from ovs.lib.storagerouter import StorageRouterController
    from ovs.dal.lists.storagerouterlist import StorageRouterList

    Toolbox.log(logger=NodeRemovalController._logger, messages="Remove node", boxed=True)
    Toolbox.log(
        logger=NodeRemovalController._logger,
        messages="WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n",
    )

    ###############
    # VALIDATIONS #
    ###############
    try:
        # NOTE(review): .strip() is called before the isinstance check — a non-string (e.g. None)
        # raises AttributeError here, so the "Node IP must be a string" ValueError is unreachable.
        # Swapping the two lines would restore the intended error message.
        node_ip = node_ip.strip()
        if not isinstance(node_ip, str):
            raise ValueError("Node IP must be a string")
        if not re.match(SSHClient.IP_REGEX, node_ip):
            raise ValueError("Invalid IP {0} specified".format(node_ip))

        storage_router_all = StorageRouterList.get_storagerouters()
        storage_router_masters = StorageRouterList.get_masters()
        storage_router_all_ips = set([storage_router.ip for storage_router in storage_router_all])
        storage_router_master_ips = set([storage_router.ip for storage_router in storage_router_masters])
        storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)

        if node_ip not in storage_router_all_ips:
            raise ValueError(
                "Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}".format(
                    "\n - ".join(storage_router_all_ips), node_ip
                )
            )
        # Refuse removals that would leave the cluster without nodes or without masters
        if len(storage_router_all_ips) == 1:
            raise RuntimeError("Removing the only node is not possible")
        if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
            raise RuntimeError("Removing the only master node is not possible")
        if System.get_my_storagerouter() == storage_router_to_remove:
            raise RuntimeError(
                "The node to be removed cannot be identical to the node on which the removal is initiated"
            )

        Toolbox.log(
            logger=NodeRemovalController._logger, messages="Creating SSH connections to remaining master nodes"
        )
        master_ip = None
        ip_client_map = {}
        storage_routers_offline = []
        storage_router_to_remove_online = True
        for storage_router in storage_router_all:
            try:
                client = SSHClient(storage_router, username="******")
                # 'pwd' doubles as a cheap liveness probe for the SSH connection
                if client.run(["pwd"]):
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages=" Node with IP {0:<15} successfully connected to".format(storage_router.ip),
                    )
                    ip_client_map[storage_router.ip] = client
                    if storage_router != storage_router_to_remove and storage_router.node_type == "MASTER":
                        master_ip = storage_router.ip
            except UnableToConnectException:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages=" Node with IP {0:<15} is unreachable".format(storage_router.ip),
                )
                storage_routers_offline.append(storage_router)
                if storage_router == storage_router_to_remove:
                    storage_router_to_remove_online = False

        if len(ip_client_map) == 0 or master_ip is None:
            raise RuntimeError("Could not connect to any master node in the cluster")

        storage_router_to_remove.invalidate_dynamics("vdisks_guids")
        if (
            len(storage_router_to_remove.vdisks_guids) > 0
        ):  # vDisks are supposed to be moved away manually before removing a node
            raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

        # Removing this node must not take down the last memcached/rabbitmq endpoint
        # when those services are internally managed
        internal_memcached = Toolbox.is_service_internally_managed(service="memcached")
        internal_rabbit_mq = Toolbox.is_service_internally_managed(service="rabbitmq")
        memcached_endpoints = Configuration.get(key="/ovs/framework/memcache|endpoints")
        rabbit_mq_endpoints = Configuration.get(key="/ovs/framework/messagequeue|endpoints")
        copy_memcached_endpoints = list(memcached_endpoints)
        copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
        for endpoint in memcached_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_memcached_endpoints.remove(endpoint)
        for endpoint in rabbit_mq_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_rabbit_mq_endpoints.remove(endpoint)
        if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
            raise RuntimeError(
                "Removal of provided nodes will result in a complete removal of the memcached service"
            )
        if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
            raise RuntimeError(
                "Removal of provided nodes will result in a complete removal of the messagequeue service"
            )
    except Exception as exception:
        Toolbox.log(
            logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel="exception"
        )
        sys.exit(1)

    #################
    # CONFIRMATIONS #
    #################
    interactive = silent != "--force-yes"
    remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
    if interactive is True:
        proceed = Interactive.ask_yesno(
            message="Are you sure you want to remove node {0}?".format(storage_router_to_remove.name),
            default_value=False,
        )
        if proceed is False:
            Toolbox.log(logger=NodeRemovalController._logger, messages="Abort removal", title=True)
            sys.exit(1)
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            if ServiceManager.has_service(name="asd-manager", client=client):
                remove_asd_manager = Interactive.ask_yesno(
                    message="Do you also want to remove the ASD manager and related ASDs?", default_value=False
                )

    # Plugins get a veto on ASD removal; any negative answer cancels it
    if remove_asd_manager is True or storage_router_to_remove_online is False:
        for function in Toolbox.fetch_hooks("setup", "validate_asd_removal"):
            validation_output = function(storage_router_to_remove.ip)
            if validation_output["confirm"] is True:
                if Interactive.ask_yesno(message=validation_output["question"], default_value=False) is False:
                    remove_asd_manager = False
                    break

    ###########
    # REMOVAL #
    ###########
    try:
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages="Starting removal of node {0} - {1}".format(
                storage_router_to_remove.name, storage_router_to_remove.ip
            ),
        )
        if storage_router_to_remove_online is False:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=" Marking all Storage Drivers served by Storage Router {0} as offline".format(
                    storage_router_to_remove.ip
                ),
            )
            StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

        # Remove vPools
        # NOTE(review): .format() on a string without a placeholder — the IP argument is ignored
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=" Removing vPools from node".format(storage_router_to_remove.ip),
        )
        storage_routers_offline_guids = [
            sr.guid for sr in storage_routers_offline if sr.guid != storage_router_to_remove.guid
        ]
        for storage_driver in storage_router_to_remove.storagedrivers:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=" Removing vPool {0} from node".format(storage_driver.vpool.name),
            )
            StorageRouterController.remove_storagedriver(
                storagedriver_guid=storage_driver.guid,
                offline_storage_router_guids=storage_routers_offline_guids
            )

        # Demote if MASTER
        if storage_router_to_remove.node_type == "MASTER":
            NodeTypeController.demote_node(
                cluster_ip=storage_router_to_remove.ip,
                master_ip=master_ip,
                ip_client_map=ip_client_map,
                unique_id=storage_router_to_remove.machine_id,
                unconfigure_memcached=internal_memcached,
                unconfigure_rabbitmq=internal_rabbit_mq,
                offline_nodes=storage_routers_offline,
            )

        # Stop / remove services
        Toolbox.log(logger=NodeRemovalController._logger, messages="Stopping and removing services")
        config_store = Configuration.get_store()
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            NodeRemovalController.remove_services(
                client=client,
                node_type=storage_router_to_remove.node_type.lower(),
                logger=NodeRemovalController._logger,
            )
            service = "watcher-config"
            if ServiceManager.has_service(service, client=client):
                Toolbox.log(logger=NodeRemovalController._logger, messages="Removing service {0}".format(service))
                ServiceManager.stop_service(service, client=client)
                ServiceManager.remove_service(service, client=client)

            if config_store == "etcd":
                from ovs.extensions.db.etcd.installer import EtcdInstaller

                # Only tear down the Etcd cluster itself when the config store is not external
                if Configuration.get(key="/ovs/framework/external_config") is None:
                    Toolbox.log(logger=NodeRemovalController._logger, messages=" Removing Etcd cluster")
                    try:
                        EtcdInstaller.stop("config", client)
                        EtcdInstaller.remove("config", client)
                    except Exception as ex:
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages=["\nFailed to unconfigure Etcd", ex],
                            loglevel="exception",
                        )
                Toolbox.log(logger=NodeRemovalController._logger, messages="Removing Etcd proxy")
                EtcdInstaller.remove_proxy("config", client.ip)

        Toolbox.run_hooks(
            component="noderemoval",
            sub_component="remove",
            logger=NodeRemovalController._logger,
            cluster_ip=storage_router_to_remove.ip,
            complete_removal=remove_asd_manager,
        )

        # Clean up model
        Toolbox.log(logger=NodeRemovalController._logger, messages="Removing node from model")
        for service in storage_router_to_remove.services:
            service.delete()
        for disk in storage_router_to_remove.disks:
            for partition in disk.partitions:
                partition.delete()
            disk.delete()
        for j_domain in storage_router_to_remove.domains:
            j_domain.delete()
        Configuration.delete("/ovs/framework/hosts/{0}".format(storage_router_to_remove.machine_id))

        NodeTypeController.restart_framework_and_memcache_services(
            clients=ip_client_map,
            offline_node_ips=[node.ip for node in storage_routers_offline],
            logger=NodeRemovalController._logger,
        )

        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            if config_store == "arakoon":
                client.file_delete(filenames=[ArakoonConfiguration.CACC_LOCATION])
            client.file_delete(filenames=[Configuration.BOOTSTRAP_CONFIG_LOCATION])
        storage_router_to_remove.delete()
        Toolbox.log(logger=NodeRemovalController._logger, messages="Successfully removed node\n")
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=["An unexpected error occurred:", str(exception)],
            boxed=True,
            loglevel="exception",
        )
        sys.exit(1)
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages="This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.",
            boxed=True,
            loglevel="error",
        )
        sys.exit(1)

    # ASD manager removal happens last and outside the try: the node itself is already gone from the model
    if remove_asd_manager is True:
        Toolbox.log(logger=NodeRemovalController._logger, messages="\nRemoving ASD Manager")
        with remote(storage_router_to_remove.ip, [os]) as rem:
            rem.os.system("asd-manager remove --force-yes")
    Toolbox.log(logger=NodeRemovalController._logger, messages="Remove nodes finished", title=True)
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from a given version to the current version. It uses 'previous_version' to be smart
    wherever possible, but the code should be able to migrate any version towards the expected version.
    When this is not possible, the code can set a minimum version and raise when it is not met.
    :param previous_version: The previous version from which to start the migration
    :type previous_version: float
    :param master_ips: IP addresses of the MASTER nodes
    :type master_ips: list or None
    :param extra_ips: IP addresses of the EXTRA nodes
    :type extra_ips: list or None
    :return: The version reached: THIS_VERSION on success, THIS_VERSION - 1 when the migration code failed
    :rtype: int
    """
    _ = master_ips, extra_ips  # Accepted for interface compatibility, not used by this migrator
    working_version = previous_version

    # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
    if working_version < ExtensionMigrator.THIS_VERSION:
        try:
            # Imports are deferred to runtime: the model/configuration layers they pull in
            # may not be importable when this module is merely loaded
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.dal.lists.vpoollist import VPoolList
            from ovs.extensions.generic.configuration import Configuration
            from ovs.extensions.generic.sshclient import SSHClient
            from ovs.extensions.generic.system import System

            local_machine_id = System.get_my_machine_id()
            local_ip = Configuration.get('/ovs/framework/hosts/{0}/ip'.format(local_machine_id))
            local_client = SSHClient(endpoint=local_ip, username='******')

            # Multiple Proxies: the old single-proxy configuration directory is obsolete
            if local_client.dir_exists(directory='/opt/OpenvStorage/config/storagedriver/storagedriver'):
                local_client.dir_delete(directories=['/opt/OpenvStorage/config/storagedriver/storagedriver'])

            # MDS safety granularity on vPool level: copy the framework-wide MDS settings into a
            # per-vPool 'mds_config' key, then remove the global key
            mds_safety_key = '/ovs/framework/storagedriver'
            if Configuration.exists(key=mds_safety_key):
                current_mds_settings = Configuration.get(key=mds_safety_key)
                for vpool in VPoolList.get_vpools():
                    vpool_key = '/ovs/vpools/{0}'.format(vpool.guid)
                    if Configuration.dir_exists(key=vpool_key):
                        Configuration.set(key='{0}/mds_config'.format(vpool_key),
                                          value=current_mds_settings)
                Configuration.delete(key=mds_safety_key)

            # Introduction of edition key: derive the edition from the first StorageRouter that
            # exposes it (best effort per router)
            if Configuration.get(key=Configuration.EDITION_KEY, default=None) not in [PackageFactory.EDITION_COMMUNITY, PackageFactory.EDITION_ENTERPRISE]:
                for storagerouter in StorageRouterList.get_storagerouters():
                    try:
                        Configuration.set(key=Configuration.EDITION_KEY,
                                          value=storagerouter.features['alba']['edition'])
                        break
                    except Exception:  # Was a bare 'except:' - do not swallow SystemExit/KeyboardInterrupt
                        continue
        except Exception:  # Was a bare 'except:' - do not swallow SystemExit/KeyboardInterrupt
            ExtensionMigrator._logger.exception('Error occurred while executing the migration code')
            # Don't update migration version with latest version, resulting in next migration
            # trying again to execute this code
            return ExtensionMigrator.THIS_VERSION - 1

    return ExtensionMigrator.THIS_VERSION
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from any version to any version, running all migrations required.
    If previous_version is for example 0 and this script is at version 3 it will execute two steps:
      - 1 > 2
      - 2 > 3
    :param previous_version: The previous version from which to start the migration.
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    :return: The version reached after running all applicable migration steps
    """
    working_version = previous_version

    # Version 1 introduced:
    # - Flexible SSD layout
    if working_version < 1:
        try:
            from ovs.extensions.generic.configuration import Configuration
            if Configuration.exists('ovs.arakoon'):
                Configuration.delete('ovs.arakoon', remove_root=True)
            Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        except Exception:  # Was a bare 'except:' - do not swallow SystemExit/KeyboardInterrupt
            logger.exception('Error migrating to version 1')
        working_version = 1

    # Version 2 introduced:
    # - Registration
    if working_version < 2:
        try:
            import time
            from ovs.extensions.generic.configuration import Configuration
            if not Configuration.exists('ovs.core.registered'):
                Configuration.set('ovs.core.registered', False)
                Configuration.set('ovs.core.install_time', time.time())
        except Exception:
            logger.exception('Error migrating to version 2')
        working_version = 2

    # Version 3 introduced:
    # - New arakoon clients
    if working_version < 3:
        try:
            # Reload so the package module picks up on-disk changes made by the upgrade
            from ovs.extensions.db.arakoon import ArakoonInstaller
            reload(ArakoonInstaller)
            from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller
            from ovs.extensions.generic.sshclient import SSHClient
            from ovs.extensions.generic.configuration import Configuration
            if master_ips is not None:
                for ip in master_ips:
                    client = SSHClient(ip)
                    if client.dir_exists(ArakoonInstaller.ARAKOON_CONFIG_DIR):
                        for cluster_name in client.dir_list(ArakoonInstaller.ARAKOON_CONFIG_DIR):
                            try:
                                ArakoonInstaller.deploy_cluster(cluster_name, ip)
                            except Exception:  # Best effort per cluster; was a bare 'except:'
                                pass
            if Configuration.exists('ovs.core.storage.persistent'):
                Configuration.set('ovs.core.storage.persistent', 'pyrakoon')
        except Exception:
            logger.exception('Error migrating to version 3')
        working_version = 3

    # Version 4 introduced:
    # - Etcd
    if working_version < 4:
        try:
            import os
            import json
            from ConfigParser import RawConfigParser
            from ovs.extensions.db.etcd import installer
            reload(installer)
            from ovs.extensions.db.etcd.installer import EtcdInstaller
            from ovs.extensions.db.etcd.configuration import EtcdConfiguration
            from ovs.extensions.generic.system import System
            host_id = System.get_my_machine_id()
            etcd_migrate = False
            if EtcdInstaller.has_cluster('127.0.0.1', 'config'):
                # Local node already runs the 'config' cluster: only the file migration remains
                etcd_migrate = True
            else:
                # Join an existing 'config' cluster if one is reachable, otherwise bootstrap one
                if master_ips is not None and extra_ips is not None:
                    cluster_ip = None
                    for ip in master_ips + extra_ips:
                        if EtcdInstaller.has_cluster(ip, 'config'):
                            cluster_ip = ip
                            break
                    node_ip = None
                    path = '/opt/OpenvStorage/config/ovs.json'
                    if os.path.exists(path):
                        with open(path) as config_file:
                            config = json.load(config_file)
                            node_ip = config['grid']['ip']
                    if node_ip is not None:
                        if cluster_ip is None:
                            EtcdInstaller.create_cluster('config', node_ip)
                            EtcdConfiguration.initialize()
                            EtcdConfiguration.initialize_host(host_id)
                        else:
                            EtcdInstaller.extend_cluster(cluster_ip, node_ip, 'config')
                            EtcdConfiguration.initialize_host(host_id)
                        etcd_migrate = True
            if etcd_migrate is True:
                # Migrating configuration files
                path = '/opt/OpenvStorage/config/ovs.json'
                if os.path.exists(path):
                    with open(path) as config_file:
                        config = json.load(config_file)
                        EtcdConfiguration.set('/ovs/framework/cluster_id', config['support']['cid'])
                        # Keep the earliest known install time when one was already recorded
                        if not EtcdConfiguration.exists('/ovs/framework/install_time'):
                            EtcdConfiguration.set('/ovs/framework/install_time', config['core']['install_time'])
                        else:
                            EtcdConfiguration.set('/ovs/framework/install_time', min(EtcdConfiguration.get('/ovs/framework/install_time'), config['core']['install_time']))
                        EtcdConfiguration.set('/ovs/framework/registered', config['core']['registered'])
                        EtcdConfiguration.set('/ovs/framework/plugins/installed', config['plugins'])
                        EtcdConfiguration.set('/ovs/framework/stores', config['core']['storage'])
                        EtcdConfiguration.set('/ovs/framework/paths', {'cfgdir': config['core']['cfgdir'],
                                                                       'basedir': config['core']['basedir'],
                                                                       'ovsdb': config['core']['ovsdb']})
                        EtcdConfiguration.set('/ovs/framework/support', {'enablesupport': config['support']['enablesupport'],
                                                                         'enabled': config['support']['enabled'],
                                                                         'interval': config['support']['interval']})
                        EtcdConfiguration.set('/ovs/framework/storagedriver', {'mds_safety': config['storagedriver']['mds']['safety'],
                                                                               'mds_tlogs': config['storagedriver']['mds']['tlogs'],
                                                                               'mds_maxload': config['storagedriver']['mds']['maxload']})
                        EtcdConfiguration.set('/ovs/framework/webapps', {'html_endpoint': config['webapps']['html_endpoint'],
                                                                         'oauth2': config['webapps']['oauth2']})
                        EtcdConfiguration.set('/ovs/framework/messagequeue', {'endpoints': [],
                                                                              'protocol': config['core']['broker']['protocol'],
                                                                              'user': config['core']['broker']['login'],
                                                                              'port': config['core']['broker']['port'],
                                                                              'password': config['core']['broker']['password'],
                                                                              'queues': config['core']['broker']['queues']})
                        # Host-specific keys live under /ovs/framework/hosts/<host_id>/...
                        host_key = '/ovs/framework/hosts/{0}{{0}}'.format(host_id)
                        EtcdConfiguration.set(host_key.format('/storagedriver'), {'rsp': config['storagedriver']['rsp'],
                                                                                  'vmware_mode': config['storagedriver']['vmware_mode']})
                        EtcdConfiguration.set(host_key.format('/ports'), config['ports'])
                        EtcdConfiguration.set(host_key.format('/setupcompleted'), config['core']['setupcompleted'])
                        EtcdConfiguration.set(host_key.format('/versions'), config['core'].get('versions', {}))
                        EtcdConfiguration.set(host_key.format('/type'), config['core']['nodetype'])
                        EtcdConfiguration.set(host_key.format('/ip'), config['grid']['ip'])
                # Move the memcache client configuration into Etcd and remove the file
                path = '{0}/memcacheclient.cfg'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir'))
                if os.path.exists(path):
                    config = RawConfigParser()
                    config.read(path)
                    nodes = [config.get(node.strip(), 'location').strip() for node in config.get('main', 'nodes').split(',')]
                    EtcdConfiguration.set('/ovs/framework/memcache|endpoints', nodes)
                    os.remove(path)
                # Move the rabbitmq client configuration into Etcd and remove the file
                path = '{0}/rabbitmqclient.cfg'.format(EtcdConfiguration.get('/ovs/framework/paths|cfgdir'))
                if os.path.exists(path):
                    config = RawConfigParser()
                    config.read(path)
                    nodes = [config.get(node.strip(), 'location').strip() for node in config.get('main', 'nodes').split(',')]
                    EtcdConfiguration.set('/ovs/framework/messagequeue|endpoints', nodes)
                    os.remove(path)
                # Migrate arakoon configuration files
                from ovs.extensions.db.arakoon import ArakoonInstaller
                reload(ArakoonInstaller)
                from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller, ArakoonClusterConfig
                from ovs.extensions.generic.sshclient import SSHClient
                if master_ips is not None:
                    config_dir = '/opt/OpenvStorage/config/arakoon/'
                    for ip in master_ips:
                        client = SSHClient(ip)
                        if client.dir_exists(config_dir):
                            for cluster_name in client.dir_list(config_dir):
                                try:
                                    with open('{0}/{1}/{1}.cfg'.format(config_dir, cluster_name)) as config_file:
                                        EtcdConfiguration.set(ArakoonClusterConfig.ETCD_CONFIG_KEY.format(cluster_name), config_file.read(), raw=True)
                                        ArakoonInstaller.deploy_cluster(cluster_name, ip)
                                except Exception:  # Was a bare 'except:' - do not swallow SystemExit/KeyboardInterrupt
                                    logger.exception('Error migrating {0} on {1}'.format(cluster_name, ip))
                            client.dir_delete(config_dir)
        except Exception:
            logger.exception('Error migrating to version 4')
        working_version = 4

    return working_version
def migrate(previous_version, master_ips=None, extra_ips=None):
    """
    Migrates from any version to any version, running all migrations required.
    If previous_version is for example 0 and this script is at version 3 it will execute two steps:
      - 1 > 2
      - 2 > 3
    :param previous_version: The previous version from which to start the migration.
    :param master_ips: IP addresses of the MASTER nodes
    :param extra_ips: IP addresses of the EXTRA nodes
    :return: The version reached after running all applicable migration steps
    """
    working_version = previous_version

    # Version 1 introduced:
    # - Flexible SSD layout
    if working_version < 1:
        from ovs.extensions.generic.configuration import Configuration
        if Configuration.exists('ovs.arakoon'):
            Configuration.delete('ovs.arakoon', remove_root=True)
        Configuration.set('ovs.core.ovsdb', '/opt/OpenvStorage/db')
        working_version = 1

    # Version 2 introduced:
    # - Registration
    if working_version < 2:
        import time
        from ovs.extensions.generic.configuration import Configuration
        if not Configuration.exists('ovs.core.registered'):
            Configuration.set('ovs.core.registered', False)
            Configuration.set('ovs.core.install_time', time.time())
        working_version = 2

    # Version 3 introduced:
    # - New arakoon clients
    if working_version < 3:
        # Reload so the package module picks up on-disk changes made by the upgrade
        # (the original imported the module twice back-to-back; once is enough)
        from ovs.extensions.db.arakoon import ArakoonInstaller
        reload(ArakoonInstaller)
        from ovs.extensions.generic.sshclient import SSHClient
        from ovs.extensions.generic.configuration import Configuration
        if master_ips is not None:
            for ip in master_ips:
                client = SSHClient(ip)
                if client.dir_exists(ArakoonInstaller.ArakoonInstaller.ARAKOON_CONFIG_DIR):
                    for cluster_name in client.dir_list(ArakoonInstaller.ArakoonInstaller.ARAKOON_CONFIG_DIR):
                        try:
                            ArakoonInstaller.ArakoonInstaller.deploy_cluster(cluster_name, ip)
                        except Exception:  # Best effort per cluster; was a bare 'except:'
                            pass
        if Configuration.exists('ovs.core.storage.persistent'):
            Configuration.set('ovs.core.storage.persistent', 'pyrakoon')
        working_version = 3

    return working_version
def _deploy_stack_and_scrub(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info.
    Deploys (or re-uses) an ALBA proxy on the chosen StorageRouter, fans the queue out over
    worker threads, then tears the proxy down again.
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information:
                       `scrub_path` with the path where to scrub
                       `storage_router` with the StorageRouter that needs to do the work
                       `partition_guid` with the partition on which the scrub work runs
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (by reference)
    :type error_messages: list
    :return: None
    :rtype: NoneType
    """
    # A vPool without a StorageDriver cannot be scrubbed at all
    if len(vpool.storagedrivers) == 0 or not vpool.storagedrivers[0].storagedriver_id:
        error_messages.append('vPool {0} does not have any valid StorageDrivers configured'.format(vpool.name))
        return

    service_manager = ServiceFactory.get_manager()
    client = None
    lock_time = 5 * 60  # Max wait (seconds) for the machine-wide proxy deploy/remove mutex
    storagerouter = scrub_info['storage_router']
    partition_guid = scrub_info['partition_guid']
    # Service/directory/config-key names include vPool and partition so concurrent scrub
    # stacks on the same StorageRouter do not collide
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_{2}_scrub'.format(vpool.name, storagerouter.name, partition_guid)
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, partition_guid)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, partition_guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, partition_guid)

    # Deploy a proxy
    try:
        # Serialize proxy deployment/removal across scrub jobs
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
            if service_manager.has_service(name=alba_proxy_service, client=client) is True and service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                # Proxy from a previous run is still up: re-use it and its stored configuration
                GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                # Port selection is guarded per StorageRouter so two scrub stacks cannot claim the same port
                with volatile_mutex('deploy_proxy_for_scrub_{0}'.format(storagerouter.guid), wait=30):
                    port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Start from the vPool's generic scrub proxy template and fill in the chosen port
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path(alba_proxy_service),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                service_manager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                service_manager.start_service(name=alba_proxy_service, client=client)
                GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point the backend connection configuration at the local proxy just deployed.
            # A 'MULTI' backend nests one connection dict per backend, hence the two shapes.
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            if backend_config.get('backend_type') != 'MULTI':
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
            else:
                for value in backend_config.itervalues():
                    if isinstance(value, dict):
                        value['alba_connection_host'] = '127.0.0.1'
                        value['alba_connection_port'] = scrub_config['port']
            # Copy backend connection manager information in separate key
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        GenericController._logger.exception(message)
        # Best-effort cleanup of a half-deployed proxy and its configuration
        if client is not None and service_manager.has_service(name=alba_proxy_service, client=client) is True:
            if service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                service_manager.stop_service(name=alba_proxy_service, client=client)
            service_manager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)
        # NOTE(review): deployment failure does not abort; the scrub threads below are still
        # spawned - confirm this is intended

    # Execute the actual scrubbing
    threads = []
    threads_key = '/ovs/framework/hosts/{0}/config|scrub_stack_threads'.format(storagerouter.machine_id)
    amount_threads = Configuration.get(key=threads_key) if Configuration.exists(key=threads_key) else 2
    if not isinstance(amount_threads, int):
        error_messages.append('Amount of threads to spawn must be an integer for StorageRouter with ID {0}'.format(storagerouter.machine_id))
        return

    amount_threads = max(amount_threads, 1)  # Make sure amount_threads is at least 1
    amount_threads = min(min(queue.qsize(), amount_threads), 20)  # Make sure amount threads is max 20
    GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Spawning {2} threads for proxy service {3}'.format(vpool.name, storagerouter.name, amount_threads, alba_proxy_service))
    for index in range(amount_threads):
        # Workers pull guids from the shared queue and append failures to error_messages
        thread = Thread(name='execute_scrub_{0}_{1}_{2}'.format(vpool.guid, partition_guid, index),
                        target=GenericController._execute_scrub,
                        args=(queue, vpool, scrub_info, scrub_directory, error_messages))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if service_manager.has_service(alba_proxy_service, client=client):
                service_manager.stop_service(alba_proxy_service, client=client)
                service_manager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        GenericController._logger.exception(message)
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info.
    Deploys (or re-uses) an ALBA proxy on the chosen StorageRouter, drains the queue of vDisk
    guids inline (no worker threads in this variant), then tears the proxy down again.
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter
                       that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (by reference)
    :type error_messages: list
    :return: a list of error messages
    :rtype: list
    """
    def _verify_mds_config(current_vdisk):
        # Refresh the cached 'info' dynamic so the MDS backend config reflects current state
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60  # Max wait (seconds) for the machine-wide proxy deploy/remove mutex
    storagerouter = scrub_info['storage_router']
    # Names are keyed per vPool + StorageRouter (not per partition, unlike the newer stack variant)
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy
    try:
        # Serialize proxy deployment/removal across scrub jobs
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
            # NOTE(review): 'is True' presumes get_service_status returns a bool in this
            # ServiceManager API - confirm against its implementation
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Scrub config
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}

                # Backend config
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point the backend connection configuration at the local proxy just deployed
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        # Best-effort cleanup of a half-deployed proxy and its configuration
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                vdisk_guid = queue.get(False)  # Non-blocking: raises Empty when drained
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_directory,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    # Per-vDisk failures are recorded but do not stop the queue drain
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)
    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed.
    This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...
    * Successfully finishing a piece of migration code, should create an entry in /ovs/framework/migration in case it should not be executed again
    *     Eg: /ovs/framework/migration|stats_monkey_integration: True

    Each numbered section below is guarded so it is effectively idempotent: either by probing the current
    on-disk/config state before acting, or by a one-shot flag under /ovs/framework/migration.
    NOTE(review): this module is Python 2 only (uses `iteritems`/`iterkeys` and a `print` statement).
    """
    MigrationController._logger.info('Preparing out of band migrations...')

    # Imports are done lazily here: this runs during (post-)update flows where not all services are up yet.
    from ovs.dal.lists.servicetypelist import ServiceTypeList
    from ovs.dal.lists.storagedriverlist import StorageDriverList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.vpoollist import VPoolList
    from ovs.extensions.db.arakooninstaller import ArakoonInstaller
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient
    from ovs_extensions.generic.toolbox import ExtensionsToolbox
    from ovs.extensions.migration.migration.ovsmigrator import ExtensionMigrator
    from ovs.extensions.packages.packagefactory import PackageFactory
    from ovs_extensions.services.interfaces.systemd import Systemd
    from ovs.extensions.services.servicefactory import ServiceFactory
    from ovs.extensions.storageserver.storagedriver import StorageDriverConfiguration
    from ovs.lib.helpers.storagedriver.installer import StorageDriverInstaller

    MigrationController._logger.info('Start out of band migrations...')
    service_manager = ServiceFactory.get_manager()

    # Build one root SSH client per StorageRouter up front; every section below reuses this map.
    sr_client_map = {}
    for storagerouter in StorageRouterList.get_storagerouters():
        sr_client_map[storagerouter.guid] = SSHClient(endpoint=storagerouter.ip,  # Is triggered during post-update code too during which the ovs-watcher-framework service is still down and thus not refreshing the heartbeat --> use IP i/o StorageRouter
                                                      username='******')  # NOTE(review): username looks redacted in this copy — verify against the repository original

    #########################################################
    # Addition of 'ExecReload' for AlbaProxy SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagedriver in StorageDriverList.get_storagedrivers():
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                service = alba_proxy.service
                service_name = 'ovs-{0}'.format(service.name)
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                # Unit file already contains an ExecReload line --> nothing to do for this proxy
                if 'ExecReload=' in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        # Reload systemd once per host instead of once per regenerated unit
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    ##################################################################
    # Adjustment of open file descriptors for Arakoon services to 8192
    changed_clients = set()
    for storagerouter in StorageRouterList.get_storagerouters():
        root_client = sr_client_map[storagerouter.guid]
        for service_name in service_manager.list_services(client=root_client):
            if not service_name.startswith('ovs-arakoon-'):
                continue
            # The file to inspect and the marker to look for differ between systemd and upstart
            if ServiceFactory.get_service_type() == 'systemd':
                path = '/lib/systemd/system/{0}.service'.format(service_name)
                check = 'LimitNOFILE=8192'
            else:
                path = '/etc/init/{0}.conf'.format(service_name)
                check = 'limit nofile 8192 8192'
            if not root_client.file_exists(path):
                continue
            if check in root_client.file_read(path):
                continue
            try:
                service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                changed_clients.add(root_client)
                ExtensionsToolbox.edit_version_file(client=root_client, package_name='arakoon', old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
            except:
                MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
    for root_client in changed_clients:
        root_client.run(['systemctl', 'daemon-reload'])

    #############################
    # Migrate to multiple proxies
    for storagedriver in StorageDriverList.get_storagedrivers():
        vpool = storagedriver.vpool
        root_client = sr_client_map[storagedriver.storagerouter_guid]
        for alba_proxy in storagedriver.alba_proxies:
            # Rename alba_proxy service in model: 'albaproxy_<vpool>' becomes 'albaproxy_<vpool>_0'
            service = alba_proxy.service
            old_service_name = 'albaproxy_{0}'.format(vpool.name)
            new_service_name = 'albaproxy_{0}_0'.format(vpool.name)
            if old_service_name != service.name:
                continue
            service.name = new_service_name
            service.save()

            if not service_manager.has_service(name=old_service_name, client=root_client):
                continue
            old_configuration_key = '/ovs/framework/hosts/{0}/services/{1}'.format(storagedriver.storagerouter.machine_id, old_service_name)
            if not Configuration.exists(key=old_configuration_key):
                continue

            # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='alba',
                                                old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, old_service_name),
                                                new_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, new_service_name))

            # Register new service and remove old service
            service_manager.add_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                        client=root_client,
                                        params=Configuration.get(old_configuration_key),
                                        target_name='ovs-{0}'.format(new_service_name))

            # Update scrub proxy config
            proxy_config_key = '/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)
            proxy_config = None if Configuration.exists(key=proxy_config_key) is False else Configuration.get(proxy_config_key)
            if proxy_config is not None:
                fragment_cache = proxy_config.get(StorageDriverConfiguration.CACHE_FRAGMENT, ['none', {}])
                if fragment_cache[0] == 'alba' and fragment_cache[1].get('cache_on_write') is True:  # Accelerated ALBA configured
                    # Scrub proxy must not read-cache: copy the fragment cache config with cache_on_read disabled
                    fragment_cache_scrub_info = copy.deepcopy(fragment_cache)
                    fragment_cache_scrub_info[1]['cache_on_read'] = False
                    proxy_scrub_config_key = '/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid)
                    proxy_scrub_config = None if Configuration.exists(key=proxy_scrub_config_key) is False else Configuration.get(proxy_scrub_config_key)
                    if proxy_scrub_config is not None and proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] == ['none']:
                        proxy_scrub_config[StorageDriverConfiguration.CACHE_FRAGMENT] = fragment_cache_scrub_info
                        Configuration.set(key=proxy_scrub_config_key, value=proxy_scrub_config)

        # Update 'backend_connection_manager' section
        changes = False
        storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
        if 'backend_connection_manager' not in storagedriver_config.configuration:
            continue

        current_config = storagedriver_config.configuration['backend_connection_manager']
        if current_config.get('backend_type') != 'MULTI':
            # Convert the flat single-proxy config to the MULTI layout: one numbered sub-section per proxy
            changes = True
            backend_connection_manager = {'backend_type': 'MULTI'}
            for index, proxy in enumerate(sorted(storagedriver.alba_proxies, key=lambda pr: pr.service.ports[0])):
                backend_connection_manager[str(index)] = copy.deepcopy(current_config)
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_use_rora'] = True
                # noinspection PyUnresolvedReferences
                backend_connection_manager[str(index)]['alba_connection_rora_manifest_cache_capacity'] = 5000
                # Move every 'backend_interface*' key from the per-proxy sub-section to the top level
                # noinspection PyUnresolvedReferences
                for key, value in backend_connection_manager[str(index)].items():
                    if key.startswith('backend_interface'):
                        backend_connection_manager[key] = value
                        # noinspection PyUnresolvedReferences
                        del backend_connection_manager[str(index)][key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    backend_connection_manager[key] = value
        else:
            # Already MULTI: only hoist stray 'backend_interface*' keys and add missing retry defaults
            backend_connection_manager = current_config
            for value in backend_connection_manager.values():
                if isinstance(value, dict):
                    for key, val in value.items():
                        if key.startswith('backend_interface'):
                            backend_connection_manager[key] = val
                            changes = True
                            del value[key]
            for key, value in {'backend_interface_retries_on_error': 5,
                               'backend_interface_retry_interval_secs': 1,
                               'backend_interface_retry_backoff_multiplier': 2.0}.iteritems():
                if key not in backend_connection_manager:
                    changes = True
                    backend_connection_manager[key] = value

        if changes is True:
            storagedriver_config.clear_backend_connection_manager()
            storagedriver_config.configure_backend_connection_manager(**backend_connection_manager)
            storagedriver_config.save(root_client)

            # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
            ExtensionsToolbox.edit_version_file(client=root_client,
                                                package_name='volumedriver',
                                                old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
            if service_manager.__class__ == Systemd:
                root_client.run(['systemctl', 'daemon-reload'])

    ########################################
    # Update metadata_store_bits information
    vpools = VPoolList.get_vpools()
    for vpool in vpools:
        bits = None
        for storagedriver in vpool.storagedrivers:
            key = '/ovs/framework/hosts/{0}/services/volumedriver_{1}'.format(storagedriver.storagerouter.machine_id, vpool.name)
            if Configuration.exists(key=key) and 'METADATASTORE_BITS' not in Configuration.get(key=key):
                # Extract the value once from the first service file encountered; reuse it for the other StorageDrivers
                if bits is None:
                    entries = service_manager.extract_from_service_file(name='ovs-volumedriver_{0}'.format(vpool.name),
                                                                        client=sr_client_map[storagedriver.storagerouter_guid],
                                                                        entries=['METADATASTORE_BITS='])
                    if len(entries) == 1:
                        bits = entries[0].split('=')[-1]
                        bits = int(bits) if bits.isdigit() else 5  # Fall back to 5 when the service file holds a non-numeric value
                if bits is not None:
                    try:
                        content = Configuration.get(key=key)
                        content['METADATASTORE_BITS'] = bits
                        Configuration.set(key=key, value=content)
                    except:
                        MigrationController._logger.exception('Error updating volumedriver info for vPool {0} on StorageRouter {1}'.format(vpool.name, storagedriver.storagerouter.name))
        if bits is not None:
            vpool.metadata_store_bits = bits
            vpool.save()

    #####################################
    # Update the vPool metadata structure
    def _update_metadata_structure(metadata):
        """
        Convert the old flat vPool metadata layout to the new one: arakoon/connection info moves under
        'backend_info' and all per-StorageRouter cache settings move under a new top-level 'caching_info' key.
        Works on (and returns) a deep copy; the caller decides whether to persist it.
        """
        metadata = copy.deepcopy(metadata)
        cache_structure = {'read': False,
                          'write': False,
                          'is_backend': False,
                          'quota': None,
                          'backend_info': {'name': None,  # Will be filled in when is_backend is true
                                           'backend_guid': None,
                                           'alba_backend_guid': None,
                                           'policies': None,
                                           'preset': None,
                                           'arakoon_config': None,
                                           'connection_info': {'client_id': None,
                                                               'client_secret': None,
                                                               'host': None,
                                                               'port': None,
                                                               'local': None}}
                          }
        # Maps new structure keys to the old flat metadata keys, per cache type
        structure_map = {StorageDriverConfiguration.CACHE_BLOCK: {'read': 'block_cache_on_read',
                                                                  'write': 'block_cache_on_write',
                                                                  'quota': 'quota_bc',
                                                                  'backend_prefix': 'backend_bc_{0}'},
                         StorageDriverConfiguration.CACHE_FRAGMENT: {'read': 'fragment_cache_on_read',
                                                                     'write': 'fragment_cache_on_write',
                                                                     'quota': 'quota_fc',
                                                                     'backend_prefix': 'backend_aa_{0}'}}
        if 'arakoon_config' in metadata['backend']:  # Arakoon config should be placed under the backend info
            metadata['backend']['backend_info']['arakoon_config'] = metadata['backend'].pop('arakoon_config')
        if 'connection_info' in metadata['backend']:  # Connection info should be placed under the backend info
            metadata['backend']['backend_info']['connection_info'] = metadata['backend'].pop('connection_info')
        if 'caching_info' not in metadata:  # Caching info is the new key
            would_be_caching_info = {}
            metadata['caching_info'] = would_be_caching_info
            # Extract all caching data for every storagerouter
            current_caching_info = metadata['backend'].pop('caching_info')  # Pop to mutate metadata
            for storagerouter_guid in current_caching_info.iterkeys():
                current_cache_data = current_caching_info[storagerouter_guid]
                storagerouter_caching_info = {}
                would_be_caching_info[storagerouter_guid] = storagerouter_caching_info
                for cache_type, cache_type_mapping in structure_map.iteritems():
                    new_cache_structure = copy.deepcopy(cache_structure)
                    storagerouter_caching_info[cache_type] = new_cache_structure
                    for new_structure_key, old_structure_key in cache_type_mapping.iteritems():
                        if new_structure_key == 'backend_prefix':
                            # Get possible backend related info
                            metadata_key = old_structure_key.format(storagerouter_guid)
                            if metadata_key not in metadata:
                                continue
                            backend_data = metadata.pop(metadata_key)  # Pop to mutate metadata
                            new_cache_structure['is_backend'] = True
                            # Copy over the old data
                            new_cache_structure['backend_info']['arakoon_config'] = backend_data['arakoon_config']
                            new_cache_structure['backend_info'].update(backend_data['backend_info'])
                            new_cache_structure['backend_info']['connection_info'].update(backend_data['connection_info'])
                        else:
                            new_cache_structure[new_structure_key] = current_cache_data.get(old_structure_key)
        return metadata

    vpools = VPoolList.get_vpools()
    for vpool in vpools:
        try:
            new_metadata = _update_metadata_structure(vpool.metadata)
            vpool.metadata = new_metadata
            vpool.save()
        except KeyError:
            MigrationController._logger.exception('Exceptions occurred when updating the metadata for vPool {0}'.format(vpool.name))

    ##############################################
    # Always use indent=4 during Configuration set
    def _resave_all_config_entries(config_path='/ovs'):
        """
        Recursive functions which checks every config management key if its a directory or not.
        If not a directory, we retrieve the config and just save it again using the new indentation logic
        """
        for item in Configuration.list(config_path):
            new_path = config_path + '/' + item
            print new_path  # NOTE(review): leftover debug print (Python 2 statement) — consider removing or replacing with a logger call
            if Configuration.dir_exists(new_path) is True:
                _resave_all_config_entries(config_path=new_path)
            else:
                try:
                    _config = Configuration.get(new_path)
                    Configuration.set(new_path, _config)
                except:
                    # Entry is not valid JSON --> re-save it as a raw value instead
                    _config = Configuration.get(new_path, raw=True)
                    Configuration.set(new_path, _config, raw=True)
    if ExtensionMigrator.THIS_VERSION <= 13:  # There is no way of checking whether this new indentation logic has been applied, so we only perform this for version 13 and lower
        MigrationController._logger.info('Re-saving every configuration setting with new indentation rules')
        _resave_all_config_entries()

    ############################
    # Update some default values
    def _update_manifest_cache_size(_proxy_config_key):
        """
        Pin 'manifest_cache_size' to 500 MiB in the given proxy config (both top-level and per ALBA
        cache-type section). Returns True when the config was changed and re-saved, False otherwise.
        """
        updated = False
        manifest_cache_size = 500 * 1024 * 1024
        if Configuration.exists(key=_proxy_config_key):
            _proxy_config = Configuration.get(key=_proxy_config_key)
            for cache_type in [StorageDriverConfiguration.CACHE_BLOCK, StorageDriverConfiguration.CACHE_FRAGMENT]:
                if cache_type in _proxy_config and _proxy_config[cache_type][0] == 'alba':
                    if _proxy_config[cache_type][1]['manifest_cache_size'] != manifest_cache_size:
                        updated = True
                        _proxy_config[cache_type][1]['manifest_cache_size'] = manifest_cache_size
            if _proxy_config['manifest_cache_size'] != manifest_cache_size:
                updated = True
                _proxy_config['manifest_cache_size'] = manifest_cache_size
            if updated is True:
                Configuration.set(key=_proxy_config_key, value=_proxy_config)
        return updated

    for storagedriver in StorageDriverList.get_storagedrivers():
        try:
            vpool = storagedriver.vpool
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            _update_manifest_cache_size('/ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))  # Generic scrub proxy is deployed every time scrubbing kicks in, so no need to restart these services
            for alba_proxy in storagedriver.alba_proxies:
                if _update_manifest_cache_size('/ovs/vpools/{0}/proxies/{1}/config/main'.format(vpool.guid, alba_proxy.guid)) is True:
                    # Add '-reboot' to alba_proxy services (because of newly created services and removal of old service)
                    ExtensionsToolbox.edit_version_file(client=root_client,
                                                        package_name='alba',
                                                        old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, alba_proxy.service.name))

            # Update 'backend_connection_manager' section
            changes = False
            storagedriver_config = StorageDriverConfiguration(vpool.guid, storagedriver.storagedriver_id)
            if 'backend_connection_manager' not in storagedriver_config.configuration:
                continue

            current_config = storagedriver_config.configuration['backend_connection_manager']
            for key, value in current_config.iteritems():
                if key.isdigit() is True:  # Numbered keys are the per-proxy sub-sections of a MULTI config
                    if value.get('alba_connection_asd_connection_pool_capacity') != 10:
                        changes = True
                        value['alba_connection_asd_connection_pool_capacity'] = 10
                    if value.get('alba_connection_timeout') != 30:
                        changes = True
                        value['alba_connection_timeout'] = 30
                    if value.get('alba_connection_rora_manifest_cache_capacity') != 25000:
                        changes = True
                        value['alba_connection_rora_manifest_cache_capacity'] = 25000

            if changes is True:
                storagedriver_config.clear_backend_connection_manager()
                storagedriver_config.configure_backend_connection_manager(**current_config)
                storagedriver_config.save(root_client)

                # Add '-reboot' to volumedriver services (because of updated 'backend_connection_manager' section)
                ExtensionsToolbox.edit_version_file(client=root_client,
                                                    package_name='volumedriver',
                                                    old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, 'volumedriver_{0}'.format(vpool.name)))
        except Exception:
            MigrationController._logger.exception('Updating default configuration values failed for StorageDriver {0}'.format(storagedriver.storagedriver_id))

    ####################################################
    # Adding proxy fail fast as env variable for proxies
    changed_clients = set()
    for storagerouter in StorageRouterList.get_storagerouters():
        root_client = sr_client_map[storagerouter.guid]
        for service_name in service_manager.list_services(client=root_client):
            if not service_name.startswith('ovs-albaproxy_'):
                continue
            # Marker differs between systemd unit files and upstart configs
            if ServiceFactory.get_service_type() == 'systemd':
                path = '/lib/systemd/system/{0}.service'.format(service_name)
                check = 'Environment=ALBA_FAIL_FAST=true'
            else:
                path = '/etc/init/{0}.conf'.format(service_name)
                check = 'env ALBA_FAIL_FAST=true'
            if not root_client.file_exists(path):
                continue
            if check in root_client.file_read(path):
                continue
            try:
                service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY, client=root_client, target_name=service_name)
                changed_clients.add(root_client)
                ExtensionsToolbox.edit_version_file(client=root_client, package_name='alba', old_run_file='{0}/{1}.version'.format(ServiceFactory.RUN_FILE_DIR, service_name))
            except:
                MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
    for root_client in changed_clients:
        root_client.run(['systemctl', 'daemon-reload'])

    ######################################
    # Integration of stats monkey (2.10.2)
    if Configuration.get(key='/ovs/framework/migration|stats_monkey_integration', default=False) is False:
        try:
            # Get content of old key into new key
            old_stats_monkey_key = '/statsmonkey/statsmonkey'
            if Configuration.exists(key=old_stats_monkey_key) is True:
                Configuration.set(key='/ovs/framework/monitoring/stats_monkey', value=Configuration.get(key=old_stats_monkey_key))
                Configuration.delete(key=old_stats_monkey_key)

            # Make sure to disable the stats monkey by default or take over the current schedule if it was configured manually before
            celery_key = '/ovs/framework/scheduling/celery'
            current_value = None
            scheduling_config = Configuration.get(key=celery_key, default={})
            if 'statsmonkey.run_all_stats' in scheduling_config:  # Old celery task name of the stats monkey
                current_value = scheduling_config.pop('statsmonkey.run_all_stats')
            # A value of None disables the tasks; a carried-over schedule keeps the manual configuration
            scheduling_config['ovs.stats_monkey.run_all'] = current_value
            scheduling_config['alba.stats_monkey.run_all'] = current_value
            Configuration.set(key=celery_key, value=scheduling_config)

            # Rename support-config keys to their new names
            support_key = '/ovs/framework/support'
            support_config = Configuration.get(key=support_key)
            support_config['support_agent'] = support_config.pop('enabled', True)
            support_config['remote_access'] = support_config.pop('enablesupport', False)
            Configuration.set(key=support_key, value=support_config)

            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|stats_monkey_integration', value=True)
        except Exception:
            MigrationController._logger.exception('Integration of stats monkey failed')

    ######################################################
    # Write away cluster ID to a file for back-up purposes
    try:
        cluster_id = Configuration.get(key='/ovs/framework/cluster_id', default=None)
        with open(Configuration.CONFIG_STORE_LOCATION, 'r') as config_file:
            config = json.load(config_file)
        if cluster_id is not None and config.get('cluster_id', None) is None:
            config['cluster_id'] = cluster_id
            with open(Configuration.CONFIG_STORE_LOCATION, 'w') as config_file:
                json.dump(config, config_file, indent=4)
    except Exception:
        MigrationController._logger.exception('Writing cluster id to a file failed.')

    #########################################################
    # Additional string formatting in Arakoon services (2.11)
    try:
        if Configuration.get(key='/ovs/framework/migration|arakoon_service_update', default=False) is False:
            arakoon_service_names = [ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name) for cluster_name in Configuration.list(key='ovs/arakoon')]
            for storagerouter in StorageRouterList.get_masters():
                for service_name in arakoon_service_names:
                    config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                    if Configuration.exists(key=config_key):
                        config = Configuration.get(key=config_key)
                        config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                        config['ARAKOON_PKG_NAME'] = PackageFactory.PKG_ARAKOON
                        config['ARAKOON_VERSION_CMD'] = PackageFactory.VERSION_CMD_ARAKOON
                        Configuration.set(key=config_key, value=config)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|arakoon_service_update', value=True)
    except Exception:
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    ############################################################
    # Additional string formatting in ALBA proxy services (2.11)
    # NOTE: this 'changed_clients' set is shared with the DTL/VOLDRV and version-file sections below and
    # only flushed (daemon-reload) after the version-file migration.
    changed_clients = set()
    try:
        if Configuration.get(key='/ovs/framework/migration|alba_proxy_service_update', default=False) is False:
            alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
            for service in ServiceTypeList.get_by_name('AlbaProxy').services:
                root_client = sr_client_map[service.storagerouter_guid]
                config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(service.storagerouter.machine_id, service.name)
                if Configuration.exists(key=config_key):
                    config = Configuration.get(key=config_key)
                    config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                    config['ALBA_PKG_NAME'] = alba_pkg_name
                    config['ALBA_VERSION_CMD'] = alba_version_cmd
                    Configuration.set(key=config_key, value=config)
                    service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_PROXY,
                                                       client=root_client,
                                                       target_name='ovs-{0}'.format(service.name))
                    changed_clients.add(root_client)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|alba_proxy_service_update', value=True)
    except Exception:
        # NOTE(review): message mentions 'Arakoon' but this section handles ALBA proxies — looks copy-pasted (runtime string left untouched)
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    ############################################################
    # Additional string formatting in DTL/VOLDRV services (2.11)
    try:
        if Configuration.get(key='/ovs/framework/migration|voldrv_service_update', default=False) is False:
            sd_pkg_name, sd_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
            for vpool in VPoolList.get_vpools():
                for storagedriver in vpool.storagedrivers:
                    root_client = sr_client_map[storagedriver.storagerouter_guid]
                    for entry in ['dtl', 'volumedriver']:
                        service_name = '{0}_{1}'.format(entry, vpool.name)
                        service_template = StorageDriverInstaller.SERVICE_TEMPLATE_DTL if entry == 'dtl' else StorageDriverInstaller.SERVICE_TEMPLATE_SD
                        config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagedriver.storagerouter.machine_id, service_name)
                        if Configuration.exists(key=config_key):
                            config = Configuration.get(key=config_key)
                            config['RUN_FILE_DIR'] = ServiceFactory.RUN_FILE_DIR
                            config['VOLDRV_PKG_NAME'] = sd_pkg_name
                            config['VOLDRV_VERSION_CMD'] = sd_version_cmd
                            Configuration.set(key=config_key, value=config)
                            service_manager.regenerate_service(name=service_template,
                                                               client=root_client,
                                                               target_name='ovs-{0}'.format(service_name))
                            changed_clients.add(root_client)
            # Make sure once this finished, it never runs again by setting this key to True
            Configuration.set(key='/ovs/framework/migration|voldrv_service_update', value=True)
    except Exception:
        # NOTE(review): message mentions 'Arakoon' but this section handles DTL/VOLDRV services — looks copy-pasted (runtime string left untouched)
        MigrationController._logger.exception('Updating the string formatting for the Arakoon services failed')

    #######################################################
    # Storing actual package name in version files (2.11.0) (https://github.com/openvstorage/framework/issues/1876)
    if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file', default=False) is False:
        try:
            voldrv_pkg_name, _ = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_SD)
            for storagerouter in StorageRouterList.get_storagerouters():
                root_client = sr_client_map.get(storagerouter.guid)
                if root_client is None:
                    continue
                for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                    if not file_name.endswith('.version'):
                        continue
                    file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                    contents = root_client.file_read(filename=file_path)
                    regenerate = False
                    # Rewrite the generic 'volumedriver-server' package name to the actually installed flavour
                    if voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER:
                        if 'volumedriver-server' in contents:
                            regenerate = True
                            contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER)
                            root_client.file_write(filename=file_path, contents=contents)
                    elif voldrv_pkg_name == PackageFactory.PKG_VOLDRV_SERVER_EE:
                        if 'volumedriver-server' in contents or PackageFactory.PKG_VOLDRV_SERVER in contents:
                            regenerate = True
                            contents = contents.replace('volumedriver-server', PackageFactory.PKG_VOLDRV_SERVER_EE)
                            contents = contents.replace(PackageFactory.PKG_VOLDRV_SERVER, PackageFactory.PKG_VOLDRV_SERVER_EE)
                            root_client.file_write(filename=file_path, contents=contents)
                    if regenerate is True:
                        service_manager.regenerate_service(name=StorageDriverInstaller.SERVICE_TEMPLATE_DTL if file_name.startswith('dtl') else StorageDriverInstaller.SERVICE_TEMPLATE_SD,
                                                           client=root_client,
                                                           target_name='ovs-{0}'.format(file_name.split('.')[0]))  # Leave out .version
                        changed_clients.add(root_client)
            Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file', value=True)
        except Exception:
            MigrationController._logger.exception('Updating actual package name for version files failed')

    # Single daemon-reload for everything accumulated since the ALBA proxy string-formatting section
    for root_client in changed_clients:
        try:
            root_client.run(['systemctl', 'daemon-reload'])
        except Exception:
            MigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

    #########################################################
    # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for AlbaProxy SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagedriver in StorageDriverList.get_storagedrivers():
            root_client = sr_client_map[storagedriver.storagerouter_guid]
            for alba_proxy in storagedriver.alba_proxies:
                service = alba_proxy.service
                service_name = 'ovs-{0}'.format(service.name)
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                # Substring (prefix) match — the closing quote of the OCAMLRUNPARAM value is deliberately not included
                if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name='ovs-albaproxy', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    #########################################################
    # Addition of 'Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50' for Arakoon SystemD services
    if ServiceFactory.get_service_type() == 'systemd':
        changed_clients = set()
        for storagerouter in StorageRouterList.get_storagerouters():
            root_client = sr_client_map[storagerouter.guid]
            for service_name in service_manager.list_services(client=root_client):
                if not service_name.startswith('ovs-arakoon-'):
                    continue
                if not service_manager.has_service(name=service_name, client=root_client):
                    continue
                if "Environment=OCAMLRUNPARAM='b,a=1,s=4096k,O=50" in root_client.file_read(filename='/lib/systemd/system/{0}.service'.format(service_name)):
                    continue
                try:
                    service_manager.regenerate_service(name='ovs-arakoon', client=root_client, target_name=service_name)
                    changed_clients.add(root_client)
                except:
                    MigrationController._logger.exception('Error rebuilding service {0}'.format(service_name))
        for root_client in changed_clients:
            root_client.run(['systemctl', 'daemon-reload'])

    MigrationController._logger.info('Finished out of band migrations')
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info

    Deploys (or re-uses) a scrub-dedicated ALBA proxy on the chosen StorageRouter, drains the
    queue of vDisk guids (scrubbing each one locally after ensuring its MDS master is local),
    and finally tears the proxy and scrub directory down again. All failures are logged and
    appended to ``error_messages`` instead of being raised.

    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (mutated in place)
    :type error_messages: list
    :return: None (errors are reported through the ``error_messages`` parameter)
    :rtype: NoneType
    """
    def _verify_mds_config(current_vdisk):
        # Refresh and return the vDisk's MDS backend configuration; a vDisk without any
        # MDS configuration cannot be scrubbed safely, so that is treated as an error.
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60  # Wait up to 5 minutes to acquire the proxy deploy/teardown mutex
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    # Configuration keys are unique per (vPool, StorageRouter) pair so concurrent scrub jobs
    # for other vPools/StorageRouters do not clash
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy
    try:
        # The same mutex name is used for removal below, serializing proxy deploy/teardown
        # across concurrently running scrub jobs
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                # A previous scrub run left a running proxy behind: re-use it and its config
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                # Pick a free port from this host's storagedriver port range for the new proxy
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Scrub config example:
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}
                # Backend config example:
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                # Start from the vPool's generic scrub template and fill in the chosen port
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Derive the scrubber's backend config from an existing storagedriver's config,
            # pointed at the freshly deployed local proxy.
            # NOTE(review): indexes vpool.storagedrivers[0] — assumes the vPool has at least
            # one storagedriver; an empty list would raise IndexError here. TODO confirm callers
            # guarantee this.
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        # Best-effort rollback of a partially deployed proxy.
        # NOTE(review): this cleanup removes scrub_config_key but not backend_config_key —
        # a failure after the backend config was written leaves that key behind. Verify
        # whether that is intentional.
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                # Non-blocking get: raises Empty once the queue is drained, which ends the loop
                vdisk_guid = queue.get(False)
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        # Re-check after the handover; skip the vDisk if the master is still remote
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_directory,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    # Per-vDisk failures are recorded but do not abort the rest of the queue
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)

    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)