def get_storage_router_by_ip(ip):
    """
    Look up the Storage Router modelled with the given IP address.
    :param ip: IP of Storage Router
    :return: Storage Router DAL object (None when no router matches, per callers' checks)
    """
    storage_router = StorageRouterList.get_by_ip(ip)
    return storage_router
def register(node_id):
    """
    Adds a Node with a given node_id to the model
    :param node_id: ID of the ALBA node
    :type node_id: str
    :return: None
    """
    node = AlbaNodeList.get_albanode_by_node_id(node_id)
    if node is None:
        # Node not yet modelled: bootstrap a new AlbaNode from its configuration entry
        main_config = Configuration.get('/ovs/alba/asdnodes/{0}/config/main'.format(node_id))
        node = AlbaNode()
        for field in ('ip', 'port', 'username', 'password'):
            setattr(node, field, main_config[field])
        node.storagerouter = StorageRouterList.get_by_ip(main_config['ip'])
    metadata = node.client.get_metadata()
    if metadata['_success'] is False and metadata['_error'] == 'Invalid credentials':
        raise RuntimeError('Invalid credentials')
    if metadata['node_id'] != node_id:
        # The node answering on this IP identifies itself differently than expected
        AlbaNodeController._logger.error('Unexpected node_id: {0} vs {1}'.format(metadata['node_id'], node_id))
        raise RuntimeError('Unexpected node identifier')
    node.node_id = node_id
    node.type = 'ASD'
    node.save()
    # Kick off an asynchronous maintenance-agent checkup
    AlbaController.checkup_maintenance_agents.delay()
def register(node_id):
    """
    Adds a Node with a given node_id to the model
    :param node_id: ID of the ALBA node
    :type node_id: str
    :return: None
    """
    node = AlbaNodeList.get_albanode_by_node_id(node_id)
    if node is None:
        # Node not yet modelled: bootstrap a new AlbaNode from its Etcd configuration entry
        main_config = EtcdConfiguration.get('/ovs/alba/asdnodes/{0}/config/main'.format(node_id))
        node = AlbaNode()
        for field in ('ip', 'port', 'username', 'password'):
            setattr(node, field, main_config[field])
        node.storagerouter = StorageRouterList.get_by_ip(main_config['ip'])
    metadata = node.client.get_metadata()
    if metadata['_success'] is False and metadata['_error'] == 'Invalid credentials':
        raise RuntimeError('Invalid credentials')
    if metadata['node_id'] != node_id:
        AlbaNodeController._logger.error('Unexpected node_id: {0} vs {1}'.format(metadata['node_id'], node_id))
        raise RuntimeError('Unexpected node identifier')
    node.node_id = node_id
    node.type = 'ASD'
    node.save()
    # increase maintenance agents count for all nodes by 1
    for backend in AlbaBackendList.get_albabackends():
        nr_of_agents_key = AlbaNodeController.NR_OF_AGENTS_ETCD_TEMPLATE.format(backend.guid)
        if EtcdConfiguration.exists(nr_of_agents_key):
            new_count = int(EtcdConfiguration.get(nr_of_agents_key) + 1)
        else:
            new_count = 1
        EtcdConfiguration.set(nr_of_agents_key, new_count)
    AlbaNodeController.checkup_maintenance_agents()
def manage_running_tasks(tasklist, timesleep=10):
    """
    Manage a list of running celery task
    - discard PENDING tasks after a certain timeout
    - validate RUNNING tasks are actually running
    :param tasklist: Dictionary of tasks to wait {IP address: AsyncResult}
    :type tasklist: dict
    :param timesleep: Sleep between checks - for long running tasks it's better to sleep for a longer period of time to reduce number of ssh calls
    :type timesleep: int
    :return: Tuple of (results per IP, IPs whose task was revoked)
    :rtype: tuple
    """
    logger = LogHandler.get('lib', name='celery toolbox')
    ssh_clients = {}  # Cache SSHClient instances per IP so we do not reconnect on every poll cycle
    tasks_pending = {}  # task.id -> timestamp of when the task was first seen PENDING
    tasks_pending_timeout = 1800  # 30 minutes
    results = {}
    failed_nodes = []
    while len(tasklist.keys()) > 0:
        # NOTE(review): entries are deleted from tasklist while iterating .items();
        # this relies on Python 2 .items() returning a list copy (codebase uses iteritems elsewhere)
        for ip, task in tasklist.items():
            if task.state in ('SUCCESS', 'FAILURE'):
                logger.info('Task {0} finished: {1}'.format(task.id, task.state))
                results[ip] = task.get(propagate=False)  # propagate=False: collect failures instead of raising
                del tasklist[ip]
            elif task.state == 'PENDING':
                if task.id not in tasks_pending:
                    tasks_pending[task.id] = time.time()
                else:
                    task_pending_since = tasks_pending[task.id]
                    if time.time() - task_pending_since > tasks_pending_timeout:
                        logger.warning('Task {0} is pending since {1} on node {2}. Task will be revoked'.format(task.id, datetime.datetime.fromtimestamp(task_pending_since), ip))
                        revoke(task.id)
                        del tasklist[ip]
                        del tasks_pending[task.id]
                        failed_nodes.append(ip)
            elif task.state == 'STARTED':
                if ip not in ssh_clients:
                    ssh_clients[ip] = SSHClient(ip, username='******')
                client = ssh_clients[ip]
                if ServiceManager.get_service_status('workers', client) is False:
                    # FIX: this branch handles STARTED tasks; the message previously said 'PENDING'
                    logger.error('Service ovs-workers on node {0} appears halted while there is a task STARTED for it {1}. Task will be revoked.'.format(ip, task.id))
                    revoke(task.id)
                    del tasklist[ip]
                    failed_nodes.append(ip)
                else:
                    # Workers service is up; verify the worker actually answers over rabbitmq
                    ping_result = task.app.control.inspect().ping()
                    storage_router = StorageRouterList.get_by_ip(ip)
                    if "celery@{0}".format(storage_router.name) not in ping_result:
                        logger.error('Service ovs-workers on node {0} is not reachable via rabbitmq while there is a task STARTED for it {1}. Task will be revoked.'.format(ip, task.id))
                        revoke(task.id)
                        del tasklist[ip]
                        failed_nodes.append(ip)
        if len(tasklist.keys()) > 0:
            time.sleep(timesleep)
    return results, failed_nodes
def set_config_params(self, vdisk, new_config_params, version):
    """
    Sets configuration parameters to a given vdisk.
    :param vdisk: Guid of the virtual disk to configure
    :param new_config_params: Configuration settings for the virtual disk
    :param version: API version
    """
    needs_translation = version == 1 and 'dtl_target' in new_config_params
    if needs_translation:
        # API v1 clients send an IP; translate it to the router's primary failure domain guid
        target_router = StorageRouterList.get_by_ip(new_config_params['dtl_target'])
        if target_router is None:
            raise NotAcceptable('API version 1 requires a Storage Router IP')
        new_config_params['dtl_target'] = target_router.primary_failure_domain.guid
    return VDiskController.set_config_params.delay(vdisk_guid=vdisk.guid,
                                                   new_config_params=new_config_params)
def set_config_params(self, vdisk, new_config_params, version):
    """
    Sets configuration parameters to a given vdisk.
    :param vdisk: Guid of the virtual disk to configure
    :param new_config_params: Configuration settings for the virtual disk
    :param version: API version
    """
    params = new_config_params
    if version == 1 and 'dtl_target' in params:
        # Version 1 of the API passes an IP address; map it onto a failure-domain guid
        router = StorageRouterList.get_by_ip(params['dtl_target'])
        if router is None:
            raise NotAcceptable('API version 1 requires a Storage Router IP')
        params['dtl_target'] = router.primary_failure_domain.guid
    return VDiskController.set_config_params.delay(vdisk_guid=vdisk.guid, new_config_params=params)
def __init__(self, ip):
    """
    Create RabbitMQ object
    :param ip: ip from the server
    :type ip: str
    """
    # Refuse to construct a wrapper for an IP that does not host RabbitMQ
    if not RabbitMQ._check_rabbitmq_ip(ip):
        raise ValueError('RabbitMQ on {0} could not be found.'.format(ip))
    self.ip = ip
    self._service_manager = ServiceFactory.get_manager()
    if RabbitMQ.INTERNAL:
        # Internally managed RabbitMQ: resolve the owning StorageRouter and open an SSH channel
        self._storagerouter = StorageRouterList.get_by_ip(ip)
        self._client = SSHClient(ip, username='******')
    if not self.check_management_plugin():
        self.enable_management_plugin()
def model_albanodes(**kwargs):
    """
    Add all ALBA nodes known to the config platform to the model
    :param kwargs: Kwargs containing information regarding the node
    :type kwargs: dict
    :return: None
    """
    _ = kwargs
    if not Configuration.dir_exists('/ovs/alba/asdnodes'):
        return
    for node_id in Configuration.list('/ovs/alba/asdnodes'):
        if AlbaNodeList.get_albanode_by_node_id(node_id) is not None:
            continue  # Already modelled
        main_config = Configuration.get('/ovs/alba/asdnodes/{0}/config/main'.format(node_id))
        node = AlbaNode()
        node.type = 'ASD'
        node.node_id = node_id
        for field in ('ip', 'port', 'username', 'password'):
            setattr(node, field, main_config[field])
        node.storagerouter = StorageRouterList.get_by_ip(main_config['ip'])
        node.save()
def set_config_params(self, vdisk, new_config_params, version):
    """
    Sets configuration parameters to a given vdisk.
    :param vdisk: Guid of the virtual disk to configure
    :type vdisk: VDisk
    :param new_config_params: Configuration settings for the virtual disk
    :type new_config_params: dict
    :param version: Client version
    :type version: int
    """
    if version == 1 and "dtl_target" in new_config_params:
        # API v1 passes an IP address; translate it into the guids of the router's Domains
        storage_router = StorageRouterList.get_by_ip(new_config_params["dtl_target"])
        if storage_router is None:
            raise HttpNotAcceptableException(error_description="API version 1 requires a Storage Router IP",
                                             error="invalid_version")
        new_config_params["dtl_target"] = [junction.domain_guid for junction in storage_router.domains]
    # Strip parameters that are no longer supported before handing off to the controller
    for legacy_key in ("dedupe_mode", "cache_strategy", "readcache_limit"):
        new_config_params.pop(legacy_key, None)
    return VDiskController.set_config_params.delay(vdisk_guid=vdisk.guid, new_config_params=new_config_params)
def set_config_params(self, vdisk, new_config_params, version):
    """
    Sets configuration parameters to a given vdisk.
    :param vdisk: Guid of the virtual disk to configure
    :type vdisk: VDisk
    :param new_config_params: Configuration settings for the virtual disk
    :type new_config_params: dict
    :param version: Client version
    :type version: int
    :return: Asynchronous result of a CeleryTask
    :rtype: celery.result.AsyncResult
    """
    if version == 1 and 'dtl_target' in new_config_params:
        router = StorageRouterList.get_by_ip(new_config_params['dtl_target'])
        if router is None:
            raise HttpNotAcceptableException(error_description='API version 1 requires a Storage Router IP',
                                             error='invalid_version')
        # Translate the IP into the guids of all Domains linked to that StorageRouter
        new_config_params['dtl_target'] = [dom_junction.domain_guid for dom_junction in router.domains]
    # Deprecated settings are dropped silently if present
    new_config_params.pop('dedupe_mode', None)
    new_config_params.pop('cache_strategy', None)
    new_config_params.pop('readcache_limit', None)
    return VDiskController.set_config_params.delay(vdisk_guid=vdisk.guid,
                                                   new_config_params=new_config_params)
def flush_node(storagerouter_ip):
    """
    Flush write buffer to backend
    :param storagerouter_ip: ip of a storage router where to perform the flush
    :type storagerouter_ip: str
    """
    storagerouter = StorageRouterList.get_by_ip(storagerouter_ip)
    snapshot_name = "flush_snapshot"
    for vdisk_guid in storagerouter.vdisks_guids:
        vdisk = VDisk(vdisk_guid)
        volume_id = str(vdisk.volume_id)
        # Creating a snapshot forces the write buffer of the volume to be flushed
        vdisk.storagedriver_client.create_snapshot(volume_id, snapshot_name)
        # Poll until the snapshot (and hence the buffered data) has landed on the backend
        while not vdisk.storagedriver_client.info_snapshot(volume_id, snapshot_name).in_backend:
            time.sleep(5)
        # Sync completed: the temporary snapshot is no longer needed
        vdisk.storagedriver_client.delete_snapshot(volume_id, snapshot_name)
def remove_node(node_ip, silent=None):
    """
    Remove the node with specified IP from the cluster
    :param node_ip: IP of the node to remove
    :type node_ip: str
    :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
    :type silent: str
    :return: None
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.lib.storagedriver import StorageDriverController
    from ovs.lib.vpool import VPoolController

    Toolbox.log(logger=NodeRemovalController._logger, messages='Remove node', boxed=True)
    Toolbox.log(logger=NodeRemovalController._logger,
                messages='WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n')
    service_manager = ServiceFactory.get_manager()

    ###############
    # VALIDATIONS #
    ###############
    try:
        node_ip = node_ip.strip()
        if not isinstance(node_ip, str):
            raise ValueError('Node IP must be a string')
        if not re.match(SSHClient.IP_REGEX, node_ip):
            raise ValueError('Invalid IP {0} specified'.format(node_ip))

        storage_router_all = sorted(StorageRouterList.get_storagerouters(), key=lambda k: k.name)
        storage_router_masters = StorageRouterList.get_masters()
        storage_router_all_ips = set([storage_router.ip for storage_router in storage_router_all])
        storage_router_master_ips = set([storage_router.ip for storage_router in storage_router_masters])
        storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)
        offline_reasons = {}
        if node_ip not in storage_router_all_ips:
            raise ValueError('Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}'.format('\n - '.join(storage_router_all_ips), node_ip))
        if len(storage_router_all_ips) == 1:
            raise RuntimeError("Removing the only node is not possible")
        if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
            raise RuntimeError("Removing the only master node is not possible")
        if System.get_my_storagerouter() == storage_router_to_remove:
            raise RuntimeError('The node to be removed cannot be identical to the node on which the removal is initiated')

        Toolbox.log(logger=NodeRemovalController._logger, messages='Creating SSH connections to remaining master nodes')
        master_ip = None
        ip_client_map = {}
        storage_routers_offline = []
        storage_router_to_remove_online = True
        for storage_router in storage_router_all:
            try:
                client = SSHClient(storage_router, username='******', timeout=10)
            except (UnableToConnectException, NotAuthenticatedException, TimeOutException) as ex:
                # Map each connection failure onto a human-readable reason
                if isinstance(ex, UnableToConnectException):
                    msg = 'Unable to connect'
                elif isinstance(ex, NotAuthenticatedException):
                    msg = 'Could not authenticate'
                elif isinstance(ex, TimeOutException):
                    msg = 'Connection timed out'
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages=' * Node with IP {0:<15}- {1}'.format(storage_router.ip, msg))
                offline_reasons[storage_router.ip] = msg
                storage_routers_offline.append(storage_router)
                if storage_router == storage_router_to_remove:
                    storage_router_to_remove_online = False
                continue
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=' * Node with IP {0:<15}- Successfully connected'.format(storage_router.ip))
            ip_client_map[storage_router.ip] = client
            if storage_router != storage_router_to_remove and storage_router.node_type == 'MASTER':
                master_ip = storage_router.ip
        if len(ip_client_map) == 0 or master_ip is None:
            raise RuntimeError('Could not connect to any master node in the cluster')

        storage_router_to_remove.invalidate_dynamics('vdisks_guids')
        if len(storage_router_to_remove.vdisks_guids) > 0:  # vDisks are supposed to be moved away manually before removing a node
            raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

        internal_memcached = Toolbox.is_service_internally_managed(service='memcached')
        internal_rabbit_mq = Toolbox.is_service_internally_managed(service='rabbitmq')
        memcached_endpoints = Configuration.get(key='/ovs/framework/memcache|endpoints')
        rabbit_mq_endpoints = Configuration.get(key='/ovs/framework/messagequeue|endpoints')
        copy_memcached_endpoints = list(memcached_endpoints)
        copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
        for endpoint in memcached_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_memcached_endpoints.remove(endpoint)
        for endpoint in rabbit_mq_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_rabbit_mq_endpoints.remove(endpoint)
        # Refuse removal when it would take out the last internally managed memcached / rabbitmq endpoint
        if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
            raise RuntimeError('Removal of provided nodes will result in a complete removal of the memcached service')
        if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
            raise RuntimeError('Removal of provided nodes will result in a complete removal of the messagequeue service')

        Toolbox.run_hooks(component='noderemoval',
                          sub_component='validate_removal',
                          logger=NodeRemovalController._logger,
                          cluster_ip=storage_router_to_remove.ip)
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Removal has been aborted during the validation step. No changes have been applied.',
                    boxed=True,
                    loglevel='warning')
        sys.exit(1)
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel='exception')
        sys.exit(1)

    #################
    # CONFIRMATIONS #
    #################
    try:
        interactive = silent != '--force-yes'
        remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
        if interactive is True:
            if len(storage_routers_offline) > 0:
                # FIX: message previously read 'These will not fully removed'
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='Certain nodes appear to be offline. These will not be fully removed and will cause issues if they are not really offline.')
                Toolbox.log(logger=NodeRemovalController._logger,
                            messages='Offline nodes: {0}'.format(''.join(('\n * {0:<15}- {1}.'.format(ip, message) for ip, message in offline_reasons.iteritems()))))
                valid_node_info = Interactive.ask_yesno(message='Continue the removal with these being presumably offline?',
                                                        default_value=False)
                if valid_node_info is False:
                    Toolbox.log(logger=NodeRemovalController._logger,
                                messages='Please validate the state of the nodes before removing.',
                                title=True)
                    sys.exit(1)
            proceed = Interactive.ask_yesno(message='Are you sure you want to remove node {0}?'.format(storage_router_to_remove.name),
                                            default_value=False)
            if proceed is False:
                Toolbox.log(logger=NodeRemovalController._logger, messages='Abort removal', title=True)
                sys.exit(1)
            remove_asd_manager = True
            if storage_router_to_remove_online is True:
                client = SSHClient(endpoint=storage_router_to_remove, username='******')
                if service_manager.has_service(name='asd-manager', client=client):
                    remove_asd_manager = Interactive.ask_yesno(message='Do you also want to remove the ASD manager and related ASDs?',
                                                               default_value=False)
        if remove_asd_manager is True or storage_router_to_remove_online is False:
            for fct in Toolbox.fetch_hooks('noderemoval', 'validate_asd_removal'):
                validation_output = fct(storage_router_to_remove.ip)
                if validation_output['confirm'] is True:
                    if Interactive.ask_yesno(message=validation_output['question'], default_value=False) is False:
                        remove_asd_manager = False
                        break
    except KeyboardInterrupt:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Removal has been aborted during the confirmation step. No changes have been applied.',
                    boxed=True,
                    loglevel='warning')
        sys.exit(1)
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel='exception')
        sys.exit(1)

    ###########
    # REMOVAL #
    ###########
    try:
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='Starting removal of node {0} - {1}'.format(storage_router_to_remove.name, storage_router_to_remove.ip))
        if storage_router_to_remove_online is False:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=' Marking all Storage Drivers served by Storage Router {0} as offline'.format(storage_router_to_remove.ip))
            StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

        # Remove vPools
        # FIX: the message string lacked the {0} placeholder, silently dropping the IP passed to .format()
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages=' Removing vPools from node {0}'.format(storage_router_to_remove.ip))
        storage_routers_offline_guids = [sr.guid for sr in storage_routers_offline if sr.guid != storage_router_to_remove.guid]
        for storage_driver in storage_router_to_remove.storagedrivers:
            Toolbox.log(logger=NodeRemovalController._logger,
                        messages=' Removing vPool {0} from node'.format(storage_driver.vpool.name))
            VPoolController.shrink_vpool(storagedriver_guid=storage_driver.guid,
                                         offline_storage_router_guids=storage_routers_offline_guids)

        # Demote if MASTER
        if storage_router_to_remove.node_type == 'MASTER':
            NodeTypeController.demote_node(cluster_ip=storage_router_to_remove.ip,
                                           master_ip=master_ip,
                                           ip_client_map=ip_client_map,
                                           unique_id=storage_router_to_remove.machine_id,
                                           unconfigure_memcached=internal_memcached,
                                           unconfigure_rabbitmq=internal_rabbit_mq,
                                           offline_nodes=storage_routers_offline)

        # Stop / remove services
        Toolbox.log(logger=NodeRemovalController._logger, messages='Stopping and removing services')
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username='******')
            NodeRemovalController.remove_services(client=client,
                                                  node_type=storage_router_to_remove.node_type.lower(),
                                                  logger=NodeRemovalController._logger)
            service = 'watcher-config'
            if service_manager.has_service(service, client=client):
                Toolbox.log(logger=NodeRemovalController._logger, messages='Removing service {0}'.format(service))
                service_manager.stop_service(service, client=client)
                service_manager.remove_service(service, client=client)

        Toolbox.run_hooks(component='noderemoval',
                          sub_component='remove',
                          logger=NodeRemovalController._logger,
                          cluster_ip=storage_router_to_remove.ip,
                          complete_removal=remove_asd_manager)

        # Clean up model
        Toolbox.log(logger=NodeRemovalController._logger, messages='Removing node from model')
        for service in storage_router_to_remove.services:
            service.delete()
        for disk in storage_router_to_remove.disks:
            for partition in disk.partitions:
                partition.delete()
            disk.delete()
        for j_domain in storage_router_to_remove.domains:
            j_domain.delete()
        Configuration.delete('/ovs/framework/hosts/{0}'.format(storage_router_to_remove.machine_id))

        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map,
                                                                   offline_node_ips=[node.ip for node in storage_routers_offline],
                                                                   logger=NodeRemovalController._logger)

        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username='******')
            client.file_delete(filenames=[CACC_LOCATION])
            client.file_delete(filenames=[CONFIG_STORE_LOCATION])
        storage_router_to_remove.delete()
        Toolbox.log(logger=NodeRemovalController._logger, messages='Successfully removed node\n')
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages=['An unexpected error occurred:', str(exception)],
                    boxed=True,
                    loglevel='exception')
        sys.exit(1)
    except KeyboardInterrupt:
        # KeyboardInterrupt derives from BaseException, so this clause is still reachable after 'except Exception'
        Toolbox.log(logger=NodeRemovalController._logger, messages='\n')
        Toolbox.log(logger=NodeRemovalController._logger,
                    messages='This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                    boxed=True,
                    loglevel='error')
        sys.exit(1)

    if remove_asd_manager is True and storage_router_to_remove_online is True:
        Toolbox.log(logger=NodeRemovalController._logger, messages='\nRemoving ASD Manager')
        with remote(storage_router_to_remove.ip, [os]) as rem:
            rem.os.system('asd-manager remove --force-yes')
    Toolbox.log(logger=NodeRemovalController._logger, messages='Remove nodes finished', title=True)
def set_config_params(vdisk_guid, new_config_params):
    """
    Sets configuration parameters for a given vdisk.
    :param vdisk_guid: Guid of the virtual disk to set the configuration parameters for
    :param new_config_params: New configuration parameters
    :raises Exception: When DTL is enabled without a target IP or when one or more updates failed
    """
    required_params = {'dtl_mode': (str, StorageDriverClient.VDISK_DTL_MODE_MAP.keys()),
                       'sco_size': (int, StorageDriverClient.TLOG_MULTIPLIER_MAP.keys()),
                       'dedupe_mode': (str, StorageDriverClient.VDISK_DEDUPE_MAP.keys()),
                       'write_buffer': (int, {'min': 128, 'max': 10 * 1024}),
                       'cache_strategy': (str, StorageDriverClient.VDISK_CACHE_MAP.keys()),
                       'readcache_limit': (int, {'min': 1, 'max': 10 * 1024}, False)}  # False: optional
    if new_config_params.get('dtl_target') is not None:
        required_params.update({'dtl_target': (str, Toolbox.regex_ip)})
    Toolbox.verify_required_params(required_params, new_config_params)
    if new_config_params['dtl_mode'] != 'no_sync' and new_config_params.get('dtl_target') is None:
        raise Exception('If DTL mode is Asynchronous or Synchronous, a target IP should always be specified')

    errors = False
    vdisk = VDisk(vdisk_guid)
    volume_id = str(vdisk.volume_id)
    old_config_params = VDiskController.get_config_params(vdisk.guid)

    # 1st update SCO size, because this impacts TLOG multiplier which on its turn impacts write buffer
    new_sco_size = new_config_params['sco_size']
    old_sco_size = old_config_params['sco_size']
    if new_sco_size != old_sco_size:
        write_buffer = float(new_config_params['write_buffer'])
        tlog_multiplier = StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
        sco_factor = write_buffer / tlog_multiplier / new_sco_size
        try:
            logger.info('Updating property sco_size on vDisk {0} to {1}'.format(vdisk_guid, new_sco_size))
            # NOTE(review): 'new_sco_size / 4 * 1024' truncates to 0 for sco sizes < 4 under integer
            # division — presumably all valid sco sizes are multiples of 4; verify against TLOG_MULTIPLIER_MAP
            vdisk.storagedriver_client.set_sco_multiplier(volume_id, new_sco_size / 4 * 1024)
            vdisk.storagedriver_client.set_tlog_multiplier(volume_id, tlog_multiplier)
            vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
            logger.info('Updated property sco_size')
        except Exception as ex:
            logger.error('Error updating "sco_size": {0}'.format(ex))
            errors = True

    # 2nd Check for DTL changes
    new_dtl_mode = new_config_params['dtl_mode']
    old_dtl_mode = old_config_params['dtl_mode']
    new_dtl_target = new_config_params.get('dtl_target')
    old_dtl_target = old_config_params['dtl_target']
    if old_dtl_mode != new_dtl_mode or new_dtl_target != old_dtl_target:
        if old_dtl_mode != new_dtl_mode and new_dtl_mode == 'no_sync':
            logger.info('Disabling DTL for vDisk {0}'.format(vdisk_guid))
            vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
        elif (new_dtl_target is not None and new_dtl_target != old_dtl_target or old_dtl_mode != new_dtl_mode) and new_dtl_mode != 'no_sync':
            logger.info('Changing DTL to use global values for vDisk {0}'.format(vdisk_guid))
            sr_target = StorageRouterList.get_by_ip(new_dtl_target)
            if sr_target is None:
                logger.error('Failed to retrieve Storage Router with IP {0}'.format(new_dtl_target))
                errors = True
            else:
                # FIX: guard the loop — iterating sr_target.storagedrivers with sr_target None
                # would raise AttributeError instead of recording the error
                for sd in sr_target.storagedrivers:
                    if sd.vpool == vdisk.vpool:
                        dtl_config = DTLConfig(str(new_dtl_target), sd.ports[2], StorageDriverClient.VDISK_DTL_MODE_MAP[new_dtl_mode])
                        vdisk.storagedriver_client.set_manual_dtl_config(volume_id, dtl_config)
                        break
                else:
                    logger.error('Failed to retrieve Storage Driver with IP {0}'.format(new_dtl_target))
                    errors = True

    # 3rd update the remaining properties (FIX: comment previously said '2nd update rest')
    for key in required_params:
        try:
            if key in ['sco_size', 'dtl_mode', 'dtl_target']:
                continue  # Handled above
            # NOTE(review): optional keys ('readcache_limit') absent from new_config_params raise
            # KeyError here and are counted as errors — verify whether that is intended
            new_value = new_config_params[key]
            old_value = old_config_params[key]
            if new_value != old_value:
                # FIX: message previously read 'from to {2}' (broken wording, old value never supplied)
                logger.info('Updating property {0} on vDisk {1} to {2}'.format(key, vdisk_guid, new_value))
                if key == 'dedupe_mode':
                    vdisk.storagedriver_client.set_readcache_mode(volume_id, StorageDriverClient.VDISK_DEDUPE_MAP[new_value])
                elif key == 'write_buffer':
                    tlog_multiplier = vdisk.storagedriver_client.get_tlog_multiplier(volume_id) or StorageDriverClient.TLOG_MULTIPLIER_MAP[new_sco_size]
                    sco_factor = float(new_value) / tlog_multiplier / new_sco_size
                    vdisk.storagedriver_client.set_sco_cache_max_non_disposable_factor(volume_id, sco_factor)
                elif key == 'cache_strategy':
                    vdisk.storagedriver_client.set_readcache_behaviour(volume_id, StorageDriverClient.VDISK_CACHE_MAP[new_value])
                elif key == 'readcache_limit':
                    vol_info = vdisk.storagedriver_client.info_volume(volume_id)
                    block_size = vol_info.lba_size * vol_info.cluster_multiplier or 4096
                    limit = new_value * 1024 * 1024 * 1024 / block_size if new_value else None
                    vdisk.storagedriver_client.set_readcache_limit(volume_id, limit)
                else:
                    raise KeyError('Unsupported property provided: "{0}"'.format(key))
                logger.info('Updated property {0}'.format(key))
        except Exception as ex:
            logger.error('Error updating "{0}": {1}'.format(key, ex))
            errors = True
    if errors is True:
        raise Exception('Failed to update the values for vDisk {0}'.format(vdisk.name))
def add_vpool(cls, parameters):
    """
    Add a vPool to the machine this task is running on
    :param parameters: Parameters for vPool creation
    :type parameters: dict
    :return: None
    :rtype: NoneType
    """
    # TODO: Add logging
    cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
    # VALIDATIONS
    if not isinstance(parameters, dict):
        raise ValueError('Parameters passed to create a vPool should be of type dict')

    # Check StorageRouter existence
    storagerouter = StorageRouterList.get_by_ip(ip=parameters.get('storagerouter_ip'))
    if storagerouter is None:
        raise RuntimeError('Could not find StorageRouter')

    # Validate requested vPool configurations
    vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
    vp_installer.validate(storagerouter=storagerouter)

    # Validate requested StorageDriver configurations
    # Backend/connection info is split in a 'main' backend plus optional block- and fragment-cache backends
    cls._logger.info('vPool {0}: Validating StorageDriver configurations'.format(vp_installer.name))
    sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                          configurations={'storage_ip': parameters.get('storage_ip'),
                                                          'caching_info': parameters.get('caching_info'),
                                                          'backend_info': {'main': parameters.get('backend_info'),
                                                                           StorageDriverConfiguration.CACHE_BLOCK: parameters.get('backend_info_bc'),
                                                                           StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('backend_info_fc')},
                                                          'connection_info': {'main': parameters.get('connection_info'),
                                                                              StorageDriverConfiguration.CACHE_BLOCK: parameters.get('connection_info_bc'),
                                                                              StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('connection_info_fc')},
                                                          'sd_configuration': parameters.get('config_params')})

    # Mutex guarding partition checks/creation against concurrent add_vpool jobs on the same StorageRouter
    partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(storagerouter.guid))
    try:
        # VPOOL CREATION
        # Create the vPool as soon as possible in the process to be displayed in the GUI (INSTALLING/EXTENDING state)
        if vp_installer.is_new is True:
            vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
            vp_installer.configure_mds(config=parameters.get('mds_config_params', {}))
        else:
            vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

        # ADDITIONAL VALIDATIONS
        # Check StorageRouter connectivity
        cls._logger.info('vPool {0}: Validating StorageRouter connectivity'.format(vp_installer.name))
        linked_storagerouters = [storagerouter]
        if vp_installer.is_new is False:
            # Extending an existing vPool: all StorageRouters already serving it must be reachable too
            linked_storagerouters += [sd.storagerouter for sd in vp_installer.vpool.storagedrivers]

        sr_client_map = SSHClient.get_clients(endpoints=linked_storagerouters, user_names=['ovs', 'root'])
        offline_nodes = sr_client_map.pop('offline')
        if storagerouter in offline_nodes:
            raise RuntimeError('Node on which the vPool is being {0} is not reachable'.format('created' if vp_installer.is_new is True else 'extended'))

        sr_installer = StorageRouterInstaller(root_client=sr_client_map[storagerouter]['root'],
                                              sd_installer=sd_installer,
                                              vp_installer=vp_installer,
                                              storagerouter=storagerouter)

        # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in locked context
        partitions_mutex.acquire(wait=60)
        sr_installer.partition_info = StorageRouterController.get_partition_info(storagerouter_guid=storagerouter.guid)
        sr_installer.validate_vpool_extendable()
        sr_installer.validate_global_write_buffer(requested_size=parameters.get('writecache_size', 0))
        sr_installer.validate_local_cache_size(requested_proxies=parameters.get('parallelism', {}).get('proxies', 2))

        # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
        sd_installer.create()
        sd_installer.create_partitions()
        partitions_mutex.release()

        vp_installer.refresh_metadata()
    except Exception:
        cls._logger.exception('Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'.format(vp_installer.name, storagerouter.name))
        # NOTE(review): release() may be reached here even when acquire() never happened or the mutex
        # was already released above — presumably volatile_mutex.release is a no-op then; verify
        partitions_mutex.release()
        vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        raise

    # Arakoon setup
    # Retry the voldrv Arakoon checkup for up to 300 seconds before giving up and reverting
    counter = 0
    while counter < 300:
        try:
            if StorageDriverController.manual_voldrv_arakoon_checkup() is True:
                break
        except Exception:
            cls._logger.exception('Arakoon checkup for voldrv cluster failed')
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise
        counter += 1
        time.sleep(1)
        if counter == 300:
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise RuntimeError('Arakoon checkup for the StorageDriver cluster could not be started')

    # Cluster registry
    try:
        vp_installer.configure_cluster_registry(allow_raise=True)
    except Exception:
        # A brand-new vPool can be fully reverted; an extended one is left in FAILURE for inspection
        if vp_installer.is_new is True:
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        else:
            vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
        raise

    try:
        sd_installer.setup_proxy_configs()
        sd_installer.configure_storagedriver_service()
        DiskController.sync_with_reality(storagerouter.guid)
        MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vp_installer.vpool)

        # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
        vp_installer.vpool.invalidate_dynamics('configuration')
        if vp_installer.mds_safety is not None and vp_installer.vpool.configuration['mds_config']['mds_safety'] != vp_installer.mds_safety:
            Configuration.set(key='/ovs/vpools/{0}/mds_config|mds_safety'.format(vp_installer.vpool.guid), value=vp_installer.mds_safety)

        sd_installer.start_services()  # Create and start watcher volumedriver, DTL, proxies and StorageDriver services

        # Post creation/extension checkups
        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vp_installer.vpool, offline_nodes=offline_nodes)
        for sr, clients in sr_client_map.iteritems():
            for current_storagedriver in [sd for sd in sr.storagedrivers if sd.vpool_guid == vp_installer.vpool.guid]:
                storagedriver_config = StorageDriverConfiguration(vpool_guid=vp_installer.vpool.guid, storagedriver_id=current_storagedriver.storagedriver_id)
                if storagedriver_config.config_missing is False:
                    # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                    # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                    storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_config_set[sr.guid])
                    storagedriver_config.save(client=clients['ovs'])

        # Everything's reconfigured, refresh new cluster configuration
        for current_storagedriver in vp_installer.vpool.storagedrivers:
            if current_storagedriver.storagerouter not in sr_client_map:
                continue  # Skip StorageDrivers on nodes we have no client for (e.g. offline nodes)
            vp_installer.vpool.storagedriver_client.update_cluster_node_configs(str(current_storagedriver.storagedriver_id), req_timeout_secs=10)
    except Exception:
        cls._logger.exception('vPool {0}: Creation failed'.format(vp_installer.name))
        vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise

    # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
    # Scheduled tasks below, so don't really care whether they succeed or not
    try:
        VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
    except:
        pass
    for vdisk in vp_installer.vpool.vdisks:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
        except:
            pass
    vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('Add vPool {0} ended successfully'.format(vp_installer.name))
def remove_node(node_ip, silent=None):
    """
    Remove the node with specified IP from the cluster.

    Runs three phases: VALIDATIONS (sanity-check the IP and cluster state),
    CONFIRMATIONS (interactive questions unless forced) and REMOVAL (vPools,
    demotion, services, model cleanup). Any validation or removal failure is
    logged and terminates the process via sys.exit(1).

    :param node_ip: IP of the node to remove
    :type node_ip: str
    :param silent: If silent == '--force-yes' no question will be asked to confirm the removal
    :type silent: str
    :return: None
    """
    from ovs.lib.storagedriver import StorageDriverController
    from ovs.lib.storagerouter import StorageRouterController
    from ovs.dal.lists.storagerouterlist import StorageRouterList

    Toolbox.log(logger=NodeRemovalController._logger, messages="Remove node", boxed=True)
    Toolbox.log(
        logger=NodeRemovalController._logger,
        messages="WARNING: Some of these steps may take a very long time, please check the logs for more information\n\n",
    )

    ###############
    # VALIDATIONS #
    ###############
    try:
        # Fix: validate the type BEFORE calling .strip(). The original called
        # node_ip.strip() first, so a non-string argument raised AttributeError
        # and the intended ValueError below was unreachable.
        # NOTE(review): unicode IPs are rejected by this isinstance(str) check
        # (Python 2) — presumably callers always pass byte strings; confirm.
        if not isinstance(node_ip, str):
            raise ValueError("Node IP must be a string")
        node_ip = node_ip.strip()
        if not re.match(SSHClient.IP_REGEX, node_ip):
            raise ValueError("Invalid IP {0} specified".format(node_ip))

        storage_router_all = StorageRouterList.get_storagerouters()
        storage_router_masters = StorageRouterList.get_masters()
        storage_router_all_ips = set([storage_router.ip for storage_router in storage_router_all])
        storage_router_master_ips = set([storage_router.ip for storage_router in storage_router_masters])
        storage_router_to_remove = StorageRouterList.get_by_ip(node_ip)

        if node_ip not in storage_router_all_ips:
            raise ValueError(
                "Unknown IP specified\nKnown in model:\n - {0}\nSpecified for removal:\n - {1}".format(
                    "\n - ".join(storage_router_all_ips), node_ip
                )
            )
        if len(storage_router_all_ips) == 1:
            raise RuntimeError("Removing the only node is not possible")
        if node_ip in storage_router_master_ips and len(storage_router_master_ips) == 1:
            raise RuntimeError("Removing the only master node is not possible")
        if System.get_my_storagerouter() == storage_router_to_remove:
            raise RuntimeError(
                "The node to be removed cannot be identical to the node on which the removal is initiated"
            )

        # Build SSH connections to every reachable node; remember one reachable
        # master (other than the node being removed) to drive the removal from.
        Toolbox.log(
            logger=NodeRemovalController._logger, messages="Creating SSH connections to remaining master nodes"
        )
        master_ip = None
        ip_client_map = {}
        storage_routers_offline = []
        storage_router_to_remove_online = True
        for storage_router in storage_router_all:
            try:
                client = SSHClient(storage_router, username="******")
                if client.run(["pwd"]):  # Cheap command to verify the connection actually works
                    Toolbox.log(
                        logger=NodeRemovalController._logger,
                        messages=" Node with IP {0:<15} successfully connected to".format(storage_router.ip),
                    )
                    ip_client_map[storage_router.ip] = client
                    if storage_router != storage_router_to_remove and storage_router.node_type == "MASTER":
                        master_ip = storage_router.ip
            except UnableToConnectException:
                Toolbox.log(
                    logger=NodeRemovalController._logger,
                    messages=" Node with IP {0:<15} is unreachable".format(storage_router.ip),
                )
                storage_routers_offline.append(storage_router)
                if storage_router == storage_router_to_remove:
                    storage_router_to_remove_online = False

        if len(ip_client_map) == 0 or master_ip is None:
            raise RuntimeError("Could not connect to any master node in the cluster")

        storage_router_to_remove.invalidate_dynamics("vdisks_guids")
        if (
            len(storage_router_to_remove.vdisks_guids) > 0
        ):  # vDisks are supposed to be moved away manually before removing a node
            raise RuntimeError("Still vDisks attached to Storage Router {0}".format(storage_router_to_remove.name))

        # Refuse the removal when it would take down the last internally managed
        # memcached or rabbitmq endpoint of the cluster.
        internal_memcached = Toolbox.is_service_internally_managed(service="memcached")
        internal_rabbit_mq = Toolbox.is_service_internally_managed(service="rabbitmq")
        memcached_endpoints = Configuration.get(key="/ovs/framework/memcache|endpoints")
        rabbit_mq_endpoints = Configuration.get(key="/ovs/framework/messagequeue|endpoints")
        copy_memcached_endpoints = list(memcached_endpoints)
        copy_rabbit_mq_endpoints = list(rabbit_mq_endpoints)
        for endpoint in memcached_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_memcached_endpoints.remove(endpoint)
        for endpoint in rabbit_mq_endpoints:
            if endpoint.startswith(storage_router_to_remove.ip):
                copy_rabbit_mq_endpoints.remove(endpoint)
        if len(copy_memcached_endpoints) == 0 and internal_memcached is True:
            raise RuntimeError(
                "Removal of provided nodes will result in a complete removal of the memcached service"
            )
        if len(copy_rabbit_mq_endpoints) == 0 and internal_rabbit_mq is True:
            raise RuntimeError(
                "Removal of provided nodes will result in a complete removal of the messagequeue service"
            )
    except Exception as exception:
        Toolbox.log(
            logger=NodeRemovalController._logger, messages=[str(exception)], boxed=True, loglevel="exception"
        )
        sys.exit(1)

    #################
    # CONFIRMATIONS #
    #################
    interactive = silent != "--force-yes"
    remove_asd_manager = not interactive  # Remove ASD manager if non-interactive else ask
    if interactive is True:
        proceed = Interactive.ask_yesno(
            message="Are you sure you want to remove node {0}?".format(storage_router_to_remove.name),
            default_value=False,
        )
        if proceed is False:
            Toolbox.log(logger=NodeRemovalController._logger, messages="Abort removal", title=True)
            sys.exit(1)
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            if ServiceManager.has_service(name="asd-manager", client=client):
                remove_asd_manager = Interactive.ask_yesno(
                    message="Do you also want to remove the ASD manager and related ASDs?", default_value=False
                )
        # Give registered hooks a chance to veto the ASD removal
        if remove_asd_manager is True or storage_router_to_remove_online is False:
            for function in Toolbox.fetch_hooks("setup", "validate_asd_removal"):
                validation_output = function(storage_router_to_remove.ip)
                if validation_output["confirm"] is True:
                    if Interactive.ask_yesno(message=validation_output["question"], default_value=False) is False:
                        remove_asd_manager = False
                        break

    ###########
    # REMOVAL #
    ###########
    try:
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages="Starting removal of node {0} - {1}".format(
                storage_router_to_remove.name, storage_router_to_remove.ip
            ),
        )
        if storage_router_to_remove_online is False:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=" Marking all Storage Drivers served by Storage Router {0} as offline".format(
                    storage_router_to_remove.ip
                ),
            )
            StorageDriverController.mark_offline(storagerouter_guid=storage_router_to_remove.guid)

        # Remove vPools
        # Fix: dropped the original's no-op ".format(storage_router_to_remove.ip)"
        # on this message — the string contains no placeholder, so the call had
        # no effect on the emitted text.
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=" Removing vPools from node",
        )
        storage_routers_offline_guids = [
            sr.guid for sr in storage_routers_offline if sr.guid != storage_router_to_remove.guid
        ]
        for storage_driver in storage_router_to_remove.storagedrivers:
            Toolbox.log(
                logger=NodeRemovalController._logger,
                messages=" Removing vPool {0} from node".format(storage_driver.vpool.name),
            )
            StorageRouterController.remove_storagedriver(
                storagedriver_guid=storage_driver.guid, offline_storage_router_guids=storage_routers_offline_guids
            )

        # Demote if MASTER
        if storage_router_to_remove.node_type == "MASTER":
            NodeTypeController.demote_node(
                cluster_ip=storage_router_to_remove.ip,
                master_ip=master_ip,
                ip_client_map=ip_client_map,
                unique_id=storage_router_to_remove.machine_id,
                unconfigure_memcached=internal_memcached,
                unconfigure_rabbitmq=internal_rabbit_mq,
                offline_nodes=storage_routers_offline,
            )

        # Stop / remove services (only possible when the node is still reachable)
        Toolbox.log(logger=NodeRemovalController._logger, messages="Stopping and removing services")
        config_store = Configuration.get_store()
        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            NodeRemovalController.remove_services(
                client=client,
                node_type=storage_router_to_remove.node_type.lower(),
                logger=NodeRemovalController._logger,
            )
            service = "watcher-config"
            if ServiceManager.has_service(service, client=client):
                Toolbox.log(logger=NodeRemovalController._logger, messages="Removing service {0}".format(service))
                ServiceManager.stop_service(service, client=client)
                ServiceManager.remove_service(service, client=client)
            if config_store == "etcd":
                from ovs.extensions.db.etcd.installer import EtcdInstaller

                # Only tear down the Etcd cluster when it is internally managed
                if Configuration.get(key="/ovs/framework/external_config") is None:
                    Toolbox.log(logger=NodeRemovalController._logger, messages=" Removing Etcd cluster")
                    try:
                        EtcdInstaller.stop("config", client)
                        EtcdInstaller.remove("config", client)
                    except Exception as ex:
                        # Best effort: log and continue, the node is being removed anyway
                        Toolbox.log(
                            logger=NodeRemovalController._logger,
                            messages=["\nFailed to unconfigure Etcd", ex],
                            loglevel="exception",
                        )
                Toolbox.log(logger=NodeRemovalController._logger, messages="Removing Etcd proxy")
                EtcdInstaller.remove_proxy("config", client.ip)

        Toolbox.run_hooks(
            component="noderemoval",
            sub_component="remove",
            logger=NodeRemovalController._logger,
            cluster_ip=storage_router_to_remove.ip,
            complete_removal=remove_asd_manager,
        )

        # Clean up model: delete dependent objects before the StorageRouter itself
        Toolbox.log(logger=NodeRemovalController._logger, messages="Removing node from model")
        for service in storage_router_to_remove.services:
            service.delete()
        for disk in storage_router_to_remove.disks:
            for partition in disk.partitions:
                partition.delete()
            disk.delete()
        for j_domain in storage_router_to_remove.domains:
            j_domain.delete()
        Configuration.delete("/ovs/framework/hosts/{0}".format(storage_router_to_remove.machine_id))

        NodeTypeController.restart_framework_and_memcache_services(
            clients=ip_client_map,
            offline_node_ips=[node.ip for node in storage_routers_offline],
            logger=NodeRemovalController._logger,
        )

        if storage_router_to_remove_online is True:
            client = SSHClient(endpoint=storage_router_to_remove, username="******")
            if config_store == "arakoon":
                client.file_delete(filenames=[ArakoonConfiguration.CACC_LOCATION])
            client.file_delete(filenames=[Configuration.BOOTSTRAP_CONFIG_LOCATION])
        storage_router_to_remove.delete()
        Toolbox.log(logger=NodeRemovalController._logger, messages="Successfully removed node\n")
    except Exception as exception:
        Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages=["An unexpected error occurred:", str(exception)],
            boxed=True,
            loglevel="exception",
        )
        sys.exit(1)
    except KeyboardInterrupt:
        # KeyboardInterrupt does not inherit from Exception, so it reaches this handler
        Toolbox.log(logger=NodeRemovalController._logger, messages="\n")
        Toolbox.log(
            logger=NodeRemovalController._logger,
            messages="This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.",
            boxed=True,
            loglevel="error",
        )
        sys.exit(1)

    if remove_asd_manager is True:
        Toolbox.log(logger=NodeRemovalController._logger, messages="\nRemoving ASD Manager")
        with remote(storage_router_to_remove.ip, [os]) as rem:
            rem.os.system("asd-manager remove --force-yes")
    Toolbox.log(logger=NodeRemovalController._logger, messages="Remove nodes finished", title=True)
def get_package_information_core(client, package_info):
    """
    Called by GenericController.refresh_package_information() every hour
    Retrieve information about the currently installed versions of the core packages
    Retrieve information about the versions to which each package can potentially be updated
    If installed version is different from candidate version --> store this information in model

    Additionally check the services with a 'run' file
    Verify whether the running version is up-to-date with the candidate version
    If different --> store this information in the model

    Result: Every package with updates or which requires services to be restarted is stored in the model

    :param client: Client on which to collect the version information
    :type client: SSHClient
    :param package_info: Dictionary passed in by the thread calling this function
    :type package_info: dict
    :return: Package information
    :rtype: dict
    """
    try:
        # Package queries require root privileges on the target node
        if client.username != 'root':
            raise RuntimeError('Only the "root" user can retrieve the package information')

        binaries = PackageManager.get_binary_versions(client=client, package_names=UpdateController.core_packages_with_binaries)
        installed = PackageManager.get_installed_versions(client=client, package_names=UpdateController.all_core_packages)
        candidate = PackageManager.get_candidate_versions(client=client, package_names=UpdateController.all_core_packages)
        # Both maps must cover every core package, otherwise the comparison below would be incomplete
        if set(installed.keys()) != set(UpdateController.all_core_packages) or set(candidate.keys()) != set(UpdateController.all_core_packages):
            raise RuntimeError('Failed to retrieve the installed and candidate versions for packages: {0}'.format(', '.join(UpdateController.all_core_packages)))

        # Retrieve Arakoon information
        # 'cacc' and 'ovsdb' clusters belong to the framework component, 'voldrv' to the storagedriver component.
        # Only internally managed clusters are appended (as their restartable service names).
        framework_arakoons = []
        storagedriver_arakoons = []
        for cluster, arakoon_list in {'cacc': framework_arakoons, 'ovsdb': framework_arakoons, 'voldrv': storagedriver_arakoons}.iteritems():
            cluster_name = ArakoonClusterConfig.get_cluster_name(cluster)
            if cluster_name is None:
                continue

            if cluster == 'cacc':
                # The config cluster (cacc) lives on the local filesystem, so it needs the filesystem flag and an IP
                arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name, filesystem=True, ip=client.ip)
            else:
                arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)

            if arakoon_metadata['internal'] is True:
                arakoon_list.append(ArakoonInstaller.get_service_name_for_cluster(cluster_name=arakoon_metadata['cluster_name']))

        # Collect the ALBA proxy service names running on this StorageRouter
        storagerouter = StorageRouterList.get_by_ip(client.ip)
        alba_proxies = []
        for service in storagerouter.services:
            if service.type.name == ServiceType.SERVICE_TYPES.ALBA_PROXY:
                alba_proxies.append(service.name)

        # Each StorageDriver implies a DTL and a volumedriver service per vPool
        storagedriver_services = []
        for sd in storagerouter.storagedrivers:
            storagedriver_services.append('ovs-dtl_{0}'.format(sd.vpool.name))
            storagedriver_services.append('ovs-volumedriver_{0}'.format(sd.vpool.name))

        default_entry = {'candidate': None,
                         'installed': None,
                         'services_to_restart': []}

        #                    component: package_name: services_with_run_file
        for component, info in {'framework': {'arakoon': framework_arakoons,
                                              'openvstorage': []},
                                'storagedriver': {'alba': alba_proxies,
                                                  'arakoon': storagedriver_arakoons,
                                                  'volumedriver-no-dedup-base': [],
                                                  'volumedriver-no-dedup-server': storagedriver_services}}.iteritems():
            component_info = {}
            for package, services in info.iteritems():
                for service in services:
                    service = ExtensionToolbox.remove_prefix(service, 'ovs-')
                    version_file = '/opt/OpenvStorage/run/{0}.version'.format(service)
                    if not client.file_exists(version_file):
                        UpdateController._logger.warning('{0}: Failed to find a version file in /opt/OpenvStorage/run for service {1}'.format(client.ip, service))
                        continue
                    package_name = package
                    running_versions = client.file_read(version_file).strip()
                    # A version file can hold multiple ';'-separated entries, optionally
                    # of the form '<package>=<version>' to override the package name
                    for version in running_versions.split(';'):
                        version = version.strip()
                        running_version = None
                        if '=' in version:
                            # NOTE(review): package_name set here carries over to subsequent
                            # entries without '=' in the same file — presumably intentional
                            package_name = version.split('=')[0]
                            running_version = version.split('=')[1]
                        elif version:
                            running_version = version

                        if package_name not in UpdateController.all_core_packages:
                            raise ValueError('Unknown package dependency found in {0}'.format(version_file))
                        if package_name not in binaries:
                            raise RuntimeError('Binary version for package {0} was not retrieved'.format(package_name))

                        # A running binary version older/different from the installed binary
                        # means the service must be restarted after the update
                        if running_version is not None and running_version != binaries[package_name]:
                            if package_name not in component_info:
                                component_info[package_name] = copy.deepcopy(default_entry)
                            component_info[package_name]['installed'] = running_version
                            component_info[package_name]['candidate'] = binaries[package_name]
                            component_info[package_name]['services_to_restart'].append('ovs-{0}'.format(service))

                # Record a pending package update even when no service restart is required
                if installed[package] != candidate[package] and package not in component_info:
                    component_info[package] = copy.deepcopy(default_entry)
                    component_info[package]['installed'] = installed[package]
                    component_info[package]['candidate'] = candidate[package]
            if component_info:
                if component not in package_info[client.ip]:
                    package_info[client.ip][component] = {}
                package_info[client.ip][component].update(component_info)
    except Exception as ex:
        # Errors are collected per IP instead of raised, since this runs in a worker thread
        if 'errors' not in package_info[client.ip]:
            package_info[client.ip]['errors'] = []
        package_info[client.ip]['errors'].append(ex)
    return package_info
def _get_update_information_cluster_alba(cls, client, update_info, package_info):
    """
    In this function the services for each component / package combination are defined
    This service information consists out of:
        * Services to stop (before update) and start (after update of packages) -> 'services_stop_start'
        * Services to restart after update (post-update logic)                  -> 'services_post_update'
        * Down-times which will be caused due to service restarts               -> 'downtime'
        * Prerequisites that have not been met                                  -> 'prerequisites'

    Verify whether all relevant services have the correct binary active
    Whether a service has the correct binary version in use, we use the ServiceFactory.get_service_update_versions functionality
    When a service has an older binary version running, we add this information to the 'update_info'

    This combined information is then stored in the 'package_information' of the StorageRouter DAL object

    :param client: SSHClient on which to retrieve the service information required for an update
    :type client: ovs.extensions.generic.sshclient.SSHClient
    :param update_info: Dictionary passed in by the thread calling this function used to store all update information
    :type update_info: dict
    :param package_info: Dictionary containing the components and packages which have an update available for current SSHClient
    :type package_info: dict
    :return: None
    :rtype: NoneType
    """
    cls._logger.info('StorageRouter {0}: Refreshing ALBA update information'.format(client.ip))
    try:
        binaries = cls._package_manager.get_binary_versions(client=client)
        storagerouter = StorageRouterList.get_by_ip(ip=client.ip)
        cls._logger.debug('StorageRouter {0}: Binary versions: {1}'.format(client.ip, binaries))

        # Retrieve Arakoon information
        # Maps an internal Arakoon service name to its downtime entry
        # (['backend', <backend name>]) or None when no downtime is caused
        arakoon_info = {}
        for service in storagerouter.services:
            if service.type.name not in [ServiceType.SERVICE_TYPES.ALBA_MGR, ServiceType.SERVICE_TYPES.NS_MGR]:
                continue

            if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
                cluster_name = service.abm_service.abm_cluster.name
                alba_backend_name = service.abm_service.abm_cluster.alba_backend.name
            else:
                cluster_name = service.nsm_service.nsm_cluster.name
                alba_backend_name = service.nsm_service.nsm_cluster.alba_backend.name

            cls._logger.debug('StorageRouter {0}: Retrieving update information for Arakoon cluster {1}'.format(client.ip, cluster_name))
            arakoon_update_info = ArakoonInstaller.get_arakoon_update_info(cluster_name=cluster_name)
            cls._logger.debug('StorageRouter {0}: Arakoon update information for cluster {1}: {2}'.format(client.ip, cluster_name, arakoon_update_info))
            if arakoon_update_info['internal'] is True:
                arakoon_info[arakoon_update_info['service_name']] = ['backend', alba_backend_name] if arakoon_update_info['downtime'] is True else None

        for component, package_names in PackageFactory.get_package_info()['names'].iteritems():
            package_names = sorted(package_names)
            cls._logger.debug('StorageRouter {0}: Validating component {1} and related packages: {2}'.format(client.ip, component, package_names))

            if component not in update_info[client.ip]:
                update_info[client.ip][component] = copy.deepcopy(ServiceFactory.DEFAULT_UPDATE_ENTRY)
            # Aliases into the shared structures; mutations below update them in place
            svc_component_info = update_info[client.ip][component]
            pkg_component_info = package_info.get(component, {})

            for package_name in package_names:
                cls._logger.debug('StorageRouter {0}: Validating ALBA plugin related package {1}'.format(client.ip, package_name))
                if package_name == PackageFactory.PKG_OVS_BACKEND and package_name in pkg_component_info:
                    # Updating the backend plugin requires restarting the framework watcher
                    # and memcached, causing GUI and API downtime
                    if ['gui', None] not in svc_component_info['downtime']:
                        svc_component_info['downtime'].append(['gui', None])
                    if ['api', None] not in svc_component_info['downtime']:
                        svc_component_info['downtime'].append(['api', None])
                    svc_component_info['services_stop_start'][10].append('ovs-watcher-framework')
                    svc_component_info['services_stop_start'][20].append('memcached')
                    cls._logger.debug('StorageRouter {0}: Added services "ovs-watcher-framework" and "memcached" to stop-start services'.format(client.ip))
                    cls._logger.debug('StorageRouter {0}: Added GUI and API to downtime'.format(client.ip))
                elif package_name in [PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE]:
                    # Retrieve proxy service information
                    for service in storagerouter.services:
                        if service.type.name != ServiceType.SERVICE_TYPES.ALBA_PROXY or service.alba_proxy is None:
                            continue

                        service_version = None
                        # Only check the running binary when the package itself has no pending update
                        if package_name not in pkg_component_info:
                            service_version = ServiceFactory.get_service_update_versions(client=client, service_name=service.name, binary_versions=binaries)

                        cls._logger.debug('StorageRouter {0}: Service {1} is running version {2}'.format(client.ip, service.name, service_version))
                        if package_name in pkg_component_info or service_version is not None:
                            if service_version is not None and package_name not in svc_component_info['packages']:
                                svc_component_info['packages'][package_name] = service_version
                            svc_component_info['services_post_update'][10].append('ovs-{0}'.format(service.name))
                            cls._logger.debug('StorageRouter {0}: Added service {1} to post-update services'.format(client.ip, 'ovs-{0}'.format(service.name)))

                            downtime = ['proxy', service.alba_proxy.storagedriver.vpool.name]
                            if downtime not in svc_component_info['downtime']:
                                svc_component_info['downtime'].append(downtime)
                                cls._logger.debug('StorageRouter {0}: Added ALBA proxy downtime for vPool {1} to downtime'.format(client.ip, service.alba_proxy.storagedriver.vpool.name))

                if package_name in [PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE, PackageFactory.PKG_ARAKOON]:
                    # Arakoon services gathered above may require a restart for these packages
                    for service_name, downtime in arakoon_info.iteritems():
                        service_version = ServiceFactory.get_service_update_versions(client=client, service_name=service_name, binary_versions=binaries, package_name=package_name)
                        cls._logger.debug('StorageRouter {0}: Arakoon service {1} information: {2}'.format(client.ip, service_name, service_version))

                        if package_name in pkg_component_info or service_version is not None:
                            svc_component_info['services_post_update'][10].append('ovs-{0}'.format(service_name))
                            cls._logger.debug('StorageRouter {0}: Added service {1} to post-update services'.format(client.ip, 'ovs-{0}'.format(service_name)))
                            if service_version is not None and package_name not in svc_component_info['packages']:
                                svc_component_info['packages'][package_name] = service_version
                            if downtime is not None and downtime not in svc_component_info['downtime']:
                                svc_component_info['downtime'].append(downtime)
                                cls._logger.debug('StorageRouter {0}: Added Arakoon cluster for ALBA Backend {1} to downtime'.format(client.ip, downtime[1]))

                # Extend the service information with the package information related to this repository for current StorageRouter
                if package_name in pkg_component_info and package_name not in svc_component_info['packages']:
                    cls._logger.debug('StorageRouter {0}: Adding package {1} because it has an update available'.format(client.ip, package_name))
                    svc_component_info['packages'][package_name] = pkg_component_info[package_name]

            if component == PackageFactory.COMP_ALBA:
                # An unreachable ALBA node is a prerequisite failure for the ALBA component
                for alba_node in AlbaNodeList.get_albanodes():
                    try:
                        alba_node.client.get_metadata()
                    except:
                        svc_component_info['prerequisites'].append(['alba_node_unresponsive', alba_node.ip])
                        cls._logger.debug('StorageRouter {0}: Added unresponsive ALBA Node {1} to prerequisites'.format(client.ip, alba_node.ip))

            # Verify whether migration (DAL and extension) code needs to be executed (only if no packages have an update available so far)
            elif component == PackageFactory.COMP_FWK and PackageFactory.PKG_OVS_BACKEND not in svc_component_info['packages']:
                cls._logger.debug('StorageRouter {0}: No updates detected, checking for required migrations'.format(client.ip))
                # Extension migration check
                key = '/ovs/framework/hosts/{0}/versions'.format(System.get_my_machine_id(client=client))
                old_version = Configuration.get(key, default={}).get(PackageFactory.COMP_MIGRATION_ALBA)
                installed_version = str(cls._package_manager.get_installed_versions(client=client, package_names=[PackageFactory.PKG_OVS_BACKEND])[PackageFactory.PKG_OVS_BACKEND])
                migrations_detected = False
                if old_version is not None:
                    cls._logger.debug('StorageRouter {0}: Current running version for {1} extension migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, old_version))
                    # Compare the migrator version available on the remote node with the recorded one
                    with remote(client.ip, [ExtensionMigrator]) as rem:
                        cls._logger.debug('StorageRouter {0}: Available version for {1} extension migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, rem.ExtensionMigrator.THIS_VERSION))
                        if rem.ExtensionMigrator.THIS_VERSION > old_version:
                            migrations_detected = True
                            # A pending migration is surfaced as a pseudo package update
                            svc_component_info['packages'][PackageFactory.PKG_OVS_BACKEND] = {'installed': 'migrations',
                                                                                              'candidate': installed_version}

                # DAL migration check (only when no extension migration was already detected)
                if migrations_detected is False:
                    persistent_client = PersistentFactory.get_client()
                    old_version = persistent_client.get('ovs_model_version').get(PackageFactory.COMP_MIGRATION_ALBA) if persistent_client.exists('ovs_model_version') else None
                    if old_version is not None:
                        cls._logger.debug('StorageRouter {0}: Current running version for {1} DAL migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, old_version))
                        with remote(client.ip, [DALMigrator]) as rem:
                            cls._logger.debug('StorageRouter {0}: Available version for {1} DAL migrations: {2}'.format(client.ip, PackageFactory.COMP_ALBA, rem.DALMigrator.THIS_VERSION))
                            if rem.DALMigrator.THIS_VERSION > old_version:
                                svc_component_info['packages'][PackageFactory.PKG_OVS_BACKEND] = {'installed': 'migrations',
                                                                                                  'candidate': installed_version}

        cls._logger.info('StorageRouter {0}: Refreshed ALBA update information'.format(client.ip))
    except Exception as ex:
        # Errors are collected per IP instead of raised, since this runs in a worker thread
        cls._logger.exception('StorageRouter {0}: Refreshing ALBA update information failed'.format(client.ip))
        if 'errors' not in update_info[client.ip]:
            update_info[client.ip]['errors'] = []
        update_info[client.ip]['errors'].append(ex)