def remove_slot(node_guid, slot_id):
    """
    Removes a slot
    :param node_guid: Guid of the node to remove a disk from
    :type node_guid: str
    :param slot_id: Slot ID
    :type slot_id: str
    :return: None
    :rtype: NoneType
    """
    # Verify client connectivity
    node = AlbaNode(node_guid)
    osds = [osd for osd in node.osds if osd.slot_id == slot_id]
    if len(osds) > 0:
        raise RuntimeError('A slot with claimed OSDs can\'t be removed')
    node.client.clear_slot(slot_id)
    node.invalidate_dynamics()
    # Sync model
    if node.storagerouter is not None:
        stack = node.client.get_stack()  # type: dict
        slot_information = stack.get(slot_id, {})
        slot_aliases = slot_information.get('aliases', [])
        for disk in node.storagerouter.disks:
            if set(disk.aliases).intersection(set(slot_aliases)):
                partition = disk.partitions[0]
                if DiskPartition.ROLES.BACKEND in partition.roles:
                    partition.roles.remove(DiskPartition.ROLES.BACKEND)
                    partition.save()
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)

def remove_disk(node_guid, device_alias):
    """
    Removes a disk
    :param node_guid: Guid of the node to remove a disk from
    :type node_guid: str
    :param device_alias: Alias of the device to remove (eg: /dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0)
    :type device_alias: str
    :return: None
    """
    asds = {}
    node = AlbaNode(node_guid)
    node_id = node.node_id
    device_id = device_alias.split('/')[-1]
    offline_node = False

    # Verify client connectivity
    try:
        _ = node.client.get_disks()
    except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate disks'.format(node.guid))
        offline_node = True

    # Retrieve ASD information for the ALBA Disk
    for backend in AlbaBackendList.get_albabackends():
        local_stack = backend.local_stack
        if node_id in local_stack and device_id in local_stack[node_id]:
            asds.update(local_stack[node_id][device_id]['asds'])
    for asd_info in asds.values():
        if (offline_node is False and asd_info.get('status') != 'available') or (offline_node is True and asd_info.get('status_detail') == 'nodedown'):
            AlbaNodeController._logger.error('Disk {0} has still non-available ASDs on node {1}'.format(device_alias, node.ip))
            raise RuntimeError('Disk {0} on ALBA node {1} has still some non-available ASDs'.format(device_alias, node_id))

    # Retrieve the Disk from the framework model matching the ALBA Disk
    disk_to_clear = None
    for disk in DiskList.get_disks():
        if device_alias in disk.aliases:
            disk_to_clear = disk
            break

    # Remove the ALBA Disk making use of the ASD Manager Client
    if offline_node is False:
        result = node.client.remove_disk(disk_id=device_id,
                                         partition_aliases=disk_to_clear.partitions[0].aliases if len(disk_to_clear.partitions) > 0 else [])
        if result['_success'] is False:
            raise RuntimeError('Error removing disk {0}: {1}'.format(device_alias, result['_error']))

    # Clean the model
    for model_disk in node.disks:
        if device_alias in model_disk.aliases:
            for osd in model_disk.osds:
                osd.delete()
            model_disk.delete()
    if disk_to_clear is not None:
        for partition in disk_to_clear.partitions:
            partition.roles = []
            partition.mountpoint = None
            partition.save()
    node.invalidate_dynamics()
    if node.storagerouter is not None:
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)

def initialize_disks(node_guid, disks):
    """
    Initializes 1 or multiple disks
    :param node_guid: Guid of the node to which the disks belong
    :type node_guid: str
    :param disks: Disks to initialize (key: device_alias, value: amount of ASDs to deploy)
    :type disks: dict
    :return: Dict of all failures with as key the Diskname, and as value the error
    :rtype: dict
    """
    node = AlbaNode(node_guid)
    try:
        available_disks = node.client.get_disks()
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.exception('Could not connect to node {0} to validate disks'.format(node.guid))
        raise

    failures = {}
    added_disks = []
    for device_alias, amount in disks.iteritems():
        device_id = device_alias.split('/')[-1]
        AlbaNodeController._logger.debug('Initializing disk {0} at node {1}'.format(device_alias, node.ip))
        if device_id not in available_disks or available_disks[device_id]['available'] is False:
            AlbaNodeController._logger.exception('Disk {0} not available on node {1}'.format(device_alias, node.ip))
            failures[device_alias] = 'Disk unavailable'
        else:
            add_disk_result = node.client.add_disk(disk_id=device_id)
            # Verify if an AlbaDisk with found aliases already exists (eg: When initialize individual and initialize all run at the same time)
            exists = False
            aliases = add_disk_result['aliases']
            for alba_disk in node.disks:
                if set(alba_disk.aliases).intersection(set(aliases)):
                    exists = True
                    break
            if exists is True:
                continue
            disk = AlbaDisk()
            disk.aliases = aliases
            disk.alba_node = node
            disk.save()
            if add_disk_result['_success'] is False:
                failures[device_alias] = add_disk_result['_error']
                disk.delete()
            else:
                device_id = disk.aliases[0].split('/')[-1]
                for _ in xrange(amount):
                    add_asd_result = node.client.add_asd(disk_id=device_id)
                    if add_asd_result['_success'] is False:
                        failures[device_alias] = add_asd_result['_error']
                added_disks.extend(add_disk_result['aliases'])
    if node.storagerouter is not None:
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
        for disk in node.storagerouter.disks:
            if set(disk.aliases).intersection(set(added_disks)):
                partition = disk.partitions[0]
                if DiskPartition.ROLES.BACKEND not in partition.roles:
                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                    partition.save()
    return failures

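# Hedged usage sketch (assumption, not part of the source): illustrates the expected shape of the
# 'disks' argument for initialize_disks, i.e. device alias -> number of ASDs to deploy. The node
# guid, device alias and the AlbaNodeController entry point are hypothetical placeholders.
def _example_initialize_two_asds():
    example_disks = {'/dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c29f4cf04566-lun-0': 2}  # 2 ASDs on this device
    failures = AlbaNodeController.initialize_disks(node_guid='00000000-0000-0000-0000-000000000000',
                                                   disks=example_disks)
    return failures  # An empty dict means every requested disk was initialized successfully
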
def refresh_hardware(storagerouter_guid):
    """
    Refreshes all hardware related information
    :param storagerouter_guid: Guid of the StorageRouter to refresh the hardware on
    :type storagerouter_guid: str
    :return: None
    :rtype: NoneType
    """
    StorageRouterController.set_rdma_capability(storagerouter_guid)
    DiskController.sync_with_reality(storagerouter_guid)

def initialize_disks(node_guid, disks):
    """
    Initializes a disk
    :param node_guid: Guid of the node which disks need to be initialized
    :type node_guid: str
    :param disks: Disks to initialize
    :type disks: dict
    :return: Dict of all failures with as key the Diskname, and as value the error
    :rtype: dict
    """
    node = AlbaNode(node_guid)
    try:
        available_disks = node.client.get_disks()
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.exception('Could not connect to node {0} to validate disks'.format(node.guid))
        raise
    failures = {}
    added_disks = []
    for disk_id, amount in disks.iteritems():
        AlbaNodeController._logger.debug('Initializing disk {0} at node {1}'.format(disk_id, node.ip))
        if disk_id not in available_disks or available_disks[disk_id]['available'] is False:
            AlbaNodeController._logger.exception('Disk {0} not available on node {1}'.format(disk_id, node.ip))
            failures[disk_id] = 'Disk unavailable'
        else:
            disk = AlbaDisk()
            disk.name = disk_id
            disk.alba_node = node
            disk.save()
            result = node.client.add_disk(disk_id)
            if result['_success'] is False:
                failures[disk_id] = result['_error']
                disk.delete()
            else:
                device = result['device']
                for _ in xrange(amount):
                    result = node.client.add_asd(disk_id)
                    if result['_success'] is False:
                        failures[disk_id] = result['_error']
                added_disks.append(device)
    if node.storagerouter is not None:
        DiskController.sync_with_reality(node.storagerouter_guid)
        for disk in node.storagerouter.disks:
            if disk.path in added_disks:
                partition = disk.partitions[0]
                partition.roles.append(DiskPartition.ROLES.BACKEND)
                partition.save()
    return failures

def _fill_slot(cls, node, slot_id, extra):
    # type: (AlbaNode, str, any) -> List[dict]
    """
    Fills in the slots with ASDs and checks if the BACKEND role needs to be added
    :param node: The AlbaNode to fill on
    :type node: AlbaNode
    :param slot_id: ID of the slot to fill (which is an alias of the slot)
    :type slot_id: str
    :param extra: Extra information for filling
    :type extra: any
    :return: Information about the created osds
    :rtype: List[dict]
    """
    if node.type == AlbaNode.NODE_TYPES.S3:
        extra = extra.copy()
        try:
            s3_transaction_cluster = S3TransactionClusterList.get_s3_transaction_clusters()[0]
            extra['transaction_arakoon_url'] = Configuration.get_configuration_path(key=s3_transaction_cluster.config_location)
        except IndexError:
            raise RuntimeError('No transaction arakoon was deployed for this cluster!')
    created_osds = node.client.fill_slot(slot_id=slot_id, extra=extra)
    cls._logger.info(created_osds)
    # Sync model
    if node.storagerouter is not None:
        stack = node.client.get_stack()  # type: dict
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
        slot_information = stack.get(slot_id, {})
        slot_aliases = slot_information.get('aliases', [])
        for disk in node.storagerouter.disks:
            if set(disk.aliases).intersection(set(slot_aliases)):
                partition = disk.partitions[0]
                if DiskPartition.ROLES.BACKEND not in partition.roles:
                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                    partition.save()
    return created_osds or []  # Always return a list

def remove_disk(node_guid, disk):
    """
    Removes a disk
    :param node_guid: Guid of the node to remove a disk from
    :type node_guid: str
    :param disk: Disk name to remove
    :type disk: str
    :return: None
    """
    node = AlbaNode(node_guid)
    offline_node = False
    try:
        if disk not in node.client.get_disks():
            raise RuntimeError('Disk {0} not available on node {1}'.format(disk, node.guid))
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate disks'.format(node.guid))
        offline_node = True
    node_id = node.node_id
    asds = {}
    for backend in AlbaBackendList.get_albabackends():
        storage_stack = backend.storage_stack
        if node_id in storage_stack and disk in storage_stack[node_id]:
            asds.update(storage_stack[node_id][disk]['asds'])
    for asd_info in asds.values():
        if (offline_node is False and asd_info['status'] != 'available') or (offline_node is True and asd_info['status_detail'] == 'nodedown'):
            AlbaNodeController._logger.error('Disk {0} has still non-available ASDs on node {1}'.format(disk, node.ip))
            raise RuntimeError('Disk {0} has still some non-available ASDs'.format(disk))
    if offline_node is False:
        result = node.client.remove_disk(disk)
        if result['_success'] is False:
            raise RuntimeError('Error removing disk {0}: {1}'.format(disk, result['_error']))
    for model_disk in node.disks:
        if model_disk.name == disk:
            for asd in model_disk.asds:
                asd.delete()
            model_disk.delete()
    node.invalidate_dynamics()
    if node.storagerouter is not None:
        DiskController.sync_with_reality(node.storagerouter_guid)

def remove_slot(node_cluster_guid, node_guid, slot_id):
    # type: (str, str, str) -> None
    """
    Removes a slot
    :param node_cluster_guid: Guid of the node cluster to remove a disk from
    :type node_cluster_guid: str
    :param node_guid: Guid of the AlbaNode to act as the 'active' side
    :type node_guid: basestring
    :param slot_id: Slot ID
    :type slot_id: str
    :return: None
    :rtype: NoneType
    """
    node_cluster = AlbaNodeCluster(node_cluster_guid)
    active_node = AlbaNode(node_guid)
    if active_node not in node_cluster.alba_nodes:
        raise ValueError('The requested active AlbaNode is not part of AlbaNodeCluster {0}'.format(node_cluster.guid))
    osds = [osd for osd in active_node.osds if osd.slot_id == slot_id]
    if len(osds) > 0:
        raise RuntimeError('A slot with claimed OSDs can\'t be removed')
    active_node.client.clear_slot(slot_id)
    active_node.invalidate_dynamics()
    # Invalidate the stack and sync towards all passive sides
    for node in node_cluster.alba_nodes:
        if node != active_node:
            try:
                node.client.sync_stack(active_node.stack)
            except:
                AlbaNodeClusterController._logger.exception('Error while syncing stacks to the passive side')
    if active_node.storagerouter is not None:
        DiskController.sync_with_reality(storagerouter_guid=active_node.storagerouter_guid)

def remove_asd(node_guid, asd_id, expected_safety):
    """
    Removes an ASD
    :param node_guid: Guid of the node to remove an ASD from
    :type node_guid: str
    :param asd_id: ID of the ASD to remove
    :type asd_id: str
    :param expected_safety: Expected safety after having removed the ASD
    :type expected_safety: dict or None
    :return: Aliases of the disk on which the ASD was removed
    :rtype: list
    """
    node = AlbaNode(node_guid)
    AlbaNodeController._logger.debug('Removing ASD {0} at node {1}'.format(asd_id, node.ip))
    model_osd = None
    for disk in node.disks:
        for asd in disk.osds:
            if asd.osd_id == asd_id:
                model_osd = asd
                break
        if model_osd is not None:
            break
    if model_osd is not None:
        alba_backend = model_osd.alba_backend
    else:
        alba_backend = None

    asds = {}
    try:
        asds = node.client.get_asds()
    except (requests.ConnectionError, requests.Timeout, InvalidCredentialsError):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate ASD'.format(node.guid))
    partition_alias = None
    for alias, asd_ids in asds.iteritems():
        if asd_id in asd_ids:
            partition_alias = alias
            break

    if alba_backend is not None:
        if expected_safety is None:
            AlbaNodeController._logger.warning('Skipping safety check for ASD {0} on backend {1} - this is dangerous'.format(asd_id, alba_backend.guid))
        else:
            final_safety = AlbaController.calculate_safety(alba_backend_guid=alba_backend.guid, removal_osd_ids=[asd_id])
            safety_lost = final_safety['lost']
            safety_crit = final_safety['critical']
            if (safety_crit != 0 or safety_lost != 0) and (safety_crit != expected_safety['critical'] or safety_lost != expected_safety['lost']):
                raise RuntimeError('Cannot remove ASD {0} as the current safety is not as expected ({1} vs {2})'.format(asd_id, final_safety, expected_safety))
            AlbaNodeController._logger.debug('Safety OK for ASD {0} on backend {1}'.format(asd_id, alba_backend.guid))
        AlbaNodeController._logger.debug('Purging ASD {0} on backend {1}'.format(asd_id, alba_backend.guid))
        AlbaController.remove_units(alba_backend_guid=alba_backend.guid, osd_ids=[asd_id])
    else:
        AlbaNodeController._logger.warning('Could not match ASD {0} to any backend. Cannot purge'.format(asd_id))

    disk_data = None
    if partition_alias is not None:
        AlbaNodeController._logger.debug('Removing ASD {0} from disk {1}'.format(asd_id, partition_alias))
        for device_info in node.client.get_disks().itervalues():
            if partition_alias in device_info['partition_aliases']:
                disk_data = device_info
                result = node.client.delete_asd(disk_id=device_info['aliases'][0].split('/')[-1], asd_id=asd_id)
                if result['_success'] is False:
                    raise RuntimeError('Error removing ASD: {0}'.format(result['_error']))
        if disk_data is None:  # No disk on the node matched the partition alias
            raise RuntimeError('Failed to find disk for partition with alias {0}'.format(partition_alias))
    else:
        AlbaNodeController._logger.warning('Could not remove ASD {0} from remote node (node down)'.format(asd_id))

    if Configuration.exists(AlbaNodeController.ASD_CONFIG.format(asd_id), raw=True):
        Configuration.delete(AlbaNodeController.ASD_CONFIG_DIR.format(asd_id), raw=True)

    if model_osd is not None:
        model_osd.delete()
    if alba_backend is not None:
        alba_backend.invalidate_dynamics()
        alba_backend.backend.invalidate_dynamics()
    if node.storagerouter is not None:
        DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
    return [] if disk_data is None else disk_data.get('aliases', [])

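# Hedged usage sketch (assumption, not part of the source): the 'expected_safety' argument mirrors
# the dict returned by AlbaController.calculate_safety, carrying the 'critical' and 'lost' counters
# that remove_asd compares above. The node guid and ASD id are hypothetical placeholders.
def _example_remove_asd_with_safety_check():
    expected_safety = {'critical': 0, 'lost': 0}  # Require that no namespaces end up critical or lost
    disk_aliases = AlbaNodeController.remove_asd(node_guid='00000000-0000-0000-0000-000000000000',
                                                 asd_id='hypothetical-asd-id',
                                                 expected_safety=expected_safety)
    return disk_aliases  # Aliases of the disk the ASD was removed from (empty list when the node was down)
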
def add_vpool(cls, parameters):
    """
    Add a vPool to the machine this task is running on
    :param parameters: Parameters for vPool creation
    :type parameters: dict
    :return: None
    :rtype: NoneType
    """
    # TODO: Add logging
    cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
    # VALIDATIONS
    if not isinstance(parameters, dict):
        raise ValueError('Parameters passed to create a vPool should be of type dict')

    # Check StorageRouter existence
    storagerouter = StorageRouterList.get_by_ip(ip=parameters.get('storagerouter_ip'))
    if storagerouter is None:
        raise RuntimeError('Could not find StorageRouter')

    # Validate requested vPool configurations
    vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
    vp_installer.validate(storagerouter=storagerouter)

    # Validate requested StorageDriver configurations
    cls._logger.info('vPool {0}: Validating StorageDriver configurations'.format(vp_installer.name))
    sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                          configurations={'storage_ip': parameters.get('storage_ip'),
                                                          'caching_info': parameters.get('caching_info'),
                                                          'backend_info': {'main': parameters.get('backend_info'),
                                                                           StorageDriverConfiguration.CACHE_BLOCK: parameters.get('backend_info_bc'),
                                                                           StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('backend_info_fc')},
                                                          'connection_info': {'main': parameters.get('connection_info'),
                                                                              StorageDriverConfiguration.CACHE_BLOCK: parameters.get('connection_info_bc'),
                                                                              StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('connection_info_fc')},
                                                          'sd_configuration': parameters.get('config_params')})

    partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(storagerouter.guid))

    try:
        # VPOOL CREATION
        # Create the vPool as soon as possible in the process to be displayed in the GUI (INSTALLING/EXTENDING state)
        if vp_installer.is_new is True:
            vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
            vp_installer.configure_mds(config=parameters.get('mds_config_params', {}))
        else:
            vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

        # ADDITIONAL VALIDATIONS
        # Check StorageRouter connectivity
        cls._logger.info('vPool {0}: Validating StorageRouter connectivity'.format(vp_installer.name))
        linked_storagerouters = [storagerouter]
        if vp_installer.is_new is False:
            linked_storagerouters += [sd.storagerouter for sd in vp_installer.vpool.storagedrivers]

        sr_client_map = SSHClient.get_clients(endpoints=linked_storagerouters, user_names=['ovs', 'root'])
        offline_nodes = sr_client_map.pop('offline')
        if storagerouter in offline_nodes:
            raise RuntimeError('Node on which the vPool is being {0} is not reachable'.format('created' if vp_installer.is_new is True else 'extended'))

        sr_installer = StorageRouterInstaller(root_client=sr_client_map[storagerouter]['root'],
                                              sd_installer=sd_installer,
                                              vp_installer=vp_installer,
                                              storagerouter=storagerouter)

        # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in locked context
        partitions_mutex.acquire(wait=60)
        sr_installer.partition_info = StorageRouterController.get_partition_info(storagerouter_guid=storagerouter.guid)
        sr_installer.validate_vpool_extendable()
        sr_installer.validate_global_write_buffer(requested_size=parameters.get('writecache_size', 0))
        sr_installer.validate_local_cache_size(requested_proxies=parameters.get('parallelism', {}).get('proxies', 2))

        # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
        sd_installer.create()
        sd_installer.create_partitions()
        partitions_mutex.release()

        vp_installer.refresh_metadata()
    except Exception:
        cls._logger.exception('Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'.format(vp_installer.name, storagerouter.name))
        partitions_mutex.release()
        vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        raise

    # Arakoon setup
    counter = 0
    while counter < 300:
        try:
            if StorageDriverController.manual_voldrv_arakoon_checkup() is True:
                break
        except Exception:
            cls._logger.exception('Arakoon checkup for voldrv cluster failed')
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise
        counter += 1
        time.sleep(1)
        if counter == 300:
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise RuntimeError('Arakoon checkup for the StorageDriver cluster could not be started')

    # Cluster registry
    try:
        vp_installer.configure_cluster_registry(allow_raise=True)
    except Exception:
        if vp_installer.is_new is True:
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        else:
            vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
        raise

    try:
        sd_installer.setup_proxy_configs()
        sd_installer.configure_storagedriver_service()
        DiskController.sync_with_reality(storagerouter.guid)
        MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vp_installer.vpool)

        # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
        vp_installer.vpool.invalidate_dynamics('configuration')
        if vp_installer.mds_safety is not None and vp_installer.vpool.configuration['mds_config']['mds_safety'] != vp_installer.mds_safety:
            Configuration.set(key='/ovs/vpools/{0}/mds_config|mds_safety'.format(vp_installer.vpool.guid), value=vp_installer.mds_safety)

        sd_installer.start_services()  # Create and start watcher volumedriver, DTL, proxies and StorageDriver services

        # Post creation/extension checkups
        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vp_installer.vpool, offline_nodes=offline_nodes)
        for sr, clients in sr_client_map.iteritems():
            for current_storagedriver in [sd for sd in sr.storagedrivers if sd.vpool_guid == vp_installer.vpool.guid]:
                storagedriver_config = StorageDriverConfiguration(vpool_guid=vp_installer.vpool.guid,
                                                                  storagedriver_id=current_storagedriver.storagedriver_id)
                if storagedriver_config.config_missing is False:
                    # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                    # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                    storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_config_set[sr.guid])
                    storagedriver_config.save(client=clients['ovs'])

        # Everything's reconfigured, refresh new cluster configuration
        for current_storagedriver in vp_installer.vpool.storagedrivers:
            if current_storagedriver.storagerouter not in sr_client_map:
                continue
            vp_installer.vpool.storagedriver_client.update_cluster_node_configs(str(current_storagedriver.storagedriver_id), req_timeout_secs=10)
    except Exception:
        cls._logger.exception('vPool {0}: Creation failed'.format(vp_installer.name))
        vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise

    # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
    # Scheduled tasks below, so don't really care whether they succeed or not
    try:
        VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
    except:
        pass

    for vdisk in vp_installer.vpool.vdisks:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
        except:
            pass

    vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('Add vPool {0} ended successfully'.format(vp_installer.name))

def shrink_vpool(cls, storagedriver_guid, offline_storage_router_guids=list()):
    """
    Removes a StorageDriver (if it's the last StorageDriver for a vPool, the vPool is removed as well)
    :param storagedriver_guid: Guid of the StorageDriver to remove
    :type storagedriver_guid: str
    :param offline_storage_router_guids: Guids of StorageRouters which are offline and will be removed from cluster.
                                         WHETHER VPOOL WILL BE DELETED DEPENDS ON THIS
    :type offline_storage_router_guids: list
    :return: None
    :rtype: NoneType
    """
    # TODO: Add logging
    # TODO: Unit test individual pieces of code
    # Validations
    storagedriver = StorageDriver(storagedriver_guid)
    storagerouter = storagedriver.storagerouter
    cls._logger.info('StorageDriver {0} - Deleting StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    vp_installer = VPoolInstaller(name=storagedriver.vpool.name)
    vp_installer.validate(storagedriver=storagedriver)

    sd_installer = StorageDriverInstaller(vp_installer=vp_installer, storagedriver=storagedriver)

    cls._logger.info('StorageDriver {0} - Checking availability of related StorageRouters'.format(storagedriver.guid, storagedriver.name))
    sr_client_map = SSHClient.get_clients(endpoints=[sd.storagerouter for sd in vp_installer.vpool.storagedrivers],
                                          user_names=['root'])
    sr_installer = StorageRouterInstaller(root_client=sr_client_map.get(storagerouter, {}).get('root'),
                                          storagerouter=storagerouter,
                                          vp_installer=vp_installer,
                                          sd_installer=sd_installer)

    offline_srs = sr_client_map.pop('offline')
    if sorted([sr.guid for sr in offline_srs]) != sorted(offline_storage_router_guids):
        raise RuntimeError('Not all StorageRouters are reachable')

    if storagerouter not in offline_srs:
        mtpt_pids = sr_installer.root_client.run("lsof -t +D '/mnt/{0}' || true".format(vp_installer.name.replace(r"'", r"'\''")),
                                                 allow_insecure=True).splitlines()
        if len(mtpt_pids) > 0:
            raise RuntimeError('vPool cannot be deleted. Following processes keep the vPool mount point occupied: {0}'.format(', '.join(mtpt_pids)))

    # Retrieve reachable StorageDrivers
    reachable_storagedrivers = []
    for sd in vp_installer.vpool.storagedrivers:
        if sd.storagerouter not in sr_client_map:
            # StorageRouter is offline
            continue

        sd_key = '/ovs/vpools/{0}/hosts/{1}/config'.format(vp_installer.vpool.guid, sd.storagedriver_id)
        if Configuration.exists(sd_key) is True:
            path = Configuration.get_configuration_path(sd_key)
            with remote(sd.storagerouter.ip, [LocalStorageRouterClient]) as rem:
                try:
                    lsrc = rem.LocalStorageRouterClient(path)
                    lsrc.server_revision()  # 'Cheap' call to verify whether volumedriver is responsive
                    cls._logger.info('StorageDriver {0} - Responsive StorageDriver {1} on node with IP {2}'.format(storagedriver.guid, sd.name, sd.storagerouter.ip))
                    reachable_storagedrivers.append(sd)
                except Exception as exception:
                    if not is_connection_failure(exception):
                        raise

    if len(reachable_storagedrivers) == 0:
        raise RuntimeError('Could not find any responsive node in the cluster')

    # Start removal
    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.SHRINKING)
    else:
        vp_installer.update_status(status=VPool.STATUSES.DELETING)

    # Clean up stale vDisks
    cls._logger.info('StorageDriver {0} - Removing stale vDisks'.format(storagedriver.guid))
    VDiskController.remove_stale_vdisks(vpool=vp_installer.vpool)

    # Reconfigure the MDSes
    cls._logger.info('StorageDriver {0} - Reconfiguring MDSes'.format(storagedriver.guid))
    for vdisk_guid in storagerouter.vdisks_guids:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk_guid,
                                               excluded_storagerouter_guids=[storagerouter.guid] + offline_storage_router_guids)
        except Exception:
            cls._logger.exception('StorageDriver {0} - vDisk {1} - Ensuring MDS safety failed'.format(storagedriver.guid, vdisk_guid))

    # Validate that all MDSes on current StorageRouter have been moved away
    # Ensure safety does not always throw an error, that's why we perform this check here instead of in the Exception clause of above code
    vdisks = []
    for mds in vp_installer.mds_services:
        for junction in mds.vdisks:
            vdisk = junction.vdisk
            if vdisk in vdisks:
                continue
            vdisks.append(vdisk)
            cls._logger.critical('StorageDriver {0} - vDisk {1} {2} - MDS Services have not been migrated away'.format(storagedriver.guid, vdisk.guid, vdisk.name))
    if len(vdisks) > 0:
        # Put back in RUNNING, so it can be used again. Errors keep on displaying in GUI now anyway
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
        raise RuntimeError('Not all MDS Services have been successfully migrated away')

    # Start with actual removal
    errors_found = False
    if storagerouter not in offline_srs:
        errors_found &= sd_installer.stop_services()

    errors_found &= vp_installer.configure_cluster_registry(exclude=[storagedriver], apply_on=reachable_storagedrivers)
    errors_found &= vp_installer.update_node_distance_map()
    errors_found &= vp_installer.remove_mds_services()
    errors_found &= sd_installer.clean_config_management()
    errors_found &= sd_installer.clean_model()

    if storagerouter not in offline_srs:
        errors_found &= sd_installer.clean_directories(mountpoints=StorageRouterController.get_mountpoints(client=sr_installer.root_client))

        try:
            DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)
        except Exception:
            cls._logger.exception('StorageDriver {0} - Synchronizing disks with reality failed'.format(storagedriver.guid))
            errors_found = True

    if vp_installer.storagedriver_amount > 1:
        # Update the vPool metadata and run DTL checkup
        vp_installer.vpool.metadata['caching_info'].pop(sr_installer.storagerouter.guid, None)
        vp_installer.vpool.save()

        try:
            VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
        except Exception:
            cls._logger.exception('StorageDriver {0} - DTL checkup failed for vPool {1} with guid {2}'.format(storagedriver.guid, vp_installer.name, vp_installer.vpool.guid))
    else:
        cls._logger.info('StorageDriver {0} - Removing vPool from model'.format(storagedriver.guid))
        # Clean up model
        try:
            vp_installer.vpool.delete()
        except Exception:
            errors_found = True
            cls._logger.exception('StorageDriver {0} - Cleaning up vPool from the model failed'.format(storagedriver.guid))
        Configuration.delete('/ovs/vpools/{0}'.format(vp_installer.vpool.guid))

    cls._logger.info('StorageDriver {0} - Running MDS checkup'.format(storagedriver.guid))
    try:
        MDSServiceController.mds_checkup()
    except Exception:
        cls._logger.exception('StorageDriver {0} - MDS checkup failed'.format(storagedriver.guid))

    # Update vPool status
    if errors_found is True:
        if vp_installer.storagedriver_amount > 1:
            vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise RuntimeError('1 or more errors occurred while trying to remove the StorageDriver. Please check the logs for more information')

    if vp_installer.storagedriver_amount > 1:
        vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('StorageDriver {0} - Deleted StorageDriver {1}'.format(storagedriver.guid, storagedriver.name))

    if len(VPoolList.get_vpools()) == 0:
        cluster_name = ArakoonInstaller.get_cluster_name('voldrv')
        if ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)['internal'] is True:
            cls._logger.debug('StorageDriver {0} - Removing Arakoon cluster {1}'.format(storagedriver.guid, cluster_name))
            try:
                installer = ArakoonInstaller(cluster_name=cluster_name)
                installer.load()
                installer.delete_cluster()
            except Exception:
                cls._logger.exception('StorageDriver {0} - Delete voldrv Arakoon cluster failed'.format(storagedriver.guid))
            service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
            for service in list(service_type.services):
                if service.name == service_name:
                    service.delete()

    # Remove watcher volumedriver service if last StorageDriver on current StorageRouter
    if len(storagerouter.storagedrivers) == 0 and storagerouter not in offline_srs:  # ensure client is initialized for StorageRouter
        try:
            if cls._service_manager.has_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client):
                cls._service_manager.stop_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
                cls._service_manager.remove_service(ServiceFactory.SERVICE_WATCHER_VOLDRV, client=sr_installer.root_client)
        except Exception:
            cls._logger.exception('StorageDriver {0} - {1} service deletion failed'.format(storagedriver.guid, ServiceFactory.SERVICE_WATCHER_VOLDRV))

def configure_disk(storagerouter_guid, disk_guid, partition_guid, offset, size, roles):
    """
    Configures a partition
    :param storagerouter_guid: Guid of the StorageRouter to configure a disk on
    :type storagerouter_guid: str
    :param disk_guid: Guid of the disk to configure
    :type disk_guid: str
    :param partition_guid: Guid of the partition on the disk
    :type partition_guid: str
    :param offset: Offset for the partition
    :type offset: int
    :param size: Size of the partition
    :type size: int
    :param roles: Roles assigned to the partition
    :type roles: list
    :return: None
    :rtype: NoneType
    """
    # Validations
    storagerouter = StorageRouter(storagerouter_guid)
    for role in roles:
        if role not in DiskPartition.ROLES or role == DiskPartition.ROLES.BACKEND:
            raise RuntimeError('Invalid role specified: {0}'.format(role))
    disk = Disk(disk_guid)
    if disk.storagerouter_guid != storagerouter_guid:
        raise RuntimeError('The given Disk is not on the given StorageRouter')
    for partition in disk.partitions:
        if DiskPartition.ROLES.BACKEND in partition.roles:
            raise RuntimeError('The given Disk is in use by a Backend')

    if len({DiskPartition.ROLES.DB, DiskPartition.ROLES.DTL}.intersection(set(roles))) > 0:
        roles_on_sr = StorageRouterController._get_roles_on_storagerouter(storagerouter.ip)
        for role in [DiskPartition.ROLES.DB, DiskPartition.ROLES.DTL]:
            if role in roles_on_sr and role in roles and roles_on_sr[role][0] != disk.name:  # DB and DTL roles still have to be unassignable
                raise RoleDuplicationException('Disk {0} cannot have the {1} role due to presence on disk {2}'.format(disk.name, role, roles_on_sr[role][0]))

    # Create partition
    if partition_guid is None:
        StorageRouterController._logger.debug('Creating new partition - Offset: {0} bytes - Size: {1} bytes - Roles: {2}'.format(offset, size, roles))
        with remote(storagerouter.ip, [DiskTools], username='******') as rem:
            if len(disk.aliases) == 0:
                raise ValueError('Disk {0} does not have any aliases'.format(disk.name))
            rem.DiskTools.create_partition(disk_alias=disk.aliases[0],
                                           disk_size=disk.size,
                                           partition_start=offset,
                                           partition_size=size)
        DiskController.sync_with_reality(storagerouter_guid)
        disk = Disk(disk_guid)
        end_point = offset + size
        partition = None
        for part in disk.partitions:
            if offset < part.offset + part.size and end_point > part.offset:
                partition = part
                break

        if partition is None:
            raise RuntimeError('No new partition detected on disk {0} after having created 1'.format(disk.name))
        StorageRouterController._logger.debug('Partition created')
    else:
        StorageRouterController._logger.debug('Using existing partition')
        partition = DiskPartition(partition_guid)
        if partition.disk_guid != disk_guid:
            raise RuntimeError('The given DiskPartition is not on the given Disk')
        if partition.filesystem in ['swap', 'linux_raid_member', 'LVM2_member']:
            raise RuntimeError("It is not allowed to assign roles on partitions of type: ['swap', 'linux_raid_member', 'LVM2_member']")
        metadata = StorageRouterController.get_metadata(storagerouter_guid)
        partition_info = metadata['partitions']
        removed_roles = set(partition.roles) - set(roles)
        used_roles = []
        for role in removed_roles:
            for info in partition_info[role]:
                if info['in_use'] and info['guid'] == partition.guid:
                    used_roles.append(role)
        if len(used_roles) > 0:
            raise RuntimeError('Roles in use cannot be removed. Used roles: {0}'.format(', '.join(used_roles)))

    # Add filesystem
    if partition.filesystem is None or partition_guid is None:
        StorageRouterController._logger.debug('Creating filesystem')
        if len(partition.aliases) == 0:
            raise ValueError('Partition with offset {0} does not have any aliases'.format(partition.offset))
        with remote(storagerouter.ip, [DiskTools], username='******') as rem:
            rem.DiskTools.make_fs(partition_alias=partition.aliases[0])
        DiskController.sync_with_reality(storagerouter_guid)
        partition = DiskPartition(partition.guid)
        if partition.filesystem not in ['ext4', 'xfs']:
            raise RuntimeError('Unexpected filesystem')
        StorageRouterController._logger.debug('Filesystem created')

    # Mount the partition and add to FSTab
    if partition.mountpoint is None:
        StorageRouterController._logger.debug('Configuring mount point')
        with remote(storagerouter.ip, [DiskTools], username='******') as rem:
            counter = 1
            mountpoint = '/mnt/{0}{1}'.format('ssd' if disk.is_ssd else 'hdd', counter)
            while True:
                if not rem.DiskTools.mountpoint_exists(mountpoint):
                    break
                counter += 1
                mountpoint = '/mnt/{0}{1}'.format('ssd' if disk.is_ssd else 'hdd', counter)
            StorageRouterController._logger.debug('Found mount point: {0}'.format(mountpoint))
            rem.DiskTools.add_fstab(partition_aliases=partition.aliases,
                                    mountpoint=mountpoint,
                                    filesystem=partition.filesystem)
            rem.DiskTools.mount(mountpoint)
        DiskController.sync_with_reality(storagerouter_guid)
        partition = DiskPartition(partition.guid)
        if partition.mountpoint != mountpoint:
            raise RuntimeError('Unexpected mount point')
        StorageRouterController._logger.debug('Mount point configured')

    partition.roles = roles
    partition.save()
    StorageRouterController._logger.debug('Partition configured')

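# Hedged usage sketch (assumption, not part of the source): creating a new 10 GiB partition at the
# start of a disk and assigning it the DB role. Passing partition_guid=None triggers the
# 'Create partition' branch above. All guids below are hypothetical placeholders.
def _example_configure_db_partition():
    StorageRouterController.configure_disk(storagerouter_guid='00000000-0000-0000-0000-000000000000',
                                           disk_guid='11111111-1111-1111-1111-111111111111',
                                           partition_guid=None,
                                           offset=0,
                                           size=10 * 1024 ** 3,  # 10 GiB expressed in bytes
                                           roles=[DiskPartition.ROLES.DB])
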
def migrate(previous_version):
    """
    Migrates from a given version to the current version. It uses 'previous_version' to be smart
    wherever possible, but the code should be able to migrate any version towards the expected version.
    When this is not possible, the code can set a minimum version and raise when it is not met.
    :param previous_version: The previous version from which to start the migration
    :type previous_version: float
    """
    working_version = previous_version

    if working_version == 0:
        # Initial version:
        # * Set the version to THIS RELEASE version
        from ovs.dal.hybrids.user import User
        from ovs.dal.hybrids.group import Group
        from ovs.dal.hybrids.role import Role
        from ovs.dal.hybrids.client import Client
        from ovs.dal.hybrids.j_rolegroup import RoleGroup
        from ovs.dal.hybrids.j_roleclient import RoleClient
        from ovs.dal.hybrids.servicetype import ServiceType
        from ovs.dal.hybrids.branding import Branding
        from ovs.dal.lists.backendtypelist import BackendTypeList

        # Create groups
        admin_group = Group()
        admin_group.name = 'administrators'
        admin_group.description = 'Administrators'
        admin_group.save()
        viewers_group = Group()
        viewers_group.name = 'viewers'
        viewers_group.description = 'Viewers'
        viewers_group.save()

        # Create users
        admin = User()
        admin.username = '******'
        admin.password = hashlib.sha256('admin').hexdigest()
        admin.is_active = True
        admin.group = admin_group
        admin.save()

        # Create internal OAuth 2 clients
        admin_pw_client = Client()
        admin_pw_client.ovs_type = 'INTERNAL'
        admin_pw_client.grant_type = 'PASSWORD'
        admin_pw_client.user = admin
        admin_pw_client.save()
        admin_cc_client = Client()
        admin_cc_client.ovs_type = 'INTERNAL'
        admin_cc_client.grant_type = 'CLIENT_CREDENTIALS'
        admin_cc_client.client_secret = ''.join(random.choice(string.ascii_letters + string.digits + '|_=+*#@!/-[]{}<>.?,\'";:~') for _ in range(128))
        admin_cc_client.user = admin
        admin_cc_client.save()

        # Create roles
        read_role = Role()
        read_role.code = 'read'
        read_role.name = 'Read'
        read_role.description = 'Can read objects'
        read_role.save()
        write_role = Role()
        write_role.code = 'write'
        write_role.name = 'Write'
        write_role.description = 'Can write objects'
        write_role.save()
        manage_role = Role()
        manage_role.code = 'manage'
        manage_role.name = 'Manage'
        manage_role.description = 'Can manage the system'
        manage_role.save()

        # Attach groups to roles
        mapping = [(admin_group, [read_role, write_role, manage_role]),
                   (viewers_group, [read_role])]
        for setting in mapping:
            for role in setting[1]:
                rolegroup = RoleGroup()
                rolegroup.group = setting[0]
                rolegroup.role = role
                rolegroup.save()
            for user in setting[0].users:
                for role in setting[1]:
                    for client in user.clients:
                        roleclient = RoleClient()
                        roleclient.client = client
                        roleclient.role = role
                        roleclient.save()

        # Add service types
        for service_type_info in [ServiceType.SERVICE_TYPES.MD_SERVER, ServiceType.SERVICE_TYPES.ALBA_PROXY, ServiceType.SERVICE_TYPES.ARAKOON]:
            service_type = ServiceType()
            service_type.name = service_type_info
            service_type.save()

        # Branding
        branding = Branding()
        branding.name = 'Default'
        branding.description = 'Default bootstrap theme'
        branding.css = 'bootstrap-default.min.css'
        branding.productname = 'Open vStorage'
        branding.is_default = True
        branding.save()
        slate = Branding()
        slate.name = 'Slate'
        slate.description = 'Dark bootstrap theme'
        slate.css = 'bootstrap-slate.min.css'
        slate.productname = 'Open vStorage'
        slate.is_default = False
        slate.save()

    # From here on, all actual migration should happen to get to the expected state for THIS RELEASE
    elif working_version < OVSMigrator.THIS_VERSION:
        # Migrate unique constraints
        from ovs.dal.helpers import HybridRunner, Descriptor
        from ovs.extensions.storage.persistentfactory import PersistentFactory
        client = PersistentFactory.get_client()
        hybrid_structure = HybridRunner.get_hybrids()
        for class_descriptor in hybrid_structure.values():
            cls = Descriptor().load(class_descriptor).get_object()
            classname = cls.__name__.lower()
            unique_key = 'ovs_unique_{0}_{{0}}_'.format(classname)
            uniques = []
            # noinspection PyProtectedMember
            for prop in cls._properties:
                if prop.unique is True and len([k for k in client.prefix(unique_key.format(prop.name))]) == 0:
                    uniques.append(prop.name)
            if len(uniques) > 0:
                prefix = 'ovs_data_{0}_'.format(classname)
                for key in client.prefix(prefix):
                    data = client.get(key)
                    for property_name in uniques:
                        ukey = '{0}{1}'.format(unique_key.format(property_name), hashlib.sha1(str(data[property_name])).hexdigest())
                        client.set(ukey, key)

        # Complete rework of the way we detect devices to assign roles or use as ASD
        # Allow loop-, raid-, nvme-, ??-devices and logical volumes as ASD (https://github.com/openvstorage/framework/issues/792)
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
        from ovs.lib.disk import DiskController

        for storagerouter in StorageRouterList.get_storagerouters():
            try:
                client = SSHClient(storagerouter, username='******')
            except UnableToConnectException:
                raise

            # Retrieve all symlinks for all devices
            # Example of name_alias_mapping:
            # {'/dev/md0': ['/dev/disk/by-id/md-uuid-ad2de634:26d97253:5eda0a23:96986b76', '/dev/disk/by-id/md-name-OVS-1:0'],
            #  '/dev/sda': ['/dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c295fe2ff771-lun-0'],
            #  '/dev/sda1': ['/dev/disk/by-uuid/e3e0bc62-4edc-4c6b-a6ce-1f39e8f27e41', '/dev/disk/by-path/pci-0000:03:00.0-sas-0x5000c295fe2ff771-lun-0-part1']}
            name_alias_mapping = {}
            alias_name_mapping = {}
            for path_type in client.dir_list(directory='/dev/disk'):
                if path_type in ['by-uuid', 'by-partuuid']:  # UUIDs can change after creating a filesystem on a partition
                    continue
                directory = '/dev/disk/{0}'.format(path_type)
                for symlink in client.dir_list(directory=directory):
                    symlink_path = '{0}/{1}'.format(directory, symlink)
                    link = client.file_read_link(symlink_path)
                    if link not in name_alias_mapping:
                        name_alias_mapping[link] = []
                    name_alias_mapping[link].append(symlink_path)
                    alias_name_mapping[symlink_path] = link

            for disk in storagerouter.disks:
                if disk.aliases is None:
                    # noinspection PyProtectedMember
                    device_path = '/dev/{0}'.format(disk.name)
                    disk.aliases = name_alias_mapping.get(device_path, [device_path])
                    disk.save()
                for partition in disk.partitions:
                    if partition.aliases is None:
                        # noinspection PyProtectedMember
                        partition_device = alias_name_mapping.get(partition._data.get('path'))
                        if partition_device is None:
                            partition.aliases = []
                            partition.save()
                            continue
                        partition.aliases = name_alias_mapping.get(partition_device, [])
                        partition.save()

            DiskController.sync_with_reality(storagerouter_guid=storagerouter.guid)

        # Only support ALBA backend type
        from ovs.dal.lists.backendtypelist import BackendTypeList
        for backend_type in BackendTypeList.get_backend_types():
            if backend_type.code != 'alba':
                backend_type.delete()

        # Reformat the vpool.metadata information
        from ovs.dal.lists.vpoollist import VPoolList
        for vpool in VPoolList.get_vpools():
            new_metadata = {}
            for metadata_key, value in vpool.metadata.items():
                new_info = {}
                storagerouter_guids = [key for key in vpool.metadata.keys() if not key.startswith('backend')]
                if isinstance(value, dict):
                    read_cache = value.get('backend_info', {}).get('fragment_cache_on_read', True)
                    write_cache = value.get('backend_info', {}).get('fragment_cache_on_write', False)
                    new_info['backend_info'] = {'alba_backend_guid': value.get('backend_guid'),
                                                'backend_guid': None,
                                                'frag_size': value.get('backend_info', {}).get('frag_size'),
                                                'name': value.get('name'),
                                                'policies': value.get('backend_info', {}).get('policies'),
                                                'preset': value.get('preset'),
                                                'sco_size': value.get('backend_info', {}).get('sco_size'),
                                                'total_size': value.get('backend_info', {}).get('total_size')}
                    new_info['arakoon_config'] = value.get('arakoon_config')
                    new_info['connection_info'] = {'host': value.get('connection', {}).get('host', ''),
                                                   'port': value.get('connection', {}).get('port', ''),
                                                   'local': value.get('connection', {}).get('local', ''),
                                                   'client_id': value.get('connection', {}).get('client_id', ''),
                                                   'client_secret': value.get('connection', {}).get('client_secret', '')}
                    if metadata_key == 'backend':
                        new_info['caching_info'] = dict((sr_guid, {'fragment_cache_on_read': read_cache,
                                                                   'fragment_cache_on_write': write_cache}) for sr_guid in storagerouter_guids)
                if metadata_key in storagerouter_guids:
                    metadata_key = 'backend_aa_{0}'.format(metadata_key)
                new_metadata[metadata_key] = new_info
            vpool.metadata = new_metadata
            vpool.save()

        # Removal of READ role
        from ovs.dal.lists.diskpartitionlist import DiskPartitionList
        for partition in DiskPartitionList.get_partitions():
            if 'READ' in partition.roles:
                partition.roles.remove('READ')
                partition.save()

    return OVSMigrator.THIS_VERSION

def remove_osd(node_guid, osd_id, expected_safety):
    """
    Removes an OSD
    :param node_guid: Guid of the node to remove an OSD from
    :type node_guid: str
    :param osd_id: ID of the OSD to remove
    :type osd_id: str
    :param expected_safety: Expected safety after having removed the OSD
    :type expected_safety: dict or None
    :return: Aliases of the disk on which the OSD was removed
    :rtype: list
    """
    # Retrieve corresponding OSD in model
    node = AlbaNode(node_guid)
    AlbaNodeController._logger.debug('Removing OSD {0} at node {1}'.format(osd_id, node.ip))
    osd = AlbaOSDList.get_by_osd_id(osd_id)
    alba_backend = osd.alba_backend

    if expected_safety is None:
        AlbaNodeController._logger.warning('Skipping safety check for OSD {0} on backend {1} - this is dangerous'.format(osd_id, alba_backend.guid))
    else:
        final_safety = AlbaController.calculate_safety(alba_backend_guid=alba_backend.guid, removal_osd_ids=[osd_id])
        safety_lost = final_safety['lost']
        safety_crit = final_safety['critical']
        if (safety_crit != 0 or safety_lost != 0) and (safety_crit != expected_safety['critical'] or safety_lost != expected_safety['lost']):
            raise RuntimeError('Cannot remove OSD {0} as the current safety is not as expected ({1} vs {2})'.format(osd_id, final_safety, expected_safety))
        AlbaNodeController._logger.debug('Safety OK for OSD {0} on backend {1}'.format(osd_id, alba_backend.guid))
    AlbaNodeController._logger.debug('Purging OSD {0} on backend {1}'.format(osd_id, alba_backend.guid))
    AlbaController.remove_units(alba_backend_guid=alba_backend.guid, osd_ids=[osd_id])

    # Delete the OSD
    result = node.client.delete_osd(slot_id=osd.slot_id, osd_id=osd_id)
    if result['_success'] is False:
        raise RuntimeError('Error removing OSD: {0}'.format(result['_error']))

    # Clean configuration management and model - Well, just try it at least
    if Configuration.exists(ASD_CONFIG.format(osd_id), raw=True):
        Configuration.delete(ASD_CONFIG_DIR.format(osd_id), raw=True)

    osd.delete()
    node.invalidate_dynamics()
    if alba_backend is not None:
        alba_backend.invalidate_dynamics()
        alba_backend.backend.invalidate_dynamics()
    if node.storagerouter is not None:
        try:
            DiskController.sync_with_reality(storagerouter_guid=node.storagerouter_guid)
        except UnableToConnectException:
            AlbaNodeController._logger.warning('Skipping disk sync since StorageRouter {0} is offline'.format(node.storagerouter.name))

    return [osd.slot_id]

def migrate():
    """
    Executes async migrations. It doesn't matter too much when they are executed, as long as they get eventually executed.
    This code will typically contain:
    * "dangerous" migration code (it needs certain running services)
    * Migration code depending on a cluster-wide state
    * ...
    """
    AlbaMigrationController._logger.info('Preparing out of band migrations...')

    from ovs.dal.hybrids.diskpartition import DiskPartition
    from ovs.dal.lists.albabackendlist import AlbaBackendList
    from ovs.dal.lists.albanodelist import AlbaNodeList
    from ovs.dal.lists.albaosdlist import AlbaOSDList
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.extensions.generic.configuration import Configuration
    from ovs.extensions.generic.sshclient import SSHClient, UnableToConnectException
    from ovs.extensions.migration.migration.albamigrator import ExtensionMigrator
    from ovs.extensions.packages.albapackagefactory import PackageFactory
    from ovs.extensions.services.albaservicefactory import ServiceFactory
    from ovs.extensions.plugins.albacli import AlbaCLI, AlbaError
    from ovs.lib.alba import AlbaController
    from ovs.lib.disk import DiskController

    AlbaMigrationController._logger.info('Start out of band migrations...')

    #############################################
    # Introduction of IP:port combination on OSDs
    osd_info_map = {}
    alba_backends = AlbaBackendList.get_albabackends()
    for alba_backend in alba_backends:
        AlbaMigrationController._logger.info('Verifying ALBA Backend {0}'.format(alba_backend.name))
        if alba_backend.abm_cluster is None:
            AlbaMigrationController._logger.warning('ALBA Backend {0} does not have an ABM cluster registered'.format(alba_backend.name))
            continue

        AlbaMigrationController._logger.debug('Retrieving configuration path for ALBA Backend {0}'.format(alba_backend.name))
        try:
            config = Configuration.get_configuration_path(alba_backend.abm_cluster.config_location)
        except:
            AlbaMigrationController._logger.exception('Failed to retrieve the configuration path for ALBA Backend {0}'.format(alba_backend.name))
            continue

        AlbaMigrationController._logger.info('Retrieving OSD information for ALBA Backend {0}'.format(alba_backend.name))
        try:
            osd_info = AlbaCLI.run(command='list-all-osds', config=config)
        except (AlbaError, RuntimeError):
            AlbaMigrationController._logger.exception('Failed to retrieve OSD information for ALBA Backend {0}'.format(alba_backend.name))
            continue

        for osd_info in osd_info:
            if osd_info.get('long_id'):
                osd_info_map[osd_info['long_id']] = {'ips': osd_info.get('ips', []),
                                                     'port': osd_info.get('port')}

    for osd in AlbaOSDList.get_albaosds():
        if osd.osd_id not in osd_info_map:
            AlbaMigrationController._logger.warning('OSD with ID {0} is modelled but could not be found through ALBA'.format(osd.osd_id))
            continue

        ips = osd_info_map[osd.osd_id]['ips']
        port = osd_info_map[osd.osd_id]['port']

        changes = False
        if osd.ips is None:
            changes = True
            osd.ips = ips
        if osd.port is None:
            changes = True
            osd.port = port
        if changes is True:
            AlbaMigrationController._logger.info('Updating OSD with ID {0} with IPS {1} and port {2}'.format(osd.osd_id, ips, port))
            osd.save()

    ###################################################
    # Read preference for GLOBAL ALBA Backends (1.10.3)  (https://github.com/openvstorage/framework-alba-plugin/issues/452)
    if Configuration.get(key='/ovs/framework/migration|read_preference', default=False) is False:
        try:
            name_backend_map = dict((alba_backend.name, alba_backend) for alba_backend in alba_backends)
            for alba_node in AlbaNodeList.get_albanodes():
                AlbaMigrationController._logger.info('Processing maintenance services running on ALBA Node {0} with ID {1}'.format(alba_node.ip, alba_node.node_id))
                alba_node.invalidate_dynamics('maintenance_services')
                for alba_backend_name, services in alba_node.maintenance_services.iteritems():
                    if alba_backend_name not in name_backend_map:
                        AlbaMigrationController._logger.error('ALBA Node {0} has services for an ALBA Backend {1} which is not modelled'.format(alba_node.ip, alba_backend_name))
                        continue

                    alba_backend = name_backend_map[alba_backend_name]
                    AlbaMigrationController._logger.info('Processing {0} ALBA Backend {1} with GUID {2}'.format(alba_backend.scaling, alba_backend.name, alba_backend.guid))
                    if alba_backend.scaling == alba_backend.SCALINGS.LOCAL:
                        read_preferences = [alba_node.node_id]
                    else:
                        read_preferences = AlbaController.get_read_preferences_for_global_backend(alba_backend=alba_backend,
                                                                                                  alba_node_id=alba_node.node_id,
                                                                                                  read_preferences=[])

                    for service_name, _ in services:
                        AlbaMigrationController._logger.info('Processing service {0}'.format(service_name))
                        old_config_key = '/ovs/alba/backends/{0}/maintenance/config'.format(alba_backend.guid)
                        new_config_key = '/ovs/alba/backends/{0}/maintenance/{1}/config'.format(alba_backend.guid, service_name)
                        if Configuration.exists(key=old_config_key):
                            new_config = Configuration.get(key=old_config_key)
                            new_config['read_preference'] = read_preferences
                            Configuration.set(key=new_config_key, value=new_config)
            for alba_backend in alba_backends:
                Configuration.delete(key='/ovs/alba/backends/{0}/maintenance/config'.format(alba_backend.guid))
            AlbaController.checkup_maintenance_agents.delay()

            Configuration.set(key='/ovs/framework/migration|read_preference', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating read preferences for ALBA Backends failed')

    #######################################################
    # Storing actual package name in version files (1.11.0)  (https://github.com/openvstorage/framework/issues/1876)
    changed_clients = set()
    storagerouters = StorageRouterList.get_storagerouters()
    if Configuration.get(key='/ovs/framework/migration|actual_package_name_in_version_file_alba', default=False) is False:
        try:
            service_manager = ServiceFactory.get_manager()
            alba_pkg_name, alba_version_cmd = PackageFactory.get_package_and_version_cmd_for(component=PackageFactory.COMP_ALBA)
            for storagerouter in storagerouters:
                try:
                    root_client = SSHClient(endpoint=storagerouter.ip, username='******')  # Use '.ip' instead of StorageRouter object because this code is executed during post-update at which point the heartbeat has not been updated for some time
                except UnableToConnectException:
                    AlbaMigrationController._logger.exception('Updating actual package name for version files failed on StorageRouter {0}'.format(storagerouter.ip))
                    continue

                for file_name in root_client.file_list(directory=ServiceFactory.RUN_FILE_DIR):
                    if not file_name.endswith('.version'):
                        continue
                    file_path = '{0}/{1}'.format(ServiceFactory.RUN_FILE_DIR, file_name)
                    contents = root_client.file_read(filename=file_path)
                    if alba_pkg_name == PackageFactory.PKG_ALBA_EE and '{0}='.format(PackageFactory.PKG_ALBA) in contents:
                        # Rewrite the version file in the RUN_FILE_DIR
                        contents = contents.replace(PackageFactory.PKG_ALBA, PackageFactory.PKG_ALBA_EE)
                        root_client.file_write(filename=file_path, contents=contents)

                        # Regenerate the service and update the EXTRA_VERSION_CMD in the configuration management
                        service_name = file_name.split('.')[0]
                        service_config_key = ServiceFactory.SERVICE_CONFIG_KEY.format(storagerouter.machine_id, service_name)
                        if Configuration.exists(key=service_config_key):
                            service_config = Configuration.get(key=service_config_key)
                            if 'EXTRA_VERSION_CMD' in service_config:
                                service_config['EXTRA_VERSION_CMD'] = '{0}=`{1}`'.format(alba_pkg_name, alba_version_cmd)
                                Configuration.set(key=service_config_key, value=service_config)
                                service_manager.regenerate_service(name='ovs-arakoon',
                                                                   client=root_client,
                                                                   target_name='ovs-{0}'.format(service_name))  # Leave out .version
                                changed_clients.add(root_client)
            Configuration.set(key='/ovs/framework/migration|actual_package_name_in_version_file_alba', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating actual package name for version files failed')

    for root_client in changed_clients:
        try:
            root_client.run(['systemctl', 'daemon-reload'])
        except Exception:
            AlbaMigrationController._logger.exception('Executing command "systemctl daemon-reload" failed')

    ####################################
    # Fix for migration version (1.11.0)
    # Previous code could potentially store a higher version number in the config management than the actual version number
    if Configuration.get(key='/ovs/framework/migration|alba_migration_version_fix', default=False) is False:
        try:
            for storagerouter in storagerouters:
                config_key = '/ovs/framework/hosts/{0}/versions'.format(storagerouter.machine_id)
                if Configuration.exists(key=config_key):
                    versions = Configuration.get(key=config_key)
                    if versions.get(PackageFactory.COMP_MIGRATION_ALBA, 0) > ExtensionMigrator.THIS_VERSION:
                        versions[PackageFactory.COMP_MIGRATION_ALBA] = ExtensionMigrator.THIS_VERSION
                        Configuration.set(key=config_key, value=versions)
            Configuration.set(key='/ovs/framework/migration|alba_migration_version_fix', value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating migration version failed')

    ####################################
    # Enable auto-cleanup
    migration_auto_cleanup_key = '/ovs/framework/migration|alba_auto_cleanup'
    if Configuration.get(key=migration_auto_cleanup_key, default=False) is False:
        try:
            for storagerouter in StorageRouterList.get_storagerouters():
                storagerouter.invalidate_dynamics('features')  # New feature was added
            errors = []
            for alba_backend in AlbaBackendList.get_albabackends():
                try:
                    AlbaController.set_auto_cleanup(alba_backend.guid)
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Failed to set the auto-cleanup for ALBA Backend {0}'.format(alba_backend.name))
                    errors.append(ex)
            if len(errors) == 0:
                Configuration.set(key=migration_auto_cleanup_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating auto cleanup failed')

    ####################################
    # Change cache eviction
    migration_random_eviction_key = '/ovs/framework/migration|alba_cache_eviction_random'
    if Configuration.get(key=migration_random_eviction_key, default=False) is False:
        try:
            errors = []
            for alba_backend in AlbaBackendList.get_albabackends():
                try:
                    AlbaController.set_cache_eviction(alba_backend.guid)
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Failed to set the auto-cleanup for ALBA Backend {0}'.format(alba_backend.name))
                    errors.append(ex)
            if len(errors) == 0:
                Configuration.set(key=migration_random_eviction_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Updating auto cleanup failed')

    ###################################################
    # Sync all disks and apply the backend role. Backend role was removed with the AD (since 1.10)
    albanode_backend_role_sync_key = '/ovs/framework/migration|albanode_backend_role_sync'
    if not Configuration.get(key=albanode_backend_role_sync_key, default=False):
        try:
            errors = []
            for alba_node in AlbaNodeList.get_albanodes():
                try:
                    if not alba_node.storagerouter:
                        continue
                    stack = alba_node.client.get_stack()  # type: dict
                    for slot_id, slot_information in stack.iteritems():
                        osds = slot_information.get('osds', {})  # type: dict
                        slot_aliases = slot_information.get('aliases', [])  # type: list
                        if not osds:  # No osds means no partition was made
                            continue
                        # Sync to add all potential partitions that will need a backend role
                        DiskController.sync_with_reality(storagerouter_guid=alba_node.storagerouter_guid)
                        for disk in alba_node.storagerouter.disks:
                            if set(disk.aliases).intersection(set(slot_aliases)):
                                partition = disk.partitions[0]
                                if DiskPartition.ROLES.BACKEND not in partition.roles:
                                    partition.roles.append(DiskPartition.ROLES.BACKEND)
                                    partition.save()
                except Exception as ex:
                    AlbaMigrationController._logger.exception('Syncing for storagerouter/albanode {0} failed'.format(alba_node.storagerouter.ip))
                    errors.append(ex)
            if not errors:
                Configuration.set(key=albanode_backend_role_sync_key, value=True)
        except Exception:
            AlbaMigrationController._logger.exception('Syncing up the disks for backend roles failed')

    AlbaMigrationController._logger.info('Finished out of band migrations')

def remove_asd(node_guid, asd_id, expected_safety):
    """
    Removes an ASD
    :param node_guid: Guid of the node to remove a disk from
    :type node_guid: str
    :param asd_id: ASD to remove
    :type asd_id: str
    :param expected_safety: Expected safety after having removed the disk
    :type expected_safety: dict
    :return: ID of the disk from which the ASD was removed (None when the node was unreachable)
    :rtype: str
    """
    node = AlbaNode(node_guid)
    AlbaNodeController._logger.debug('Removing ASD {0} at node {1}'.format(asd_id, node.ip))
    model_asd = None
    for disk in node.disks:
        for asd in disk.asds:
            if asd.asd_id == asd_id:
                model_asd = asd
                break
        if model_asd is not None:
            break
    if model_asd is None:
        raise RuntimeError('Could not locate asd {0} in the model'.format(asd_id))
    alba_backend = model_asd.alba_backend
    asds = {}
    try:
        asds = node.client.get_asds()
    except (requests.ConnectionError, requests.Timeout):
        AlbaNodeController._logger.warning('Could not connect to node {0} to validate asd'.format(node.guid))
    disk_id = None
    for _disk_id in asds:
        if asd_id in asds[_disk_id]:
            disk_id = _disk_id
            break
    AlbaController.remove_units(alba_backend.guid, [asd_id], absorb_exception=True)
    if disk_id is not None:
        final_safety = AlbaController.calculate_safety(alba_backend.guid, [asd_id])
        safety_lost = final_safety['lost']
        safety_crit = final_safety['critical']
        if (safety_crit != 0 or safety_lost != 0) and (safety_crit != expected_safety['critical'] or safety_lost != expected_safety['lost']):
            raise RuntimeError('Cannot remove ASD {0} as the current safety is not as expected ({1} vs {2})'.format(asd_id, final_safety, expected_safety))
        result = node.client.delete_asd(disk_id, asd_id)
        if result['_success'] is False:
            raise RuntimeError('Error removing ASD: {0}'.format(result['_error']))
    else:
        AlbaNodeController._logger.warning('Alba decommission osd {0} without safety validations (node down)'.format(asd_id))
    if EtcdConfiguration.exists(AlbaNodeController.ASD_CONFIG.format(asd_id), raw=True):
        EtcdConfiguration.delete(AlbaNodeController.ASD_CONFIG_DIR.format(asd_id), raw=True)
    model_asd.delete()
    alba_backend.invalidate_dynamics()
    alba_backend.backend.invalidate_dynamics()
    if node.storagerouter is not None:
        DiskController.sync_with_reality(node.storagerouter_guid)
    return disk_id