def get_mgmtcenter(pmachine=None, mgmt_center=None):
    """
    Returns the appropriate SDK client for the management center of the node
    @param pmachine: pmachine hybrid from DAL
    @param mgmt_center: mgmtcenter hybrid from DAL
    """
    if not ((pmachine is None) ^ (mgmt_center is None)):
        raise ValueError('Either a pMachine or a Management center should be passed')
    if pmachine is not None:
        mgmt_center = pmachine.mgmtcenter
        if mgmt_center is None:
            return None
    mgmtcenter_type = mgmt_center.type
    ip = mgmt_center.ip
    username = mgmt_center.username
    password = mgmt_center.password
    key = '{0}_{1}'.format(ip, username)
    if key not in Factory.mgmtcenters:
        mutex = file_mutex('mgmtcenter_{0}'.format(key))
        try:
            mutex.acquire(30)
            if key not in Factory.mgmtcenters:
                if mgmtcenter_type == 'VCENTER':
                    from mgmtcenters.vcenter import VCenter
                    mgmtcenter = VCenter(ip, username, password)
                elif mgmtcenter_type == 'OPENSTACK':
                    from mgmtcenters.openstack import OpenStack
                    mgmtcenter = OpenStack(ip, username, password)
                else:
                    raise NotImplementedError('Management center for {0} is not yet supported'.format(mgmtcenter_type))
                Factory.mgmtcenters[key] = mgmtcenter
        finally:
            mutex.release()
    return Factory.mgmtcenters[key]
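# Usage sketch for the factory method above (illustrative only, not part of the
# original module): `example_pmachine` stands in for a PMachine hybrid fetched
# from the DAL. Exactly one of the two keyword arguments must be passed; the
# XOR check at the top of get_mgmtcenter enforces this.
def _example_get_mgmtcenter_usage(example_pmachine):
    client = Factory.get_mgmtcenter(pmachine=example_pmachine)
    if client is None:
        return None  # The pMachine has no management center configured
    return client  # Cached VCenter/OpenStack SDK client, keyed on '<ip>_<username>'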
def get(pmachine):
    """
    Returns the appropriate hypervisor client class for a given PMachine
    """
    hvtype = pmachine.hvtype
    ip = pmachine.ip
    username = pmachine.username
    password = pmachine.password
    key = '{0}_{1}'.format(ip, username)
    if key not in Factory.hypervisors:
        mutex = file_mutex('hypervisor_{0}'.format(key))
        try:
            mutex.acquire(30)
            if key not in Factory.hypervisors:
                if hvtype == 'VMWARE':
                    from hypervisors.vmware import VMware
                    hypervisor = VMware(ip, username, password)
                elif hvtype == 'KVM':
                    from hypervisors.kvm import KVM
                    hypervisor = KVM(ip, username, password)
                else:
                    raise NotImplementedError('Hypervisor {0} is not yet supported'.format(hvtype))
                Factory.hypervisors[key] = hypervisor
        finally:
            mutex.release()
    return Factory.hypervisors[key]
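# Both factory methods above follow the same double-checked locking pattern:
# check the class-level cache, and only on a miss take a cross-process file
# lock and check again before constructing the client, so concurrent workers
# build at most one client per '<ip>_<username>' key. A minimal generic sketch
# of that pattern (the names below are illustrative, not from the source):
def _get_cached(cache, key, build_client):
    if key not in cache:
        mutex = file_mutex('cache_{0}'.format(key))
        try:
            mutex.acquire(30)
            if key not in cache:  # Re-check: another process may have built it while we waited on the lock
                cache[key] = build_client()
        finally:
            mutex.release()
    return cache[key]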
def add_fstab(partition_aliases, mountpoint, filesystem):
    """
    Add entry to /etc/fstab for mountpoint
    :param partition_aliases: Possible aliases of the partition to add
    :type partition_aliases: list
    :param mountpoint: Mountpoint on which device is mounted
    :type mountpoint: str
    :param filesystem: Filesystem used
    :type filesystem: str
    :return: None
    """
    if len(partition_aliases) == 0:
        raise ValueError('No partition aliases provided')

    with open('/etc/fstab', 'r') as fstab_file:
        lines = [line.strip() for line in fstab_file.readlines()]

    used_path = None
    used_index = None
    mount_line = None
    for device_alias in partition_aliases:
        for index, line in enumerate(lines):
            if line.startswith('#'):
                continue
            if line.startswith(device_alias) and re.match(r'^{0}\s+'.format(re.escape(device_alias)), line):
                used_path = device_alias
                used_index = index
            if len(line.split()) == 6 and line.split()[1] == mountpoint:  # Example line: 'UUID=40d99523-a1e7-4374-84f2-85b5d14b516e  /  swap  sw  0  0'
                mount_line = line
        if used_path is not None:
            break

    if used_path is None:  # Partition not yet present with any of its possible aliases
        lines.append(OSManager.get_fstab_entry(partition_aliases[0], mountpoint, filesystem))
    else:  # Partition present, update information
        lines.pop(used_index)
        lines.insert(used_index, OSManager.get_fstab_entry(used_path, mountpoint, filesystem))

    if mount_line is not None:  # Mountpoint already in use by another device (potentially the same device, but under another device_path)
        lines.remove(mount_line)

    with file_mutex('ovs-fstab-lock'):
        with open('/etc/fstab', 'w') as fstab_file:
            fstab_file.write('{0}\n'.format('\n'.join(lines)))
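# Usage sketch for add_fstab above (illustrative; the aliases and mountpoint
# are hypothetical). The first alias is used for brand-new entries, and
# OSManager.get_fstab_entry is assumed to render a standard 6-field fstab line
# such as 'UUID=40d99523-...  /mnt/hdd1  ext4  defaults  0  2'.
def _example_add_fstab_usage():
    add_fstab(partition_aliases=['/dev/disk/by-uuid/40d99523-a1e7-4374-84f2-85b5d14b516e',
                                 '/dev/sdb1'],
              mountpoint='/mnt/hdd1',
              filesystem='ext4')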
def new_function(*args, **kw):
    """
    Executes the decorated function in a locked context
    """
    filemutex = file_mutex('messaging')
    try:
        filemutex.acquire(wait=60)
        mutex = volatile_mutex('messaging')
        try:
            mutex.acquire(wait=60)
            return f(*args, **kw)
        finally:
            mutex.release()
    finally:
        filemutex.release()
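# `f` is a free variable in new_function above: in the original source it is
# presumably the function being decorated. A self-contained sketch of the
# likely enclosing decorator factory (an assumption about the surrounding
# structure, not code from this excerpt):
def locked():
    """
    Runs the decorated function while holding both a file lock (cross-process)
    and a volatile, in-memory lock, both named 'messaging'
    """
    def wrap(f):
        def new_function(*args, **kw):
            filemutex = file_mutex('messaging')
            try:
                filemutex.acquire(wait=60)
                mutex = volatile_mutex('messaging')
                try:
                    mutex.acquire(wait=60)
                    return f(*args, **kw)
                finally:
                    mutex.release()
            finally:
                filemutex.release()
        return new_function
    return wrap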
def update_framework():
    """
    Update the framework
    :return: None
    """
    filemutex = file_mutex('system_update', wait=2)
    upgrade_file = '/etc/ready_for_upgrade'
    upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
    ssh_clients = []
    try:
        filemutex.acquire()
        UpdateController._log_message('+++ Starting framework update +++')

        from ovs.dal.lists.storagerouterlist import StorageRouterList

        UpdateController._log_message('Generating SSH client connections for each storage router')
        storage_routers = StorageRouterList.get_storagerouters()
        master_ips = []
        extra_ips = []
        for sr in storage_routers:
            ssh_clients.append(SSHClient(sr.ip, username='******'))
            if sr.node_type == 'MASTER':
                master_ips.append(sr.ip)
            elif sr.node_type == 'EXTRA':
                extra_ips.append(sr.ip)
        this_client = [client for client in ssh_clients if client.is_local is True][0]

        # Create locks
        UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
        for client in ssh_clients:
            client.run('touch {0}'.format(upgrade_file))  # Prevents manually installing or upgrading individual packages
            client.run('touch {0}'.format(upgrade_ongoing_check_file))  # Prevents clicking x times on the 'Update' button

        # Check requirements
        packages_to_update = set()
        all_services_to_restart = []
        for client in ssh_clients:
            for function in Toolbox.fetch_hooks('update', 'metadata'):
                UpdateController._log_message('Executing function {0}'.format(function.__name__), client_ip=client.ip)
                output = function(client)
                for key, value in output.iteritems():
                    if key != 'framework':
                        continue
                    for package_info in value:
                        packages_to_update.update(package_info['packages'])
                        all_services_to_restart += package_info['services']

        services_to_restart = []
        for service in all_services_to_restart:
            if service not in services_to_restart:
                services_to_restart.append(service)  # Filter out duplicates, maintaining the order of services (eg: watcher-framework before memcached)

        UpdateController._log_message('Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
        UpdateController._log_message('Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

        # Stop services
        if UpdateController._change_services_state(services=services_to_restart, ssh_clients=ssh_clients, action='stop') is False:
            UpdateController._log_message('Stopping all services on every node failed, cannot continue', client_ip=this_client.ip, severity='warning')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)

            # Start services again if a service could not be stopped
            UpdateController._log_message('Attempting to start the services again', client_ip=this_client.ip)
            UpdateController._change_services_state(services=services_to_restart, ssh_clients=ssh_clients, action='start')
            UpdateController._log_message('Failed to stop all required services, aborting update', client_ip=this_client.ip, severity='error')
            return

        # Update packages
        failed_clients = []
        for client in ssh_clients:
            PackageManager.update(client=client)
            try:
                UpdateController._log_message('Installing latest packages', client.ip)
                for package in packages_to_update:
                    UpdateController._log_message('Installing {0}'.format(package), client.ip)
                    PackageManager.install(package_name=package, client=client, force=True)
                    UpdateController._log_message('Installed {0}'.format(package), client.ip)
                client.file_delete(upgrade_file)
            except subprocess.CalledProcessError as cpe:
                UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output), client.ip, 'error')
                failed_clients.append(client)
                break

        if failed_clients:
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('Error occurred. Attempting to start all services again', client_ip=this_client.ip, severity='error')
            UpdateController._change_services_state(services=services_to_restart, ssh_clients=ssh_clients, action='start')
            UpdateController._log_message('Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'.format('\n - '.join([client.ip for client in failed_clients]), this_client.ip), this_client.ip, 'error')
            return

        # Migrate code
        for client in ssh_clients:
            try:
                UpdateController._log_message('Started code migration', client.ip)
                try:
                    with remote(client.ip, [Migrator]) as rem:
                        rem.Migrator.migrate(master_ips, extra_ips)
                except EOFError as eof:
                    UpdateController._log_message('EOFError during code migration, retrying {0}'.format(eof), client.ip, 'warning')
                    with remote(client.ip, [Migrator]) as rem:
                        rem.Migrator.migrate(master_ips, extra_ips)
                UpdateController._log_message('Finished code migration', client.ip)
            except Exception as ex:
                UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
                UpdateController._log_message('Code migration failed with error: {0}'.format(ex), client.ip, 'error')
                return

        # Start services
        UpdateController._log_message('Starting services', client_ip=this_client.ip)
        model_services = []
        if 'arakoon-ovsdb' in services_to_restart:
            model_services.append('arakoon-ovsdb')
            services_to_restart.remove('arakoon-ovsdb')
        if 'memcached' in services_to_restart:
            model_services.append('memcached')
            services_to_restart.remove('memcached')
        UpdateController._change_services_state(services=model_services, ssh_clients=ssh_clients, action='start')

        # Migrate model
        UpdateController._log_message('Started model migration', client_ip=this_client.ip)
        try:
            from ovs.dal.helpers import Migration
            with remote(ssh_clients[0].ip, [Migration]) as rem:
                rem.Migration.migrate()
            UpdateController._log_message('Finished model migration', client_ip=this_client.ip)
        except Exception as ex:
            UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('An unexpected error occurred: {0}'.format(ex), client_ip=this_client.ip, severity='error')
            return

        # Post upgrade actions
        UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
        for client in ssh_clients:
            with remote(client.ip, [Toolbox, SSHClient]) as rem:
                for function in rem.Toolbox.fetch_hooks('update', 'postupgrade'):
                    UpdateController._log_message('Executing action {0}'.format(function.__name__), client_ip=client.ip)
                    try:
                        function(rem.SSHClient(client.ip, username='******'))
                        UpdateController._log_message('Executing action {0} completed'.format(function.__name__), client_ip=client.ip)
                    except Exception as ex:
                        UpdateController._log_message('Post upgrade action failed with error: {0}'.format(ex), client.ip, 'error')

        # Start watcher and restart support-agent
        UpdateController._change_services_state(services=services_to_restart, ssh_clients=ssh_clients, action='start')
        UpdateController._change_services_state(services=['support-agent'], ssh_clients=ssh_clients, action='restart')
        UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
        UpdateController._log_message('+++ Finished updating +++')
    except RuntimeError as rte:
        UpdateController._log_message('Error during framework update: {0}'.format(rte), severity='error')
        UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    except NoLockAvailableException:
        UpdateController._log_message('Another framework update is currently in progress!')
    except Exception as ex:
        UpdateController._log_message('Error during framework update: {0}'.format(ex), severity='error')
        UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    finally:
        filemutex.release()
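# _remove_lock_files is called throughout update_framework above but its body
# is not part of this excerpt. A plausible minimal implementation, assuming it
# simply deletes the given lock files on every node (a sketch, not the
# original code):
def _remove_lock_files(files, ssh_clients):
    for ssh_client in ssh_clients:
        for file_name in files:
            if ssh_client.file_exists(file_name):
                ssh_client.file_delete(file_name)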
def update_volumedriver():
    """
    Update the volumedriver
    :return: None
    """
    filemutex = file_mutex('system_update', wait=2)
    upgrade_file = '/etc/ready_for_upgrade'
    upgrade_ongoing_check_file = '/etc/upgrade_ongoing'
    ssh_clients = []
    try:
        filemutex.acquire()
        UpdateController._log_message('+++ Starting volumedriver update +++')

        from ovs.dal.lists.storagerouterlist import StorageRouterList

        UpdateController._log_message('Generating SSH client connections for each storage router')
        storage_routers = StorageRouterList.get_storagerouters()
        ssh_clients = [SSHClient(storage_router.ip, 'root') for storage_router in storage_routers]
        this_client = [client for client in ssh_clients if client.is_local is True][0]

        # Commence update
        # 0. Create locks
        UpdateController._log_message('Creating lock files', client_ip=this_client.ip)
        for client in ssh_clients:
            client.run('touch {0}'.format(upgrade_file))  # Prevents manually installing or upgrading individual packages
            client.run('touch {0}'.format(upgrade_ongoing_check_file))  # Prevents clicking x times on the 'Update' button

        # 1. Check requirements
        packages_to_update = set()
        all_services_to_restart = []
        for client in ssh_clients:
            for function in Toolbox.fetch_hooks('update', 'metadata'):
                UpdateController._log_message('Executing function {0}'.format(function.__name__), client_ip=client.ip)
                output = function(client)
                for key, value in output.iteritems():
                    if key != 'volumedriver':
                        continue
                    for package_info in value:
                        packages_to_update.update(package_info['packages'])
                        all_services_to_restart += package_info['services']

        services_to_restart = []
        for service in all_services_to_restart:
            if service not in services_to_restart:
                services_to_restart.append(service)  # Filter out duplicates, keeping the order of services (eg: watcher-framework before memcached)

        UpdateController._log_message('Services which will be restarted --> {0}'.format(', '.join(services_to_restart)))
        UpdateController._log_message('Packages which will be installed --> {0}'.format(', '.join(packages_to_update)))

        # 2. Stop services
        if UpdateController._change_services_state(services=services_to_restart, ssh_clients=ssh_clients, action='stop') is False:
            UpdateController._log_message('Stopping all services on every node failed, cannot continue', client_ip=this_client.ip, severity='warning')
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('Attempting to start the services again', client_ip=this_client.ip)
            UpdateController._change_services_state(services=services_to_restart, ssh_clients=ssh_clients, action='start')
            UpdateController._log_message('Failed to stop all required services, update aborted', client_ip=this_client.ip, severity='error')
            return

        # 3. Update packages
        failed_clients = []
        for client in ssh_clients:
            PackageManager.update(client=client)
            try:
                for package_name in packages_to_update:
                    UpdateController._log_message('Installing {0}'.format(package_name), client.ip)
                    PackageManager.install(package_name=package_name, client=client, force=True)
                    UpdateController._log_message('Installed {0}'.format(package_name), client.ip)
                client.file_delete(upgrade_file)
            except subprocess.CalledProcessError as cpe:
                UpdateController._log_message('Upgrade failed with error: {0}'.format(cpe.output), client.ip, 'error')
                failed_clients.append(client)
                break

        if failed_clients:
            UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
            UpdateController._log_message('Error occurred. Attempting to start all services again', client_ip=this_client.ip, severity='error')
            UpdateController._change_services_state(services=services_to_restart, ssh_clients=ssh_clients, action='start')
            UpdateController._log_message('Failed to upgrade following nodes:\n - {0}\nPlease check /var/log/ovs/lib.log on {1} for more information'.format('\n - '.join([client.ip for client in failed_clients]), this_client.ip), this_client.ip, 'error')
            return

        # 4. Post upgrade actions
        UpdateController._log_message('Executing post upgrade actions', client_ip=this_client.ip)
        for client in ssh_clients:
            for function in Toolbox.fetch_hooks('update', 'postupgrade'):
                UpdateController._log_message('Executing action: {0}'.format(function.__name__), client_ip=client.ip)
                try:
                    function(client)
                except Exception as ex:
                    UpdateController._log_message('Post upgrade action failed with error: {0}'.format(ex), client.ip, 'error')

        # 5. Start services
        UpdateController._log_message('Starting services', client_ip=this_client.ip)
        UpdateController._change_services_state(services=services_to_restart, ssh_clients=ssh_clients, action='start')

        UpdateController._remove_lock_files([upgrade_ongoing_check_file], ssh_clients)
        UpdateController._log_message('+++ Finished updating +++')
    except RuntimeError as rte:
        UpdateController._log_message('Error during volumedriver update: {0}'.format(rte), severity='error')
        UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    except NoLockAvailableException:
        UpdateController._log_message('Another volumedriver update is currently in progress!')
    except Exception as ex:
        UpdateController._log_message('Error during volumedriver update: {0}'.format(ex), severity='error')
        UpdateController._remove_lock_files([upgrade_file, upgrade_ongoing_check_file], ssh_clients)
    finally:
        filemutex.release()
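# _change_services_state is invoked by both update flows above; its body is
# not part of this excerpt. A plausible sketch, assuming it applies the
# requested action to every service on every node and reports overall success
# (restart_service is an assumed ServiceManager method, by analogy with the
# start_service/stop_service calls used elsewhere in this excerpt):
def _change_services_state(services, ssh_clients, action):
    """
    Stop/start/restart the given services on all nodes; returns False when any
    service fails to reach the requested state
    """
    success = True
    for client in ssh_clients:
        for service_name in services:
            try:
                if action == 'start':
                    ServiceManager.start_service(name=service_name, client=client)
                elif action == 'stop':
                    ServiceManager.stop_service(name=service_name, client=client)
                else:
                    ServiceManager.restart_service(name=service_name, client=client)
            except Exception:
                success = False
    return success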
def execute_update(components):
    """
    Update the specified components on all StorageRouters
    This is called upon by 'at'
    :return: None
    """
    filemutex = file_mutex('system_update', wait=2)
    ssh_clients = []
    services_stop_start = set()
    try:
        filemutex.acquire()
        UpdateController._logger.debug('+++ Starting update +++')

        from ovs.dal.lists.storagerouterlist import StorageRouterList

        # Create SSHClients to all nodes
        UpdateController._logger.debug('Generating SSH client connections for each storage router')
        storage_routers = StorageRouterList.get_storagerouters()
        master_ips = []
        extra_ips = []
        for sr in storage_routers:
            try:
                ssh_clients.append(SSHClient(sr.ip, username='******'))
                if sr.node_type == 'MASTER':
                    master_ips.append(sr.ip)
                elif sr.node_type == 'EXTRA':
                    extra_ips.append(sr.ip)
            except UnableToConnectException:
                raise Exception('Update is only allowed on systems where all nodes are online and fully functional')

        # Create locks
        for client in ssh_clients:
            UpdateController._logger.debug('{0}: Creating lock files'.format(client.ip))
            client.run(['touch', UpdateController._update_file])  # Prevents manually installing or updating individual packages
            client.run(['touch', UpdateController._update_ongoing_file])

        # Check requirements
        packages_to_update = {}
        services_post_update = set()
        update_information = UpdateController.get_update_information_all()
        for component, component_info in update_information.iteritems():
            if component in components:
                UpdateController._logger.debug('Verifying update information for component: {0}'.format(component.upper()))
                Toolbox.verify_required_params(actual_params=component_info,
                                               required_params={'downtime': (list, None),
                                                                'packages': (dict, None),
                                                                'prerequisites': (list, None),
                                                                'services_stop_start': (set, None),
                                                                'services_post_update': (set, None)})
                if len(component_info['prerequisites']) > 0:
                    raise Exception('Update is only allowed when all prerequisites have been met')
                packages_to_update.update(component_info['packages'])
                services_stop_start.update(component_info['services_stop_start'])
                services_post_update.update(component_info['services_post_update'])
        if len(packages_to_update) > 0:
            UpdateController._logger.debug('Packages to be updated: {0}'.format(', '.join(sorted(packages_to_update.keys()))))
        if len(services_stop_start) > 0:
            UpdateController._logger.debug('Services to stop before package update: {0}'.format(', '.join(sorted(services_stop_start))))
        if len(services_post_update) > 0:
            UpdateController._logger.debug('Services which will be restarted after update: {0}'.format(', '.join(sorted(services_post_update))))

        # Stop services
        if UpdateController.change_services_state(services=services_stop_start, ssh_clients=ssh_clients, action='stop') is False:
            raise Exception('Stopping all services on every node failed, cannot continue')

        # Install packages
        # First install packages on all StorageRouters individually
        if packages_to_update:
            failures = False
            for client in ssh_clients:
                UpdateController._logger.debug('{0}: Installing packages'.format(client.ip))
                for function in Toolbox.fetch_hooks('update', 'package_install_multi'):
                    try:
                        function(client=client, package_info=packages_to_update, components=components)
                    except Exception as ex:
                        UpdateController._logger.error('{0}: Package installation hook {1} failed with error: {2}'.format(client.ip, function.__name__, ex))
                        failures = True

            if set(components).difference({'framework', 'storagedriver'}):
                # Second, install packages on all ALBA nodes
                for function in Toolbox.fetch_hooks('update', 'package_install_single'):
                    try:
                        function(package_info=packages_to_update, components=components)
                    except Exception as ex:
                        UpdateController._logger.exception('Package installation hook {0} failed with error: {1}'.format(function.__name__, ex))
                        failures = True

            if failures is True:
                raise Exception('Installing the packages failed on 1 or more nodes')

        # Remove update file
        for client in ssh_clients:
            client.file_delete(UpdateController._update_file)

        # Migrate code
        if 'framework' in components:
            failures = []
            for client in ssh_clients:
                UpdateController._logger.debug('{0}: Verifying extensions code migration is required'.format(client.ip))
                try:
                    key = '/ovs/framework/hosts/{0}/versions'.format(System.get_my_machine_id(client=client))
                    old_versions = Configuration.get(key) if Configuration.exists(key) else {}
                    try:
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    except EOFError as eof:
                        UpdateController._logger.warning('{0}: EOFError during code migration, retrying {1}'.format(client.ip, eof))
                        with remote(client.ip, [Migrator]) as rem:
                            rem.Migrator.migrate(master_ips, extra_ips)
                    new_versions = Configuration.get(key) if Configuration.exists(key) else {}
                    if old_versions != new_versions:
                        UpdateController._logger.debug('{0}: Finished extensions code migration. Old versions: {1} --> New versions: {2}'.format(client.ip, old_versions, new_versions))
                except Exception as ex:
                    failures.append('{0}: {1}'.format(client.ip, str(ex)))
            if len(failures) > 0:
                raise Exception('Failed to run the extensions migrate code on all nodes. Errors found:\n\n{0}'.format('\n\n'.join(failures)))

        # Start memcached
        if 'memcached' in services_stop_start:
            services_stop_start.remove('memcached')
            UpdateController._logger.debug('Starting memcached')
            UpdateController.change_services_state(services=['memcached'], ssh_clients=ssh_clients, action='start')

        # Migrate model
        if 'framework' in components:
            UpdateController._logger.debug('Verifying DAL code migration is required')
            old_versions = PersistentFactory.get_client().get('ovs_model_version') if PersistentFactory.get_client().exists('ovs_model_version') else {}

            from ovs.dal.helpers import Migration
            with remote(ssh_clients[0].ip, [Migration]) as rem:
                rem.Migration.migrate()

            new_versions = PersistentFactory.get_client().get('ovs_model_version') if PersistentFactory.get_client().exists('ovs_model_version') else {}
            if old_versions != new_versions:
                UpdateController._logger.debug('Finished DAL code migration. Old versions: {0} --> New versions: {1}'.format(old_versions, new_versions))

        # Post update actions
        for client in ssh_clients:
            UpdateController._logger.debug('{0}: Executing post-update actions'.format(client.ip))
            for function in Toolbox.fetch_hooks('update', 'post_update_multi'):
                try:
                    function(client=client, components=components)
                except Exception as ex:
                    UpdateController._logger.exception('{0}: Post update hook {1} failed with error: {2}'.format(client.ip, function.__name__, ex))

        for function in Toolbox.fetch_hooks('update', 'post_update_single'):
            try:
                function(components=components)
            except Exception as ex:
                UpdateController._logger.exception('Post update hook {0} failed with error: {1}'.format(function.__name__, ex))

        # Start services
        UpdateController.change_services_state(services=services_stop_start, ssh_clients=ssh_clients, action='start')

        UpdateController._refresh_package_information()
        UpdateController._logger.debug('+++ Finished updating +++')
    except NoLockAvailableException:
        UpdateController._logger.debug('Another update is currently in progress!')
    except Exception as ex:
        UpdateController._logger.exception('Error during update: {0}'.format(ex))
        if len(ssh_clients) > 0:
            UpdateController.change_services_state(services=services_stop_start, ssh_clients=ssh_clients, action='start')
            UpdateController._refresh_package_information()
            UpdateController._logger.error('Failed to update. Please check all the logs for more information')
    finally:
        filemutex.release()
        for ssh_client in ssh_clients:
            for file_name in [UpdateController._update_file, UpdateController._update_ongoing_file]:
                try:
                    if ssh_client.file_exists(file_name):
                        ssh_client.file_delete(file_name)
                except Exception:
                    UpdateController._logger.warning('{0}: Failed to remove lock file {1}'.format(ssh_client.ip, file_name))
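# Illustrative invocation of execute_update above (a sketch; the docstring
# notes the call is normally scheduled through 'at' rather than made
# directly). The component names are the ones the function itself inspects:
def _example_execute_update_usage():
    UpdateController.execute_update(components=['framework', 'storagedriver'])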
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled
    :type error_messages: list
    :return: a list of error messages
    :rtype: list
    """

    def _verify_mds_config(current_vdisk):
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # The Celery task is executed by the 'ovs' user, which should be able to write in it
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Example scrub config:
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}
                # Example backend config:
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                vdisk_guid = queue.get(False)
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS is still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_directory,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)
    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
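# execute_scrub_work above is designed to run in parallel, one invocation per
# StorageRouter, all draining the same queue of vDisk guids. A minimal
# dispatcher sketch under that assumption (the threading setup and the
# `vpool.vdisks_guids` attribute are illustrative, not the original
# scheduling code):
from Queue import Queue  # Python 2, matching the iteritems()/0777 idioms above
from threading import Thread

def _example_scrub_dispatch(vpool, scrub_infos):
    error_messages = []
    vdisk_queue = Queue()
    for vdisk_guid in vpool.vdisks_guids:
        vdisk_queue.put(vdisk_guid)
    threads = []
    for scrub_info in scrub_infos:  # One worker per StorageRouter
        thread = Thread(target=execute_scrub_work,
                        args=(vdisk_queue, vpool, scrub_info, error_messages))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    return error_messages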