def get_services():
    """
    Retrieve every service known in the model.

    :return: all Service hybrids from the DAL
    """
    services = ServiceList.get_services()
    return services
def checkRequiredPorts(self):
    """
    Check that the required ports of the OVS services, NGINX/Memcached and
    Celery/RabbitMQ are listening on this node.

    Results are reported through self.utility.logger; nothing is returned.
    """
    self.utility.logger("Checking PORT CONNECTIONS of several services ...", self.module, 3, 'checkRequiredPorts', False)

    # check ports for OVS services hosted on this storagerouter
    self.utility.logger("Checking OVS services ...", self.module, 3, 'checkOvsServicesPorts', False)
    for sr in ServiceList.get_services():
        if sr.storagerouter_guid == self.machine_details.guid:
            for port in sr.ports:
                self._isPortListening(sr.name, port)

    # check NGINX and memcached
    self.utility.logger("Checking NGINX and Memcached ...", self.module, 3, 'checkNginxAndMemcached', False)
    for process, ports in self.req_side_ports.iteritems():
        for port in ports:
            self._isPortListening(process, port)

    # Check Celery and RabbitMQ (only relevant on a MASTER node)
    self.utility.logger("Checking RabbitMQ/Celery ...", self.module, 3, 'checkRabbitmqCelery', False)
    if self.utility.node_type == "MASTER":
        PCOMMAND = "celery inspect ping -b amqp://ovs:0penv5tor4ge@{0}//".format(self.machine_details.ip)
        # BUGFIX: PCOMMAND is already fully formatted above; the previous
        # extra '.format(process)' re-used a variable leaked from the loop
        # above and was a confusing no-op
        pcel = self.utility.executeBashCommand(PCOMMAND)
        # BUGFIX: 'len(pcel) != 1' also matched an empty result, which would
        # then crash on pcel[1]; require at least two output lines instead
        if len(pcel) > 1 and 'pong' in pcel[1].strip():
            self.utility.logger("Connection successfully established!", self.module, 1, 'port_celery')
        else:
            self.utility.logger("Connection FAILED to service Celery, please check 'RabbitMQ' and 'ovs-workers'?", self.module, 0, 'port_celery')
    else:
        self.utility.logger("RabbitMQ is not running/active on this server!", self.module, 5, 'port_celery')
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs for every Arakoon-related service.

    :return: None
    """
    logger.info('Starting arakoon collapse')
    # Map cluster name -> storagerouter for every arakoon-backed service
    arakoon_clusters = {}
    for service in ServiceList.get_services():
        if service.type.name in ('Arakoon', 'NamespaceManager', 'AlbaManager'):
            arakoon_clusters[service.name.replace('arakoon-', '')] = service.storagerouter
    for cluster, storagerouter in arakoon_clusters.iteritems():
        logger.info(' Collapsing cluster {0}'.format(cluster))
        contents = EtcdConfiguration.get(ArakoonClusterConfig.ETCD_CONFIG_KEY.format(cluster), raw=True)
        parser = RawConfigParser()
        parser.readfp(StringIO(contents))
        nodes = {}
        for node in parser.get('global', 'cluster').split(','):
            node = node.strip()
            # BUGFIX: RawConfigParser.get() always returns strings; coerce ip
            # to str and client_port to int, as the sibling implementation of
            # this routine does, instead of handing a string port to the client
            nodes[node] = ([str(parser.get(node, 'ip'))], int(parser.get(node, 'client_port')))
        config = ArakoonClientConfig(str(cluster), nodes)
        for node in nodes.keys():
            logger.info(' Collapsing node: {0}'.format(node))
            client = ArakoonAdminClient(node, config)
            try:
                client.collapse_tlogs(2)
            # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
            # still propagate; any other failure is logged and the next node
            # is attempted (best-effort collapse)
            except Exception:
                logger.exception('Error during collapsing cluster {0} node {1}'.format(cluster, node))
    logger.info('Arakoon collapse finished')
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs for every internal Arakoon-related service.

    :return: None
    """
    ScheduledTaskController._logger.info('Starting arakoon collapse')
    # Collect the cluster names of all internal arakoon-backed services
    arakoon_clusters = []
    for service in ServiceList.get_services():
        if service.is_internal is True and \
                service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON, ServiceType.SERVICE_TYPES.NS_MGR, ServiceType.SERVICE_TYPES.ALBA_MGR):
            arakoon_clusters.append(service.name.replace('arakoon-', ''))
    for cluster in arakoon_clusters:
        ScheduledTaskController._logger.info(' Collapsing cluster {0}'.format(cluster))
        # Cluster layout lives in etcd as an INI document; parse it in memory
        contents = EtcdConfiguration.get(ArakoonClusterConfig.ETCD_CONFIG_KEY.format(cluster), raw=True)
        parser = RawConfigParser()
        parser.readfp(StringIO(contents))
        nodes = {}
        for node in parser.get('global', 'cluster').split(','):
            node = node.strip()
            # ip list + client port per node, coerced from the parser's strings
            nodes[node] = ([str(parser.get(node, 'ip'))], int(parser.get(node, 'client_port')))
        config = ArakoonClientConfig(str(cluster), nodes)
        for node in nodes.keys():
            ScheduledTaskController._logger.info(' Collapsing node: {0}'.format(node))
            client = ArakoonAdmin(config)
            try:
                # Keep the 2 most recent tlogs, collapse the rest
                client.collapse(str(node), 2)
            # NOTE(review): bare 'except' also swallows KeyboardInterrupt /
            # SystemExit — consider 'except Exception'
            except:
                ScheduledTaskController._logger.exception('Error during collapsing cluster {0} node {1}'.format(cluster, node))
    ScheduledTaskController._logger.info('Arakoon collapse finished')
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs on every reachable storagerouter.

    :return: None
    """
    from ovs_extensions.generic.toolbox import ExtensionsToolbox

    GenericController._logger.info('Arakoon collapse started')
    cluster_info = []
    storagerouters = StorageRouterList.get_storagerouters()
    # The cluster cache arakoon ('cacc') is only included outside unittests
    if os.environ.get('RUNNING_UNITTESTS') != 'True':
        cluster_info = [('cacc', storagerouters[0])]

    cluster_names = []
    for service in ServiceList.get_services():
        if service.is_internal is True and service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                                                 ServiceType.SERVICE_TYPES.NS_MGR,
                                                                 ServiceType.SERVICE_TYPES.ALBA_MGR):
            cluster = ExtensionsToolbox.remove_prefix(service.name, 'arakoon-')
            # NOTE(review): duplicates are only skipped for non-cacc clusters
            # here — presumably intentional, but worth confirming
            if cluster in cluster_names and cluster not in [ARAKOON_NAME, ARAKOON_NAME_UNITTEST]:
                continue
            cluster_names.append(cluster)
            cluster_info.append((cluster, service.storagerouter))

    # Build a per-node workload: node ip -> its arakoon node id + clusters to collapse
    workload = {}
    cluster_config_map = {}
    for cluster, storagerouter in cluster_info:
        GenericController._logger.debug(' Collecting info for cluster {0}'.format(cluster))
        # cacc config must be read from the node itself (source_ip); others come from the config store
        ip = storagerouter.ip if cluster in [ARAKOON_NAME, ARAKOON_NAME_UNITTEST] else None
        try:
            config = ArakoonClusterConfig(cluster_id=cluster, source_ip=ip)
            cluster_config_map[cluster] = config
        except:
            GenericController._logger.exception(' Retrieving cluster information on {0} for {1} failed'.format(storagerouter.ip, cluster))
            continue
        for node in config.nodes:
            if node.ip not in workload:
                workload[node.ip] = {'node_id': node.name,
                                     'clusters': []}
            workload[node.ip]['clusters'].append((cluster, ip))

    # Execute the collapse per storagerouter over SSH; failures are logged per cluster
    for storagerouter in storagerouters:
        try:
            if storagerouter.ip not in workload:
                continue
            node_workload = workload[storagerouter.ip]
            client = SSHClient(storagerouter)
            for cluster, ip in node_workload['clusters']:
                try:
                    GenericController._logger.debug(' Collapsing cluster {0} on {1}'.format(cluster, storagerouter.ip))
                    client.run(['arakoon', '--collapse-local', node_workload['node_id'], '2', '-config', cluster_config_map[cluster].external_config_path])
                    GenericController._logger.debug(' Collapsing cluster {0} on {1} completed'.format(cluster, storagerouter.ip))
                except:
                    GenericController._logger.exception(' Collapsing cluster {0} on {1} failed'.format(cluster, storagerouter.ip))
        except UnableToConnectException:
            GenericController._logger.error(' Could not collapse any cluster on {0} (not reachable)'.format(storagerouter.name))
    GenericController._logger.info('Arakoon collapse finished')
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs on every reachable storagerouter.

    :return: None
    """
    ScheduledTaskController._logger.info('Starting arakoon collapse')
    storagerouters = StorageRouterList.get_storagerouters()
    # (cluster name, storagerouter holding its config, config-on-filesystem?)
    # 'cacc' (cluster cache arakoon) keeps its config on the local filesystem
    cluster_info = [('cacc', storagerouters[0], True)]
    cluster_names = []
    for service in ServiceList.get_services():
        if service.is_internal is True and service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                                                 ServiceType.SERVICE_TYPES.NS_MGR,
                                                                 ServiceType.SERVICE_TYPES.ALBA_MGR):
            cluster = service.name.replace('arakoon-', '')
            if cluster in cluster_names:
                continue
            cluster_names.append(cluster)
            cluster_info.append((cluster, service.storagerouter, False))

    # Build a per-node workload: node ip -> its arakoon node id + clusters to collapse
    workload = {}
    for cluster, storagerouter, filesystem in cluster_info:
        ScheduledTaskController._logger.debug(' Collecting info for cluster {0}'.format(cluster))
        config = ArakoonClusterConfig(cluster, filesystem=filesystem)
        config.load_config(storagerouter.ip)
        for node in config.nodes:
            if node.ip not in workload:
                workload[node.ip] = {'node_id': node.name,
                                     'clusters': []}
            workload[node.ip]['clusters'].append((cluster, filesystem))

    # Execute the collapse per storagerouter over SSH; failures are logged per cluster
    for storagerouter in storagerouters:
        try:
            if storagerouter.ip not in workload:
                continue
            node_workload = workload[storagerouter.ip]
            client = SSHClient(storagerouter)
            for cluster, filesystem in node_workload['clusters']:
                try:
                    ScheduledTaskController._logger.debug(' Collapsing cluster {0} on {1}'.format(cluster, storagerouter.ip))
                    if filesystem is True:
                        config_path = ArakoonClusterConfig.CONFIG_FILE.format(cluster)
                    else:
                        config_path = Configuration.get_configuration_path(ArakoonClusterConfig.CONFIG_KEY.format(cluster))
                    # Keep the 2 most recent tlogs, collapse the rest
                    client.run(['arakoon', '--collapse-local', node_workload['node_id'], '2', '-config', config_path])
                    ScheduledTaskController._logger.info(' Collapsing cluster {0} on {1} completed'.format(cluster, storagerouter.ip))
                except:
                    ScheduledTaskController._logger.exception(' Collapsing cluster {0} on {1} failed'.format(cluster, storagerouter.ip))
        except UnableToConnectException:
            ScheduledTaskController._logger.error(' Could not collapse any cluster on {0} (not reachable)'.format(storagerouter.name))
    ScheduledTaskController._logger.info('Arakoon collapse finished')
def ovs_4509_validate_arakoon_collapse_test():
    """
    Validate arakoon collapse

    For every internal arakoon cluster on every node: ensure there are tlogs
    to collapse (running a benchmark if needed), trigger the collapse and
    assert that tlogs were removed and head.db was rewritten.
    """
    node_ips = [sr.ip for sr in GeneralStorageRouter.get_storage_routers()]
    node_ips.sort()
    for node_ip in node_ips:
        root_client = SSHClient(node_ip, username='******')
        arakoon_clusters = []
        # Internal arakoon-backed services hosted on this particular node
        for service in ServiceList.get_services():
            if service.is_internal is True and service.storagerouter.ip == node_ip and \
                    service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                          ServiceType.SERVICE_TYPES.NS_MGR,
                                          ServiceType.SERVICE_TYPES.ALBA_MGR):
                arakoon_clusters.append(service.name.replace('arakoon-', ''))

        for arakoon_cluster in arakoon_clusters:
            arakoon_config_path = Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(arakoon_cluster))
            # Default tlog dir; overridden below if the config declares one
            tlog_location = '/opt/OpenvStorage/db/arakoon/{0}/tlogs'.format(arakoon_cluster)

            # read_tlog_dir
            with remote(node_ip, [Configuration]) as rem:
                config_contents = rem.Configuration.get('/ovs/arakoon/{0}/config'.format(arakoon_cluster), raw=True)
            for line in config_contents.splitlines():
                if 'tlog_dir' in line:
                    tlog_location = line.split()[-1]

            nr_of_tlogs = TestArakoon.get_nr_of_tlogs_in_folder(root_client, tlog_location)
            old_headdb_timestamp = 0
            if root_client.file_exists('/'.join([tlog_location, 'head.db'])):
                old_headdb_timestamp = root_client.run(['stat', '--format=%Y', tlog_location + '/head.db'])
            if nr_of_tlogs <= 2:
                # Not enough tlogs to make the collapse observable: generate
                # some load first so the collapse has something to do
                benchmark_command = ['arakoon', '--benchmark', '-n_clients', '1', '-max_n', '5_000', '-config', arakoon_config_path]
                root_client.run(benchmark_command)

            GenericController.collapse_arakoon()

            nr_of_tlogs = TestArakoon.get_nr_of_tlogs_in_folder(root_client, tlog_location)
            new_headdb_timestamp = root_client.run(['stat', '--format=%Y', tlog_location + '/head.db'])

            assert nr_of_tlogs <= 2,\
                'Arakoon collapse left {0} tlogs on the environment, expecting less than 2'.format(nr_of_tlogs)
            assert old_headdb_timestamp != new_headdb_timestamp,\
                'Timestamp of the head_db file was not changed in the process of collapsing tlogs'
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs for every internal Arakoon-related service.

    :return: None
    """
    ScheduledTaskController._logger.info('Starting arakoon collapse')
    # Collect the cluster names of all internal arakoon-backed services
    arakoon_clusters = []
    for service in ServiceList.get_services():
        if service.is_internal is True and \
                service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                      ServiceType.SERVICE_TYPES.NS_MGR,
                                      ServiceType.SERVICE_TYPES.ALBA_MGR):
            arakoon_clusters.append(service.name.replace('arakoon-', ''))
    for cluster in arakoon_clusters:
        ScheduledTaskController._logger.info(
            ' Collapsing cluster {0}'.format(cluster))
        # Cluster layout lives in etcd as an INI document; parse it in memory
        contents = EtcdConfiguration.get(
            ArakoonClusterConfig.ETCD_CONFIG_KEY.format(cluster), raw=True)
        parser = RawConfigParser()
        parser.readfp(StringIO(contents))
        nodes = {}
        for node in parser.get('global', 'cluster').split(','):
            node = node.strip()
            # ip list + client port per node, coerced from the parser's strings
            nodes[node] = ([str(parser.get(node, 'ip'))],
                           int(parser.get(node, 'client_port')))
        config = ArakoonClientConfig(str(cluster), nodes)
        for node in nodes.keys():
            ScheduledTaskController._logger.info(
                ' Collapsing node: {0}'.format(node))
            client = ArakoonAdmin(config)
            try:
                # Keep the 2 most recent tlogs, collapse the rest
                client.collapse(str(node), 2)
            # NOTE(review): bare 'except' also swallows KeyboardInterrupt /
            # SystemExit — consider 'except Exception'
            except:
                ScheduledTaskController._logger.exception(
                    'Error during collapsing cluster {0} node {1}'.format(
                        cluster, node))
    ScheduledTaskController._logger.info('Arakoon collapse finished')
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs for every Arakoon-related service.

    :return: None
    """
    logger.info('Starting arakoon collapse')
    # Map cluster name -> storagerouter for every arakoon-backed service
    arakoon_clusters = {}
    for service in ServiceList.get_services():
        if service.type.name in ('Arakoon', 'NamespaceManager', 'AlbaManager'):
            arakoon_clusters[service.name.replace(
                'arakoon-', '')] = service.storagerouter
    for cluster, storagerouter in arakoon_clusters.iteritems():
        logger.info(' Collapsing cluster {0}'.format(cluster))
        contents = EtcdConfiguration.get(
            ArakoonClusterConfig.ETCD_CONFIG_KEY.format(cluster), raw=True)
        parser = RawConfigParser()
        parser.readfp(StringIO(contents))
        nodes = {}
        for node in parser.get('global', 'cluster').split(','):
            node = node.strip()
            # BUGFIX: RawConfigParser.get() always returns strings; coerce ip
            # to str and client_port to int, as the sibling implementation of
            # this routine does, instead of handing a string port to the client
            nodes[node] = ([str(parser.get(node, 'ip'))],
                           int(parser.get(node, 'client_port')))
        config = ArakoonClientConfig(str(cluster), nodes)
        for node in nodes.keys():
            logger.info(' Collapsing node: {0}'.format(node))
            client = ArakoonAdminClient(node, config)
            try:
                client.collapse_tlogs(2)
            # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
            # still propagate; any other failure is logged and the next node
            # is attempted (best-effort collapse)
            except Exception:
                logger.exception(
                    'Error during collapsing cluster {0} node {1}'.format(
                        cluster, node))
    logger.info('Arakoon collapse finished')
def promote_node(cluster_ip, master_ip, ip_client_map, unique_id, configure_memcached, configure_rabbitmq):
    """
    Promotes a given node to MASTER: joins the Arakoon clusters, registers
    the master services (memcached, RabbitMQ, ...) and updates the framework
    configuration. The steps are strictly ordered; do not reorder them.

    :param cluster_ip: IP of the node being promoted
    :param master_ip: IP of an existing master node to join
    :param ip_client_map: Mapping of node IP -> SSHClient for all involved nodes
    :param unique_id: Machine id of the node being promoted
    :param configure_memcached: Whether memcached must be configured on the new master
    :param configure_rabbitmq: Whether RabbitMQ must be configured on the new master
    """
    # Imports kept function-local — presumably to avoid circular imports at
    # module load time (TODO confirm)
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.servicetypelist import ServiceTypeList
    from ovs.dal.lists.servicelist import ServiceList
    from ovs.dal.hybrids.service import Service

    Toolbox.log(logger=NodeTypeController._logger, messages='Promoting node', title=True)
    service_manager = ServiceFactory.get_manager()
    if configure_memcached is True:
        if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False:
            raise RuntimeError('Not all memcache nodes can be reached which is required for promoting a node.')

    target_client = ip_client_map[cluster_ip]
    machine_id = System.get_my_machine_id(target_client)
    node_name, _ = target_client.get_hostname()
    master_client = ip_client_map[master_ip]

    # Mark the node as MASTER in the model before doing the actual work
    storagerouter = StorageRouterList.get_by_machine_id(unique_id)
    storagerouter.node_type = 'MASTER'
    storagerouter.save()

    # Only join the internal 'config' arakoon when no external config store is used
    external_config = Configuration.get('/ovs/framework/external_config')
    if external_config is None:
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon configuration cluster')
        arakoon_installer = ArakoonInstaller(cluster_name='config')
        arakoon_installer.load(ip=master_ip)
        arakoon_installer.extend_cluster(new_ip=cluster_ip,
                                         base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
        arakoon_installer.restart_cluster_after_extending(new_ip=cluster_ip)
        service_manager.register_service(node_name=machine_id,
                                         service_metadata=arakoon_installer.service_metadata[cluster_ip])

    # Find other (arakoon) master nodes
    arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
    arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name)
    config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
    master_node_ips = [node.ip for node in config.nodes]
    if cluster_ip in master_node_ips:
        master_node_ips.remove(cluster_ip)
    if len(master_node_ips) == 0:
        raise RuntimeError('There should be at least one other master node')

    arakoon_ports = []
    if arakoon_metadata['internal'] is True:
        # Internally managed OVS DB arakoon: extend it onto the new master
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon OVS DB cluster')
        arakoon_installer = ArakoonInstaller(cluster_name=arakoon_cluster_name)
        arakoon_installer.load()
        arakoon_installer.extend_cluster(new_ip=cluster_ip,
                                         base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
        arakoon_installer.restart_cluster_after_extending(new_ip=cluster_ip)
        arakoon_ports = arakoon_installer.ports[cluster_ip]

    if configure_memcached is True:
        NodeTypeController.configure_memcached(client=target_client, logger=NodeTypeController._logger)
    NodeTypeController.add_services(client=target_client, node_type='master', logger=NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations')
    if configure_memcached is True:
        # Register this node as a memcache endpoint
        endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
        endpoint = '{0}:11211'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
        Configuration.set('/ovs/framework/memcache|endpoints', endpoints)
    if configure_rabbitmq is True:
        # Register this node as a message queue endpoint
        endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints')
        endpoint = '{0}:5672'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
        Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints)

    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services')
        # Reset the factory stores so new connections pick up the new layout
        PersistentFactory.store = None
        VolatileFactory.store = None

        # Model the arakoon-ovsdb service for this node if it is not modeled yet
        if 'arakoon-ovsdb' not in [s.name for s in ServiceList.get_services() if s.is_internal is False or s.storagerouter.ip == cluster_ip]:
            service = Service()
            service.name = 'arakoon-ovsdb'
            service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service.ports = arakoon_ports
            service.storagerouter = storagerouter
            service.save()

    if configure_rabbitmq is True:
        NodeTypeController.configure_rabbitmq(client=target_client, logger=NodeTypeController._logger)
        # Copy rabbitmq cookie: all cluster members must share the same Erlang cookie
        rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie'
        Toolbox.log(logger=NodeTypeController._logger, messages='Copying RabbitMQ cookie')
        contents = master_client.file_read(rabbitmq_cookie_file)
        master_hostname, _ = master_client.get_hostname()
        target_client.dir_create(os.path.dirname(rabbitmq_cookie_file))
        target_client.file_write(rabbitmq_cookie_file, contents)
        target_client.file_chmod(rabbitmq_cookie_file, mode=0400)
        # Join the RabbitMQ cluster of the master; the sleeps give rabbit time
        # to settle between state transitions
        target_client.run(['rabbitmq-server', '-detached'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop_app'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'join_cluster', 'rabbit@{0}'.format(master_hostname)])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop'])
        time.sleep(5)

        # Enable HA for the rabbitMQ queues
        ServiceFactory.change_service_state(target_client, 'rabbitmq-server', 'start', NodeTypeController._logger)
        NodeTypeController.check_rabbitmq_and_enable_ha_mode(client=target_client, logger=NodeTypeController._logger)
    NodeTypeController._configure_amqp_to_volumedriver()

    Toolbox.log(logger=NodeTypeController._logger, messages='Starting services')
    services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server']
    if arakoon_metadata['internal'] is True:
        # Already (re)started by the arakoon installer above
        services.remove('arakoon-ovsdb')
    for service in services:
        if service_manager.has_service(service, client=target_client):
            ServiceFactory.change_service_state(target_client, service, 'start', NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
    NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger)

    # Hooks may change configuration, requiring one more restart round
    if Toolbox.run_hooks(component='nodetype',
                         sub_component='promote',
                         logger=NodeTypeController._logger,
                         cluster_ip=cluster_ip,
                         master_ip=master_ip):
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger)

    if NodeTypeController.avahi_installed(client=target_client, logger=NodeTypeController._logger) is True:
        NodeTypeController.configure_avahi(client=target_client, node_name=node_name, node_type='master', logger=NodeTypeController._logger)
    Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id), 'MASTER')
    target_client.run(['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config'])
    Configuration.set('/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id), True)

    if target_client.file_exists('/tmp/ovs_rollback'):
        target_client.file_delete('/tmp/ovs_rollback')

    Toolbox.log(logger=NodeTypeController._logger, messages='Promote complete')
def promote_node(cluster_ip, master_ip, ip_client_map, unique_id, configure_memcached, configure_rabbitmq):
    """
    Promotes a given node to MASTER: joins the config store (Arakoon or Etcd)
    and the OVS DB Arakoon, registers the master services and updates the
    framework configuration. The steps are strictly ordered; do not reorder.

    :param cluster_ip: IP of the node being promoted
    :param master_ip: IP of an existing master node to join
    :param ip_client_map: Mapping of node IP -> SSHClient for all involved nodes
    :param unique_id: Machine id of the node being promoted
    :param configure_memcached: Whether memcached must be configured on the new master
    :param configure_rabbitmq: Whether RabbitMQ must be configured on the new master
    """
    # Imports kept function-local — presumably to avoid circular imports at
    # module load time (TODO confirm)
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.servicetypelist import ServiceTypeList
    from ovs.dal.lists.servicelist import ServiceList
    from ovs.dal.hybrids.service import Service

    Toolbox.log(logger=NodeTypeController._logger, messages='Promoting node', title=True)
    if configure_memcached is True:
        if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False:
            raise RuntimeError('Not all memcache nodes can be reached which is required for promoting a node.')

    target_client = ip_client_map[cluster_ip]
    machine_id = System.get_my_machine_id(target_client)
    node_name, _ = target_client.get_hostname()
    master_client = ip_client_map[master_ip]

    # Mark the node as MASTER in the model before doing the actual work
    storagerouter = StorageRouterList.get_by_machine_id(unique_id)
    storagerouter.node_type = 'MASTER'
    storagerouter.save()

    # Only join the internal config store when no external config store is used
    external_config = Configuration.get('/ovs/framework/external_config')
    if external_config is None:
        config_store = Configuration.get_store()
        if config_store == 'arakoon':
            Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon configuration cluster')
            metadata = ArakoonInstaller.extend_cluster(master_ip=master_ip,
                                                       new_ip=cluster_ip,
                                                       cluster_name='config',
                                                       base_dir=Configuration.get('/ovs/framework/paths|ovsdb'),
                                                       ports=[26400, 26401],
                                                       filesystem=True)
            ArakoonInstaller.restart_cluster_add(cluster_name='config',
                                                 current_ips=metadata['ips'],
                                                 new_ip=cluster_ip,
                                                 filesystem=True)
            ServiceManager.register_service(node_name=machine_id,
                                            service_metadata=metadata['service_metadata'])
        else:
            from ovs.extensions.db.etcd.installer import EtcdInstaller
            Toolbox.log(logger=NodeTypeController._logger, messages='Joining Etcd cluster')
            EtcdInstaller.extend_cluster(master_ip, cluster_ip, 'config')

    # Find other (arakoon) master nodes
    arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
    arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name)
    config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name, filesystem=False)
    config.load_config()
    master_node_ips = [node.ip for node in config.nodes]
    if cluster_ip in master_node_ips:
        master_node_ips.remove(cluster_ip)
    if len(master_node_ips) == 0:
        raise RuntimeError('There should be at least one other master node')

    arakoon_ports = []
    if arakoon_metadata['internal'] is True:
        # Internally managed OVS DB arakoon: extend it onto the new master
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon OVS DB cluster')
        result = ArakoonInstaller.extend_cluster(master_ip=master_ip,
                                                 new_ip=cluster_ip,
                                                 cluster_name=arakoon_cluster_name,
                                                 base_dir=Configuration.get('/ovs/framework/paths|ovsdb'))
        ArakoonInstaller.restart_cluster_add(cluster_name=arakoon_cluster_name,
                                             current_ips=result['ips'],
                                             new_ip=cluster_ip,
                                             filesystem=False)
        arakoon_ports = [result['client_port'], result['messaging_port']]

    if configure_memcached is True:
        NodeTypeController.configure_memcached(client=target_client, logger=NodeTypeController._logger)
    NodeTypeController.add_services(client=target_client, node_type='master', logger=NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations')
    if configure_memcached is True:
        # Register this node as a memcache endpoint
        endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
        endpoint = '{0}:11211'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
        Configuration.set('/ovs/framework/memcache|endpoints', endpoints)
    if configure_rabbitmq is True:
        # Register this node as a message queue endpoint
        endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints')
        endpoint = '{0}:5672'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
        Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints)

    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services')
        ArakoonInstaller.restart_cluster_add(cluster_name=arakoon_cluster_name,
                                             current_ips=master_node_ips,
                                             new_ip=cluster_ip,
                                             filesystem=False)
        # Reset the factory stores so new connections pick up the new layout
        PersistentFactory.store = None
        VolatileFactory.store = None

        # Model the arakoon-ovsdb service for this node if it is not modeled yet
        if 'arakoon-ovsdb' not in [s.name for s in ServiceList.get_services() if s.is_internal is False or s.storagerouter.ip == cluster_ip]:
            service = Service()
            service.name = 'arakoon-ovsdb'
            service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service.ports = arakoon_ports
            service.storagerouter = storagerouter
            service.save()

    if configure_rabbitmq is True:
        NodeTypeController.configure_rabbitmq(client=target_client, logger=NodeTypeController._logger)
        # Copy rabbitmq cookie: all cluster members must share the same Erlang cookie
        rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie'
        Toolbox.log(logger=NodeTypeController._logger, messages='Copying Rabbit MQ cookie')
        contents = master_client.file_read(rabbitmq_cookie_file)
        master_hostname, _ = master_client.get_hostname()
        target_client.dir_create(os.path.dirname(rabbitmq_cookie_file))
        target_client.file_write(rabbitmq_cookie_file, contents)
        # BUGFIX: previously passed decimal 400 (= permissions 0o620); the
        # cookie must be owner-read-only (0400), as the sibling implementation
        # of promote_node does. 0o400 is the py2.6+/py3-compatible spelling.
        target_client.file_chmod(rabbitmq_cookie_file, mode=0o400)
        # Join the RabbitMQ cluster of the master; the sleeps give rabbit time
        # to settle between state transitions
        target_client.run(['rabbitmq-server', '-detached'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop_app'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'join_cluster', 'rabbit@{0}'.format(master_hostname)])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop'])
        time.sleep(5)

        # Enable HA for the rabbitMQ queues
        Toolbox.change_service_state(target_client, 'rabbitmq-server', 'start', NodeTypeController._logger)
        NodeTypeController.check_rabbitmq_and_enable_ha_mode(client=target_client, logger=NodeTypeController._logger)
    NodeTypeController._configure_amqp_to_volumedriver()

    Toolbox.log(logger=NodeTypeController._logger, messages='Starting services')
    services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server', 'etcd-config']
    if arakoon_metadata['internal'] is True:
        # Already (re)started by restart_cluster_add above
        services.remove('arakoon-ovsdb')
    for service in services:
        if ServiceManager.has_service(service, client=target_client):
            Toolbox.change_service_state(target_client, service, 'start', NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
    NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger)

    # Hooks may change configuration, requiring one more restart round
    if Toolbox.run_hooks(component='nodetype',
                         sub_component='promote',
                         logger=NodeTypeController._logger,
                         cluster_ip=cluster_ip,
                         master_ip=master_ip):
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger)

    if NodeTypeController.avahi_installed(client=target_client, logger=NodeTypeController._logger) is True:
        NodeTypeController.configure_avahi(client=target_client, node_name=node_name, node_type='master', logger=NodeTypeController._logger)
    Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id), 'MASTER')
    target_client.run(['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config'])
    Configuration.set('/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id), True)

    if target_client.file_exists('/tmp/ovs_rollback'):
        target_client.file_delete('/tmp/ovs_rollback')

    Toolbox.log(logger=NodeTypeController._logger, messages='Promote complete')
def test_collapse():
    """
    Test the arakoon collapsing

    For every internal arakoon cluster on every node: ensure there are tlogs
    to collapse (running a benchmark if needed), trigger the collapse and
    assert that tlogs were removed and head.db was rewritten.
    :return:
    """
    ArakoonCollapse.LOGGER.info("Starting validating arakoon collapse")
    node_ips = StoragerouterHelper.get_storagerouter_ips()
    node_ips.sort()
    for node_ip in node_ips:
        ArakoonCollapse.LOGGER.info("Fetching arakoons on node `{0}`".format(node_ip))
        arakoon_clusters = []
        root_client = SSHClient(node_ip, username='******')

        # fetch arakoon clusters: internal arakoon-backed services on this node
        for service in ServiceList.get_services():
            if service.is_internal is True and service.storagerouter.ip == node_ip and \
                    service.type.name in (ServiceType.SERVICE_TYPES.ARAKOON,
                                          ServiceType.SERVICE_TYPES.NS_MGR,
                                          ServiceType.SERVICE_TYPES.ALBA_MGR):
                arakoon_clusters.append(service.name.replace('arakoon-', ''))

        # perform collapse
        ArakoonCollapse.LOGGER.info("Starting arakoon collapse on node `{0}`".format(node_ip))
        for arakoon_cluster in arakoon_clusters:
            ArakoonCollapse.LOGGER.info("Fetching `{0}` arakoon on node `{1}`".format(arakoon_cluster, node_ip))
            arakoon_config_path = Configuration.get_configuration_path('/ovs/arakoon/{0}/config'.format(arakoon_cluster))
            # Default tlog dir; overridden below if the config declares one
            tlog_location = '/opt/OpenvStorage/db/arakoon/{0}/tlogs'.format(arakoon_cluster)

            # read_tlog_dir
            with remote(node_ip, [Configuration]) as rem:
                config_contents = rem.Configuration.get('/ovs/arakoon/{0}/config'.format(arakoon_cluster), raw=True)
            for line in config_contents.splitlines():
                if 'tlog_dir' in line:
                    tlog_location = line.split()[-1]

            nr_of_tlogs = ArakoonCollapse.get_nr_of_tlogs_in_folder(root_client, tlog_location)
            old_headdb_timestamp = 0
            if root_client.file_exists('/'.join([tlog_location, 'head.db'])):
                old_headdb_timestamp = root_client.run(['stat', '--format=%Y', '{0}/{1}'.format(tlog_location, 'head.db')])
            if nr_of_tlogs <= 2:
                # Not enough tlogs to make the collapse observable: generate
                # some load first so the collapse has something to do
                benchmark_command = ['arakoon', '--benchmark', '-n_clients', '1', '-max_n', '5_000', '-config', arakoon_config_path]
                root_client.run(benchmark_command)

            ArakoonCollapse.LOGGER.info("Collapsing arakoon `{0}` on node `{1}` ...".format(arakoon_cluster, node_ip))
            GenericController.collapse_arakoon()
            nr_of_tlogs = ArakoonCollapse.get_nr_of_tlogs_in_folder(root_client, tlog_location)
            new_headdb_timestamp = root_client.run(['stat', '--format=%Y', '{0}/{1}'.format(tlog_location, 'head.db')])

            # perform assertion
            # NOTE(review): this format string uses `{1}` twice but passes
            # three arguments, so node_ip is never shown — likely `{2}` was
            # intended for the node placeholder
            assert nr_of_tlogs <= 2,\
                'Arakoon collapse left {0} tlogs on the environment, expecting less than 2 in `{1}` on node `{1}`'\
                .format(nr_of_tlogs, arakoon_cluster, node_ip)
            assert old_headdb_timestamp != new_headdb_timestamp,\
                'Timestamp of the head_db file was not changed ' \
                'in the process of collapsing tlogs of arakoon `{0}` on node `{1}`'\
                .format(arakoon_cluster, node_ip)
            ArakoonCollapse.LOGGER.info("Successfully collapsed arakoon `{0}` on node `{1}`".format(arakoon_cluster, node_ip))
    ArakoonCollapse.LOGGER.info("Finished validating arakoon collapsing")
def get_disk_safety():
    """
    Send disk safety for each vpool and the amount of namespaces with the
    lowest disk safety

    :return: the list of emitted points, or None when nothing could be sent
    """
    points = []
    # Names of all ALBA manager services (deduplicated below)
    abms = []
    for service in ServiceList.get_services():
        if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
            abms.append(service.name)
    abms = list(set(abms))
    abl = AlbaBackendList.get_albabackends()
    for ab in abl:
        # ABM service backing this backend; skip backends we have no ABM for
        service_name = Service(ab.abm_services[0].service_guid).name
        if service_name not in abms:
            continue
        config = "etcd://127.0.0.1:2379/ovs/arakoon/{}/config".format(service_name)
        try:
            disk_safety = AlbaCLI.run('get-disk-safety', config=config, to_json=True)
        except Exception as ex:
            StatsmonkeyScheduledTaskController._logger.error('{0}: {1}'.format(service_name, ex.message))
            continue
        presets = ab.presets
        used_preset = None
        for preset in presets:
            try:
                # Pick the active & in-use policy of this preset
                policies = preset['policy_metadata']
                for policy in policies:
                    if policies[policy]['is_active'] and policies[policy]['in_use']:
                        used_preset = policy
                if used_preset is not None:
                    # Policy is rendered as a tuple string, e.g. '(5, 4, ...)';
                    # rewrite to JSON list syntax so it can be parsed
                    used_preset = json.loads(used_preset.replace('(', '[').replace(')', ']'))
                    max_disk_safety = used_preset[1]
                    safety = {
                        'measurement': 'disk_safety',
                        'tags': {
                            'backend_name': ab.name,
                            'max_disk_safety': max_disk_safety,
                            'min_disk_safety': max_disk_safety
                        },
                        'fields': {
                            'amount_max_disk_safety': 0,
                            'amount_between_disk_safety': 0,
                            'amount_min_disk_safety': 0
                        }
                    }
                    # Histogram of namespace counts per observed safety value
                    stats = {}
                    for disk in disk_safety:
                        if disk['safety'] is not None:
                            if disk['safety'] not in stats:
                                stats[disk['safety']] = 0
                            stats[disk['safety']] += 1
                    # NOTE(review): min() raises ValueError when stats is
                    # empty; that lands in the broad except below and is
                    # logged rather than handled explicitly
                    min_disk_safety = min(stats.keys())
                    safety['tags']['min_disk_safety'] = min_disk_safety
                    for stat in stats:
                        if stat == max_disk_safety:
                            safety['fields']['amount_max_disk_safety'] = stats[stat]
                        elif stat == min_disk_safety:
                            safety['fields']['amount_min_disk_safety'] = stats[stat]
                        else:
                            safety['fields']['amount_between_disk_safety'] += stats[stat]
                    points.append(safety)
            except Exception as ex:
                StatsmonkeyScheduledTaskController._logger.error(ex.message)
    if len(points) == 0:
        StatsmonkeyScheduledTaskController._logger.info("No statistics found")
        return
    StatsmonkeyScheduledTaskController._send_stats(points)
    return points
def get_backend_stats():
    """
    Send backend stats for each backend to InfluxDB.

    Emits one point per ALBA backend with its get/put counters and a
    breakdown of ASD statuses (claimed, decommissioning, ...).

    :return: the list of emitted points, or None when nothing could be sent
    """
    points = []
    abms = []
    # Renamed from 'abs', which shadowed the builtin abs()
    alba_node_ids = []
    for service in ServiceList.get_services():
        if service.type.name == ServiceType.SERVICE_TYPES.ALBA_MGR:
            abms.append(service.name)
    for alba_node in AlbaNodeList.get_albanodes():
        alba_node_ids.append(alba_node.node_id)
    abms = list(set(abms))
    # BUGFIX: guard against an environment without ALBA manager services;
    # abms[0] below would otherwise raise an IndexError
    if len(abms) == 0:
        StatsmonkeyScheduledTaskController._logger.info("No statistics found")
        return None
    config = "etcd://127.0.0.1:2379/ovs/arakoon/{}/config".format(abms[0])
    try:
        decommissioning_osds = AlbaCLI.run('list-decommissioning-osds', config=config, to_json=True)
    except Exception as ex:
        StatsmonkeyScheduledTaskController._logger.error('{0}'.format(ex.message))
        return None
    # Only keep decommissioning OSDs that live on a known ALBA node
    filtered_osds = []
    for node_id in alba_node_ids:
        filtered_osds += [osd for osd in decommissioning_osds if osd['node_id'] == node_id]
    abl = AlbaBackendList.get_albabackends()
    for ab in abl:
        try:
            stat = {
                'measurement': 'backend_stats',
                'tags': {
                    'backend_name': ab.name
                },
                'fields': {
                    'gets': ab.statistics['multi_get']['n'],
                    'puts': ab.statistics['apply']['n']
                }
            }
            stat_asd = {
                'decommissioning': len(filtered_osds),
                'decommissioned': 0,
                'claimed': 0,
                'warning': 0,
                'failure': 0,
                'error': 0
            }
            # Count ASDs per status for the ASDs belonging to this backend
            for disks in ab.local_stack.values():
                for disk in disks.values():
                    for asd in disk['asds'].values():
                        if asd['alba_backend_guid'] == ab.guid:
                            status = asd['status']
                            status_detail = asd['status_detail']
                            if status_detail == 'decommissioned':
                                status = status_detail
                            if status not in stat_asd:
                                stat_asd[status] = 0
                            stat_asd[status] += 1
            for status in stat_asd:
                stat['fields'][status] = stat_asd[status]
            points.append(stat)
        except Exception as ex:
            StatsmonkeyScheduledTaskController._logger.error(ex.message)
    if len(points) == 0:
        StatsmonkeyScheduledTaskController._logger.info("No statistics found")
        return None
    StatsmonkeyScheduledTaskController._send_stats(points)
    return points
def collapse_arakoon():
    """
    Collapse Arakoon's Tlogs on every reachable storagerouter.

    :return: None
    """
    ScheduledTaskController._logger.info('Starting arakoon collapse')
    storagerouters = StorageRouterList.get_storagerouters()
    # (cluster name, storagerouter holding its config, config-on-filesystem?)
    # 'cacc' (cluster cache arakoon) keeps its config on the local filesystem
    cluster_info = [('cacc', storagerouters[0], True)]
    cluster_names = []
    for service in ServiceList.get_services():
        if service.is_internal is True and service.type.name in (
                ServiceType.SERVICE_TYPES.ARAKOON,
                ServiceType.SERVICE_TYPES.NS_MGR,
                ServiceType.SERVICE_TYPES.ALBA_MGR):
            cluster = service.name.replace('arakoon-', '')
            if cluster in cluster_names:
                continue
            cluster_names.append(cluster)
            cluster_info.append((cluster, service.storagerouter, False))
    # Build a per-node workload: node ip -> its arakoon node id + clusters to collapse
    workload = {}
    for cluster, storagerouter, filesystem in cluster_info:
        ScheduledTaskController._logger.debug(
            ' Collecting info for cluster {0}'.format(cluster))
        config = ArakoonClusterConfig(cluster, filesystem=filesystem)
        config.load_config(storagerouter.ip)
        for node in config.nodes:
            if node.ip not in workload:
                workload[node.ip] = {'node_id': node.name, 'clusters': []}
            workload[node.ip]['clusters'].append((cluster, filesystem))
    # Execute the collapse per storagerouter over SSH; failures are logged per cluster
    for storagerouter in storagerouters:
        try:
            if storagerouter.ip not in workload:
                continue
            node_workload = workload[storagerouter.ip]
            client = SSHClient(storagerouter)
            for cluster, filesystem in node_workload['clusters']:
                try:
                    ScheduledTaskController._logger.debug(
                        ' Collapsing cluster {0} on {1}'.format(
                            cluster, storagerouter.ip))
                    if filesystem is True:
                        config_path = ArakoonClusterConfig.CONFIG_FILE.format(
                            cluster)
                    else:
                        config_path = Configuration.get_configuration_path(
                            ArakoonClusterConfig.CONFIG_KEY.format(
                                cluster))
                    # Keep the 2 most recent tlogs, collapse the rest
                    client.run([
                        'arakoon', '--collapse-local',
                        node_workload['node_id'], '2', '-config', config_path
                    ])
                    ScheduledTaskController._logger.info(
                        ' Collapsing cluster {0} on {1} completed'.format(
                            cluster, storagerouter.ip))
                # NOTE(review): bare 'except' also swallows
                # KeyboardInterrupt/SystemExit — consider 'except Exception'
                except:
                    ScheduledTaskController._logger.exception(
                        ' Collapsing cluster {0} on {1} failed'.format(
                            cluster, storagerouter.ip))
        except UnableToConnectException:
            ScheduledTaskController._logger.error(
                ' Could not collapse any cluster on {0} (not reachable)'.
                format(storagerouter.name))
    ScheduledTaskController._logger.info('Arakoon collapse finished')