def configure_avahi(client, node_name, node_type, logger): """ Configure Avahi :param client: Client on which to configure avahi :type client: ovs_extensions.generic.sshclient.SSHClient :param node_name: Name of the node to set in Avahi :type node_name: str :param node_type: Type of the node ('master' or 'extra') :type node_type: str :param logger: Logger object used for logging :type logger: ovs.extensions.generic.logger.Logger :return: None """ valid_avahi = NodeTypeController.validate_avahi_cluster_name( ip=client.ip, cluster_name=Configuration.get('/ovs/framework/cluster_name'), node_name=node_name) if valid_avahi[0] is False: raise RuntimeError(valid_avahi[1]) Toolbox.log(logger=logger, messages='Announcing service') client.file_write( NodeTypeController.avahi_filename, """<?xml version="1.0" standalone='no'?> <!--*-nxml-*--> <!DOCTYPE service-group SYSTEM "avahi-service.dtd"> <!-- $Id$ --> <service-group> <name replace-wildcards="yes">{0}</name> <service> <type>_ovs_{1}_node._tcp</type> <port>443</port> </service> </service-group>""".format(valid_avahi[1], node_type)) ServiceFactory.change_service_state(client, 'avahi-daemon', 'restart', NodeTypeController._logger)
def check_rabbitmq_and_enable_ha_mode(client, logger): """ Verify RabbitMQ is running properly and enable HA mode :param client: Client on which to check RabbitMQ :type client: ovs_extensions.generic.sshclient.SSHClient :param logger: Logger object used for logging :type logger: ovs.extensions.generic.logger.Logger :return: None """ service_manager = ServiceFactory.get_manager() if not service_manager.has_service('rabbitmq-server', client): raise RuntimeError( 'Service rabbitmq-server has not been added on node {0}'. format(client.ip)) rabbitmq_running, same_process = service_manager.is_rabbitmq_running( client=client) if rabbitmq_running is False or same_process is False: ServiceFactory.change_service_state(client, 'rabbitmq-server', 'restart', logger) time.sleep(5) client.run([ 'rabbitmqctl', 'set_policy', 'ha-all', '^(volumerouter|ovs_.*)$', '{"ha-mode":"all"}' ])
def restart_framework_and_memcache_services(clients, logger, offline_node_ips=None): """ Restart framework and Memcached services :param clients: Clients on which to restart these services :type clients: dict :param logger: Logger object used for logging :type logger: ovs.extensions.generic.logger.Logger :param offline_node_ips: IP addresses of offline nodes in the cluster :type offline_node_ips: list :return: None """ from ovs.dal.lists.storagerouterlist import StorageRouterList service_manager = ServiceFactory.get_manager() master_ips = [sr.ip for sr in StorageRouterList.get_masters()] slave_ips = [sr.ip for sr in StorageRouterList.get_slaves()] if offline_node_ips is None: offline_node_ips = [] memcached = 'memcached' watcher = 'watcher-framework' support_agent = 'support-agent' for ip in master_ips + slave_ips: if ip not in offline_node_ips: if service_manager.has_service(watcher, clients[ip]): ServiceFactory.change_service_state( clients[ip], watcher, 'stop', logger) for ip in master_ips: if ip not in offline_node_ips: ServiceFactory.change_service_state(clients[ip], memcached, 'restart', logger) for ip in master_ips + slave_ips: if ip not in offline_node_ips: if service_manager.has_service(watcher, clients[ip]): ServiceFactory.change_service_state( clients[ip], watcher, 'start', logger) if service_manager.has_service(support_agent, clients[ip]): ServiceFactory.change_service_state( clients[ip], support_agent, 'restart', logger) VolatileFactory.store = None
def configure_rabbitmq(client, logger): """ Configure RabbitMQ :param client: Client on which to configure RabbitMQ :type client: ovs_extensions.generic.sshclient.SSHClient :param logger: Logger object used for logging :type logger: ovs.extensions.generic.logger.Logger :return: None """ Toolbox.log(logger=logger, messages='Setting up RabbitMQ') service_manager = ServiceFactory.get_manager() rabbitmq_port = Configuration.get( '/ovs/framework/messagequeue|endpoints')[0].split(':')[1] rabbitmq_login = Configuration.get('/ovs/framework/messagequeue|user') rabbitmq_password = Configuration.get( '/ovs/framework/messagequeue|password') client.file_write( '/etc/rabbitmq/rabbitmq.config', """[ {{rabbit, [{{tcp_listeners, [{0}]}}, {{default_user, <<"{1}">>}}, {{default_pass, <<"{2}">>}}, {{cluster_partition_handling, autoheal}}, {{log_levels, [{{connection, warning}}]}}, {{vm_memory_high_watermark, 0.2}}]}} ].""".format(rabbitmq_port, rabbitmq_login, rabbitmq_password)) rabbitmq_running, same_process = service_manager.is_rabbitmq_running( client=client) if rabbitmq_running is True: # Example output of 'list_users' command # Listing users ... # guest [administrator] # ovs [] # ... done. users = [ user.split('\t')[0] for user in client.run( ['rabbitmqctl', 'list_users']).splitlines() if '\t' in user and '[' in user and ']' in user ] if 'ovs' in users: Toolbox.log(logger=logger, messages='Already configured RabbitMQ') return ServiceFactory.change_service_state(client, 'rabbitmq-server', 'stop', logger) client.run(['rabbitmq-server', '-detached']) time.sleep(5) # Sometimes/At random the rabbitmq server takes longer than 5 seconds to start, # and the next command fails so the best solution is to retry several times # Also retry the add_user/set_permissions, and validate the result retry = 0 while retry < 10: users = Toolbox.retry_client_run( client=client, command=['rabbitmqctl', 'list_users'], logger=logger).splitlines() users = [ usr.split('\t')[0] for usr in users if '\t' in usr and '[' in usr and ']' in usr ] logger.debug('Rabbitmq users {0}'.format(users)) if 'ovs' in users: logger.debug('User ovs configured in rabbitmq') break logger.debug( Toolbox.retry_client_run(client=client, command=[ 'rabbitmqctl', 'add_user', rabbitmq_login, rabbitmq_password ], logger=logger)) logger.debug( Toolbox.retry_client_run(client=client, command=[ 'rabbitmqctl', 'set_permissions', rabbitmq_login, '.*', '.*', '.*' ], logger=logger)) retry += 1 time.sleep(1) client.run(['rabbitmqctl', 'stop']) time.sleep(5)
def demote_node(cluster_ip, master_ip, ip_client_map, unique_id, unconfigure_memcached, unconfigure_rabbitmq, offline_nodes=None): """ Demotes a given node """ from ovs.dal.lists.storagerouterlist import StorageRouterList Toolbox.log(logger=NodeTypeController._logger, messages='Demoting node', title=True) service_manager = ServiceFactory.get_manager() if offline_nodes is None: offline_nodes = [] if unconfigure_memcached is True and len(offline_nodes) == 0: if NodeTypeController._validate_local_memcache_servers( ip_client_map) is False: raise RuntimeError( 'Not all memcache nodes can be reached which is required for demoting a node.' ) # Find other (arakoon) master nodes arakoon_cluster_name = str( Configuration.get('/ovs/framework/arakoon_clusters|ovsdb')) arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name( cluster_name=arakoon_cluster_name) config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name) master_node_ips = [node.ip for node in config.nodes] shrink = False if cluster_ip in master_node_ips: shrink = True master_node_ips.remove(cluster_ip) if len(master_node_ips) == 0: raise RuntimeError( 'There should be at least one other master node') storagerouter = StorageRouterList.get_by_machine_id(unique_id) storagerouter.node_type = 'EXTRA' storagerouter.save() offline_node_ips = [node.ip for node in offline_nodes] if arakoon_metadata['internal'] is True and shrink is True: Toolbox.log(logger=NodeTypeController._logger, messages='Leaving Arakoon {0} cluster'.format( arakoon_cluster_name)) arakoon_installer = ArakoonInstaller( cluster_name=arakoon_cluster_name) arakoon_installer.load() arakoon_installer.shrink_cluster(removal_ip=cluster_ip, offline_nodes=offline_node_ips) arakoon_installer.restart_cluster_after_shrinking() try: external_config = Configuration.get( '/ovs/framework/external_config') if external_config is None and shrink is True: Toolbox.log(logger=NodeTypeController._logger, messages='Leaving Arakoon config cluster') arakoon_installer = ArakoonInstaller(cluster_name='config') arakoon_installer.load(ip=master_node_ips[0]) arakoon_installer.shrink_cluster( removal_ip=cluster_ip, offline_nodes=offline_node_ips) arakoon_installer.restart_cluster_after_shrinking() except Exception as ex: Toolbox.log( logger=NodeTypeController._logger, messages=['\nFailed to leave configuration cluster', ex], loglevel='exception') Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations') try: if unconfigure_memcached is True: endpoints = Configuration.get( '/ovs/framework/memcache|endpoints') endpoint = '{0}:{1}'.format(cluster_ip, 11211) if endpoint in endpoints: endpoints.remove(endpoint) Configuration.set('/ovs/framework/memcache|endpoints', endpoints) if unconfigure_rabbitmq is True: endpoints = Configuration.get( '/ovs/framework/messagequeue|endpoints') endpoint = '{0}:{1}'.format(cluster_ip, 5672) if endpoint in endpoints: endpoints.remove(endpoint) Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints) except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=['\nFailed to update configurations', ex], loglevel='exception') if arakoon_metadata['internal'] is True: Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services') remaining_nodes = ip_client_map.keys()[:] if cluster_ip in remaining_nodes: remaining_nodes.remove(cluster_ip) PersistentFactory.store = None VolatileFactory.store = None for service in storagerouter.services: if service.name == 'arakoon-ovsdb': service.delete() target_client = None if storagerouter in offline_nodes: if unconfigure_rabbitmq is True: Toolbox.log( logger=NodeTypeController._logger, messages='Removing/unconfiguring offline RabbitMQ node') client = ip_client_map[master_ip] try: client.run([ 'rabbitmqctl', 'forget_cluster_node', 'rabbit@{0}'.format(storagerouter.name) ]) except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=[ '\nFailed to forget RabbitMQ cluster node', ex ], loglevel='exception') else: target_client = ip_client_map[cluster_ip] if unconfigure_rabbitmq is True: Toolbox.log(logger=NodeTypeController._logger, messages='Removing/unconfiguring RabbitMQ') try: if service_manager.has_service('rabbitmq-server', client=target_client): ServiceFactory.change_service_state( target_client, 'rabbitmq-server', 'stop', NodeTypeController._logger) target_client.run(['rabbitmq-server', '-detached']) time.sleep(5) target_client.run(['rabbitmqctl', 'stop_app']) time.sleep(5) target_client.run(['rabbitmqctl', 'reset']) time.sleep(5) target_client.run(['rabbitmqctl', 'stop']) time.sleep(5) target_client.file_unlink( "/var/lib/rabbitmq/.erlang.cookie") ServiceFactory.change_service_state( target_client, 'rabbitmq-server', 'stop', NodeTypeController._logger) # To be sure except Exception as ex: Toolbox.log(logger=NodeTypeController._logger, messages=[ '\nFailed to remove/unconfigure RabbitMQ', ex ], loglevel='exception') Toolbox.log(logger=NodeTypeController._logger, messages='Stopping services') services = ['memcached', 'rabbitmq-server'] if unconfigure_rabbitmq is False: services.remove('rabbitmq-server') if unconfigure_memcached is False: services.remove('memcached') for service in services: if service_manager.has_service(service, client=target_client): Toolbox.log( logger=NodeTypeController._logger, messages='Stopping service {0}'.format(service)) try: ServiceFactory.change_service_state( target_client, service, 'stop', NodeTypeController._logger) except Exception as ex: Toolbox.log( logger=NodeTypeController._logger, messages=[ '\nFailed to stop service'.format(service), ex ], loglevel='exception') Toolbox.log(logger=NodeTypeController._logger, messages='Removing services') services = [ 'scheduled-tasks', 'webapp-api', 'volumerouter-consumer' ] for service in services: if service_manager.has_service(service, client=target_client): Toolbox.log( logger=NodeTypeController._logger, messages='Removing service {0}'.format(service)) try: ServiceFactory.change_service_state( target_client, service, 'stop', NodeTypeController._logger) service_manager.remove_service(service, client=target_client) except Exception as ex: Toolbox.log( logger=NodeTypeController._logger, messages=[ '\nFailed to remove service'.format(service), ex ], loglevel='exception') if service_manager.has_service('workers', client=target_client): service_manager.add_service( name='workers', client=target_client, params={'WORKER_QUEUE': '{0}'.format(unique_id)}) try: NodeTypeController._configure_amqp_to_volumedriver() except Exception as ex: Toolbox.log( logger=NodeTypeController._logger, messages=['\nFailed to configure AMQP to Storage Driver', ex], loglevel='exception') Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services') NodeTypeController.restart_framework_and_memcache_services( clients=ip_client_map, logger=NodeTypeController._logger, offline_node_ips=offline_node_ips) if Toolbox.run_hooks(component='nodetype', sub_component='demote', logger=NodeTypeController._logger, cluster_ip=cluster_ip, master_ip=master_ip, offline_node_ips=offline_node_ips): Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services') NodeTypeController.restart_framework_and_memcache_services( clients=ip_client_map, logger=NodeTypeController._logger, offline_node_ips=offline_node_ips) if storagerouter not in offline_nodes: target_client = ip_client_map[cluster_ip] node_name, _ = target_client.get_hostname() if NodeTypeController.avahi_installed( client=target_client, logger=NodeTypeController._logger) is True: NodeTypeController.configure_avahi( client=target_client, node_name=node_name, node_type='extra', logger=NodeTypeController._logger) Configuration.set( '/ovs/framework/hosts/{0}/type'.format(storagerouter.machine_id), 'EXTRA') if target_client is not None and target_client.file_exists( '/tmp/ovs_rollback'): target_client.file_write('/tmp/ovs_rollback', 'rollback') Toolbox.log(logger=NodeTypeController._logger, messages='Demote complete', title=True)
def promote_node(cluster_ip, master_ip, ip_client_map, unique_id, configure_memcached, configure_rabbitmq): """ Promotes a given node """ from ovs.dal.lists.storagerouterlist import StorageRouterList from ovs.dal.lists.servicetypelist import ServiceTypeList from ovs.dal.lists.servicelist import ServiceList from ovs.dal.hybrids.service import Service Toolbox.log(logger=NodeTypeController._logger, messages='Promoting node', title=True) service_manager = ServiceFactory.get_manager() if configure_memcached is True: if NodeTypeController._validate_local_memcache_servers( ip_client_map) is False: raise RuntimeError( 'Not all memcache nodes can be reached which is required for promoting a node.' ) target_client = ip_client_map[cluster_ip] machine_id = System.get_my_machine_id(target_client) node_name, _ = target_client.get_hostname() master_client = ip_client_map[master_ip] storagerouter = StorageRouterList.get_by_machine_id(unique_id) storagerouter.node_type = 'MASTER' storagerouter.save() external_config = Configuration.get('/ovs/framework/external_config') if external_config is None: Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon configuration cluster') arakoon_installer = ArakoonInstaller(cluster_name='config') arakoon_installer.load(ip=master_ip) arakoon_installer.extend_cluster( new_ip=cluster_ip, base_dir=Configuration.get('/ovs/framework/paths|ovsdb')) arakoon_installer.restart_cluster_after_extending( new_ip=cluster_ip) service_manager.register_service( node_name=machine_id, service_metadata=arakoon_installer.service_metadata[cluster_ip] ) # Find other (arakoon) master nodes arakoon_cluster_name = str( Configuration.get('/ovs/framework/arakoon_clusters|ovsdb')) arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name( cluster_name=arakoon_cluster_name) config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name) master_node_ips = [node.ip for node in config.nodes] if cluster_ip in master_node_ips: master_node_ips.remove(cluster_ip) if len(master_node_ips) == 0: raise RuntimeError( 'There should be at least one other master node') arakoon_ports = [] if arakoon_metadata['internal'] is True: Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon OVS DB cluster') arakoon_installer = ArakoonInstaller( cluster_name=arakoon_cluster_name) arakoon_installer.load() arakoon_installer.extend_cluster( new_ip=cluster_ip, base_dir=Configuration.get('/ovs/framework/paths|ovsdb')) arakoon_installer.restart_cluster_after_extending( new_ip=cluster_ip) arakoon_ports = arakoon_installer.ports[cluster_ip] if configure_memcached is True: NodeTypeController.configure_memcached( client=target_client, logger=NodeTypeController._logger) NodeTypeController.add_services(client=target_client, node_type='master', logger=NodeTypeController._logger) Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations') if configure_memcached is True: endpoints = Configuration.get('/ovs/framework/memcache|endpoints') endpoint = '{0}:11211'.format(cluster_ip) if endpoint not in endpoints: endpoints.append(endpoint) Configuration.set('/ovs/framework/memcache|endpoints', endpoints) if configure_rabbitmq is True: endpoints = Configuration.get( '/ovs/framework/messagequeue|endpoints') endpoint = '{0}:5672'.format(cluster_ip) if endpoint not in endpoints: endpoints.append(endpoint) Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints) if arakoon_metadata['internal'] is True: Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services') PersistentFactory.store = None VolatileFactory.store = None if 'arakoon-ovsdb' not in [ s.name for s in ServiceList.get_services() if s.is_internal is False or s.storagerouter.ip == cluster_ip ]: service = Service() service.name = 'arakoon-ovsdb' service.type = ServiceTypeList.get_by_name( ServiceType.SERVICE_TYPES.ARAKOON) service.ports = arakoon_ports service.storagerouter = storagerouter service.save() if configure_rabbitmq is True: NodeTypeController.configure_rabbitmq( client=target_client, logger=NodeTypeController._logger) # Copy rabbitmq cookie rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie' Toolbox.log(logger=NodeTypeController._logger, messages='Copying RabbitMQ cookie') contents = master_client.file_read(rabbitmq_cookie_file) master_hostname, _ = master_client.get_hostname() target_client.dir_create(os.path.dirname(rabbitmq_cookie_file)) target_client.file_write(rabbitmq_cookie_file, contents) target_client.file_chmod(rabbitmq_cookie_file, mode=0400) target_client.run(['rabbitmq-server', '-detached']) time.sleep(5) target_client.run(['rabbitmqctl', 'stop_app']) time.sleep(5) target_client.run([ 'rabbitmqctl', 'join_cluster', 'rabbit@{0}'.format(master_hostname) ]) time.sleep(5) target_client.run(['rabbitmqctl', 'stop']) time.sleep(5) # Enable HA for the rabbitMQ queues ServiceFactory.change_service_state(target_client, 'rabbitmq-server', 'start', NodeTypeController._logger) NodeTypeController.check_rabbitmq_and_enable_ha_mode( client=target_client, logger=NodeTypeController._logger) NodeTypeController._configure_amqp_to_volumedriver() Toolbox.log(logger=NodeTypeController._logger, messages='Starting services') services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server'] if arakoon_metadata['internal'] is True: services.remove('arakoon-ovsdb') for service in services: if service_manager.has_service(service, client=target_client): ServiceFactory.change_service_state(target_client, service, 'start', NodeTypeController._logger) Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services') NodeTypeController.restart_framework_and_memcache_services( clients=ip_client_map, logger=NodeTypeController._logger) if Toolbox.run_hooks(component='nodetype', sub_component='promote', logger=NodeTypeController._logger, cluster_ip=cluster_ip, master_ip=master_ip): Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services') NodeTypeController.restart_framework_and_memcache_services( clients=ip_client_map, logger=NodeTypeController._logger) if NodeTypeController.avahi_installed( client=target_client, logger=NodeTypeController._logger) is True: NodeTypeController.configure_avahi( client=target_client, node_name=node_name, node_type='master', logger=NodeTypeController._logger) Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id), 'MASTER') target_client.run( ['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config']) Configuration.set( '/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id), True) if target_client.file_exists('/tmp/ovs_rollback'): target_client.file_delete('/tmp/ovs_rollback') Toolbox.log(logger=NodeTypeController._logger, messages='Promote complete')