def _get_arakoon_clusters(cls, result_handler):
    """
    Retrieves all Arakoon clusters registered in this OVSCluster

    Every cluster listed under '/ovs/arakoon' is inspected, followed by the 'cacc' cluster. The config of the 'cacc'
    cluster is read from the file at Configuration.CACC_LOCATION, all other configs are loaded by ArakoonClusterConfig.
    A cluster for which no client can be built is reported through the result handler and left out of the result.
    :param result_handler: Logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: Dict with the Arakoon cluster types as key and list with dicts which contain cluster names and pyrakoon clients
    :rtype: dict(str, list[dict])
    """
    result_handler.info('Fetching available arakoon clusters.', add_to_result=False)
    arakoon_clusters = {}
    for cluster_name in list(Configuration.list('/ovs/arakoon')) + ['cacc']:
        # Determine Arakoon type
        is_cacc = cluster_name == 'cacc'
        arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name, load_config=not is_cacc)
        if is_cacc is True:
            with open(Configuration.CACC_LOCATION) as config_file:
                contents = config_file.read()
            arakoon_config.read_config(contents=contents)
        try:
            arakoon_client = ArakoonInstaller.build_client(arakoon_config)
        except (ArakoonNoMaster, ArakoonNoMasterResult) as ex:
            result_handler.failure('Unable to find a master for Arakoon cluster {0}. (Message: {1})'.format(cluster_name, str(ex)),
                                   code=ErrorCodes.master_none)
            # No client was built: skip this cluster. Falling through would either hit an unbound 'arakoon_client'
            # or register the client of the previously inspected cluster under this cluster's name.
            continue
        except Exception as ex:
            msg = 'Unable to connect to Arakoon cluster {0}. (Message: {1})'.format(cluster_name, str(ex))
            result_handler.exception(msg, code=ErrorCodes.unhandled_exception)
            cls.logger.exception(msg)
            continue
        # The cluster type is stored by the cluster itself, under the metadata key
        metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY))
        cluster_type = metadata['cluster_type']
        arakoon_clusters.setdefault(cluster_type, []).append({'cluster_name': cluster_name,
                                                              'client': arakoon_client,
                                                              'config': arakoon_config})
    return arakoon_clusters
def _get_arakoon_clusters(cls, result_handler):
    """
    Retrieves all Arakoon clusters registered in this OVSCluster

    The clusters listed under '/ovs/arakoon' are processed first, the 'cacc' cluster last. For 'cacc' the config
    contents are read from Configuration.CACC_LOCATION. Clusters without a usable client are reported via the
    result handler and are not part of the returned dict.
    :param result_handler: Logging object
    :type result_handler: ovs.extensions.healthcheck.result.HCResults
    :return: Dict with the Arakoon cluster types as key and list with dicts which contain cluster names and pyrakoon clients
    :rtype: dict(str, list[dict])
    """
    result_handler.info('Fetching available arakoon clusters.', add_to_result=False)
    arakoon_clusters = {}
    cluster_names = list(Configuration.list('/ovs/arakoon')) + ['cacc']
    for cluster_name in cluster_names:
        # Determine Arakoon type
        is_cacc = cluster_name == 'cacc'
        arakoon_config = ArakoonClusterConfig(cluster_id=cluster_name, load_config=not is_cacc)
        if is_cacc is True:
            with open(Configuration.CACC_LOCATION) as config_file:
                contents = config_file.read()
            arakoon_config.read_config(contents=contents)

        try:
            arakoon_client = ArakoonInstaller.build_client(arakoon_config)
        except (ArakoonNoMaster, ArakoonNoMasterResult) as ex:
            result_handler.failure(
                'Unable to find a master for Arakoon cluster {0}. (Message: {1})'.format(cluster_name, str(ex)),
                code=ErrorCodes.master_none)
            # Move on to the next cluster: there is no client to query for this one, and continuing below would
            # use an unbound name or the stale client of the previous iteration.
            continue
        except Exception as ex:
            msg = 'Unable to connect to Arakoon cluster {0}. (Message: {1})'.format(cluster_name, str(ex))
            result_handler.exception(msg, code=ErrorCodes.unhandled_exception)
            cls.logger.exception(msg)
            continue

        # Group the cluster under the type found in its own metadata
        metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY))
        cluster_info = {
            'cluster_name': cluster_name,
            'client': arakoon_client,
            'config': arakoon_config
        }
        arakoon_clusters.setdefault(metadata['cluster_type'], []).append(cluster_info)
    return arakoon_clusters
def services_running(self, target):
    """
    Check all services are running

    The executed checks depend on the target:
      * 'config' and 'framework': configuration store, including a refresh of the file at Configuration.CACC_LOCATION
      * 'framework': volatile store and persistent store
      * 'volumedriver': Arakoon (voldrv) cluster
      * 'framework' and 'volumedriver': RabbitMQ
    Any other target runs no checks at all and is reported as OK.
    The OVS extensions are imported inside the sections which need them, import errors are therefore handled like
    any other unexpected exception (logged, False returned).
    :param target: Target to check
    :return: Boolean: True when every executed check passed, False as soon as one fails or an unexpected exception occurs
    """
    try:
        key = 'ovs-watcher-{0}'.format(str(uuid.uuid4()))
        value = str(time.time())

        if target in ['config', 'framework']:
            self.log_message(target, 'Testing configuration store...', 0)
            from ovs.extensions.generic.configuration import Configuration
            try:
                Configuration.list('/')
            except Exception as ex:
                self.log_message(target, ' Error during configuration store test: {0}'.format(ex), 2)
                return False

            from ovs.extensions.db.arakooninstaller import ArakoonInstaller, ArakoonClusterConfig
            from ovs_extensions.db.arakoon.pyrakoon.pyrakoon.compat import NoGuarantee
            # Build a client from the local file, then fetch the config as stored inside the cluster itself
            with open(Configuration.CACC_LOCATION) as config_file:
                contents = config_file.read()
            config = ArakoonClusterConfig(cluster_id=Configuration.ARAKOON_NAME, load_config=False)
            config.read_config(contents=contents)
            client = ArakoonInstaller.build_client(config)
            contents = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY, consistency=NoGuarantee())
            if Watcher.LOG_CONTENTS != contents:
                try:
                    config.read_config(contents=contents)  # Validate whether the contents are not corrupt
                except Exception as ex:
                    self.log_message(target, ' Configuration stored in configuration store seems to be corrupt: {0}'.format(ex), 2)
                    return False
                # Write to a temporary file and rename it over the real one, so the file is replaced in one step
                temp_filename = '{0}~'.format(Configuration.CACC_LOCATION)
                with open(temp_filename, 'w') as config_file:
                    config_file.write(contents)
                    config_file.flush()
                    os.fsync(config_file)
                os.rename(temp_filename, Configuration.CACC_LOCATION)
                Watcher.LOG_CONTENTS = contents
            self.log_message(target, ' Configuration store OK', 0)

        if target == 'framework':
            # Volatile
            self.log_message(target, 'Testing volatile store...', 0)
            max_tries = 5
            tries = 0
            while tries < max_tries:
                try:
                    try:
                        # Silence the loggers while probing, restored in the finally below
                        logging.disable(logging.WARNING)
                        from ovs.extensions.storage.volatilefactory import VolatileFactory
                        VolatileFactory.store = None
                        volatile = VolatileFactory.get_client()
                        volatile.set(key, value)
                        if volatile.get(key) == value:
                            volatile.delete(key)
                            break
                        volatile.delete(key)
                    finally:
                        logging.disable(logging.NOTSET)
                except Exception as message:
                    self.log_message(target, ' Error during volatile store test: {0}'.format(message), 2)
                key = 'ovs-watcher-{0}'.format(str(uuid.uuid4()))  # Get another key
                time.sleep(1)
                tries += 1
            if tries == max_tries:
                self.log_message(target, ' Volatile store not working correctly', 2)
                return False
            self.log_message(target, ' Volatile store OK after {0} tries'.format(tries), 0)

            # Persistent
            self.log_message(target, 'Testing persistent store...', 0)
            max_tries = 5
            tries = 0
            while tries < max_tries:
                try:
                    try:
                        logging.disable(logging.WARNING)
                        persistent = PersistentFactory.get_client()
                        persistent.nop()
                        break
                    finally:
                        logging.disable(logging.NOTSET)
                except Exception as message:
                    self.log_message(target, ' Error during persistent store test: {0}'.format(message), 2)
                time.sleep(1)
                tries += 1
            if tries == max_tries:
                self.log_message(target, ' Persistent store not working correctly', 2)
                return False
            self.log_message(target, ' Persistent store OK after {0} tries'.format(tries), 0)

        if target == 'volumedriver':
            # Arakoon, voldrv cluster
            self.log_message(target, 'Testing arakoon (voldrv)...', 0)
            max_tries = 5
            tries = 0
            while tries < max_tries:
                try:
                    from ovs.extensions.generic.configuration import Configuration
                    from ovs_extensions.storage.persistent.pyrakoonstore import PyrakoonStore
                    cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|voldrv'))
                    configuration = Configuration.get('/ovs/arakoon/{0}/config'.format(cluster_name), raw=True)
                    client = PyrakoonStore(cluster=cluster_name, configuration=configuration)
                    client.nop()
                    break
                except Exception as message:
                    self.log_message(target, ' Error during arakoon (voldrv) test: {0}'.format(message), 2)
                time.sleep(1)
                tries += 1
            if tries == max_tries:
                self.log_message(target, ' Arakoon (voldrv) not working correctly', 2)
                return False
            self.log_message(target, ' Arakoon (voldrv) OK', 0)

        if target in ['framework', 'volumedriver']:
            # RabbitMQ
            self.log_message(target, 'Test rabbitMQ...', 0)
            import pika
            from ovs.extensions.generic.configuration import Configuration
            messagequeue = Configuration.get('/ovs/framework/messagequeue')
            rmq_servers = messagequeue['endpoints']
            good_node = False
            # Every endpoint is probed (no early exit) so each broken node gets logged; one working node suffices
            for server in rmq_servers:
                try:
                    connection_string = '{0}://{1}:{2}@{3}/%2F'.format(messagequeue['protocol'],
                                                                       messagequeue['user'],
                                                                       messagequeue['password'],
                                                                       server)
                    connection = pika.BlockingConnection(pika.URLParameters(connection_string))
                    try:
                        channel = connection.channel()
                        channel.basic_publish('', 'ovs-watcher', str(time.time()),
                                              pika.BasicProperties(content_type='text/plain', delivery_mode=1))
                    finally:
                        # Close the connection when opening the channel or publishing fails too, to not leak it
                        connection.close()
                    good_node = True
                except Exception as message:
                    self.log_message(target, ' Error during rabbitMQ test on node {0}: {1}'.format(server, message), 2)
            if good_node is False:
                self.log_message(target, ' No working rabbitMQ node could be found', 2)
                return False
            self.log_message(target, ' RabbitMQ test OK', 0)
        self.log_message(target, 'All tests OK', 0)
        return True
    except Exception as ex:
        self.log_message(target, 'Unexpected exception: {0}'.format(ex), 2)
        return False
def promote_or_demote_node(node_action, cluster_ip=None, execute_rollback=False):
    """
    Promotes or demotes the local node

    When demoting with a cluster_ip, the node with that IP is demoted, which is only accepted when that node cannot
    be reached. Apart from the node_action validation, every failure (and a keyboard interrupt) is logged and ends
    the process with exit code 1.
    :param node_action: Demote or promote
    :type node_action: str
    :param cluster_ip: IP of node to promote or demote
    :type cluster_ip: str
    :param execute_rollback: In case of failure revert the changes made
    :type execute_rollback: bool
    :raises ValueError: When node_action is neither 'promote' nor 'demote'
    :return: None
    """
    if node_action not in ('promote', 'demote'):
        raise ValueError('Nodes can only be promoted or demoted')

    logger = NodeTypeController._logger
    action_label = node_action.capitalize()
    Toolbox.log(logger=logger, messages='Open vStorage Setup - {0}'.format(action_label), boxed=True)
    try:
        Toolbox.log(logger=logger, messages='Collecting information', title=True)

        machine_id = System.get_my_machine_id()
        setup_completed = Configuration.get('/ovs/framework/hosts/{0}/setupcompleted'.format(machine_id))
        if setup_completed is False:
            raise RuntimeError('No local OVS setup found.')

        if cluster_ip:
            if not re.match(Toolbox.regex_ip, cluster_ip):
                raise RuntimeError('Incorrect IP provided ({0})'.format(cluster_ip))
            remote_client = SSHClient(endpoint=cluster_ip)
            machine_id = System.get_my_machine_id(remote_client)

        # Validate the requested action against the current type of the node
        node_type = Configuration.get('/ovs/framework/hosts/{0}/type'.format(machine_id))
        if node_action == 'promote' and node_type == 'MASTER':
            raise RuntimeError('This node is already master.')
        if node_action == 'demote' and node_type == 'EXTRA':
            raise RuntimeError('This node should be a master.')
        if node_type not in ['MASTER', 'EXTRA']:
            raise RuntimeError('This node is not correctly configured.')

        master_ip = None
        offline_nodes = []
        online = True
        target_client = None
        if node_action == 'demote' and cluster_ip:  # Demote an offline node
            from ovs.dal.lists.storagerouterlist import StorageRouterList
            from ovs.lib.storagedriver import StorageDriverController

            ip = cluster_ip
            unique_id = None
            ip_client_map = {}
            for router in StorageRouterList.get_storagerouters():
                try:
                    router_client = SSHClient(router.ip, username='******')
                    if router.node_type == 'MASTER':
                        master_ip = router.ip
                    ip_client_map[router.ip] = router_client
                except UnableToConnectException:
                    # Unreachable nodes are collected; the node to demote is additionally marked offline
                    if router.ip == cluster_ip:
                        online = False
                        unique_id = router.machine_id
                        StorageDriverController.mark_offline(storagerouter_guid=router.guid)
                    offline_nodes.append(router)
            if online is True:
                raise RuntimeError("If the node is online, please use 'ovs setup demote' executed on the node you wish to demote")
            if master_ip is None:
                raise RuntimeError('Failed to retrieve another responsive MASTER node')
        else:
            # Promote or demote the local node
            target_password = Toolbox.ask_validate_password(ip='127.0.0.1', logger=NodeTypeController._logger)
            target_client = SSHClient('127.0.0.1', username='******', password=target_password)

            unique_id = System.get_my_machine_id(target_client)
            ip = Configuration.get('/ovs/framework/hosts/{0}/ip'.format(unique_id))

            storagerouter_info = NodeTypeController.retrieve_storagerouter_info_via_host(ip=target_client.ip,
                                                                                         password=target_password)
            node_ips = [info['ip'] for info in storagerouter_info.itervalues()]
            # Masters other than the node being processed
            master_node_ips = []
            for info in storagerouter_info.itervalues():
                if info['type'] == 'master' and info['ip'] != ip:
                    master_node_ips.append(info['ip'])
            if not master_node_ips:
                if node_action == 'promote':
                    raise RuntimeError('No master node could be found')
                raise RuntimeError('It is not possible to remove the only master')

            master_ip = master_node_ips[0]
            ip_client_map = {}
            for node_ip in node_ips:
                ip_client_map[node_ip] = SSHClient(node_ip, username='******')

        if node_action == 'demote':
            for cluster_name in Configuration.list('/ovs/arakoon'):
                config = ArakoonClusterConfig(cluster_id=cluster_name)
                arakoon_client = ArakoonInstaller.build_client(config)
                metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY))
                if len(config.nodes) != 1:
                    continue
                if config.nodes[0].ip == ip and metadata.get('internal') is True:
                    raise RuntimeError('Demote is not supported when single node Arakoon cluster(s) are present on the node to be demoted.')

        configure_rabbitmq = Toolbox.is_service_internally_managed(service='rabbitmq')
        configure_memcached = Toolbox.is_service_internally_managed(service='memcached')

        # Promote and demote are each other's rollback, so prepare the arguments of both up front
        shared_kwargs = {'cluster_ip': ip,
                         'master_ip': master_ip,
                         'ip_client_map': ip_client_map,
                         'unique_id': unique_id}
        promote_kwargs = dict(shared_kwargs,
                              configure_memcached=configure_memcached,
                              configure_rabbitmq=configure_rabbitmq)
        demote_kwargs = dict(shared_kwargs,
                             unconfigure_memcached=configure_memcached,
                             unconfigure_rabbitmq=configure_rabbitmq,
                             offline_nodes=offline_nodes)
        if node_action == 'promote':
            apply_change, apply_kwargs = NodeTypeController.promote_node, promote_kwargs
            revert_change, revert_kwargs = NodeTypeController.demote_node, demote_kwargs
            rollback_marker = 'demote'
        else:
            apply_change, apply_kwargs = NodeTypeController.demote_node, demote_kwargs
            revert_change, revert_kwargs = NodeTypeController.promote_node, promote_kwargs
            rollback_marker = 'promote'

        try:
            apply_change(**apply_kwargs)
        except Exception:
            if execute_rollback is True:
                revert_change(**revert_kwargs)
            elif target_client is not None:
                # No rollback requested: leave a marker file behind containing the action which would revert
                target_client.file_write('/tmp/ovs_rollback', rollback_marker)
            raise

        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger, messages='{0} complete.'.format(action_label), boxed=True)
    except Exception as exception:
        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger,
                    messages=['An unexpected error occurred:', str(exception)],
                    boxed=True,
                    loglevel='exception')
        sys.exit(1)
    except KeyboardInterrupt:
        Toolbox.log(logger=logger, messages='\n')
        Toolbox.log(logger=logger,
                    messages='This setup was aborted. Open vStorage may be in an inconsistent state, make sure to validate the installation.',
                    boxed=True,
                    loglevel='error')
        sys.exit(1)
def services_running(self):
    # type: () -> bool
    """
    Check if all services are running

    The executed checks depend on self.target:
      * WatcherTypes.CONFIG and WatcherTypes.FWK: configuration store, including a refresh of the file at CACC_LOCATION
      * WatcherTypes.FWK: volatile store and persistent store
      * WatcherTypes.VOLDRV: Arakoon (voldrv) cluster
      * WatcherTypes.FWK and WatcherTypes.VOLDRV: RabbitMQ
    :return: Boolean: True when every executed check passed, False as soon as one fails or an unexpected exception occurs
    """
    try:
        key = 'ovs-watcher-{0}'.format(str(uuid.uuid4()))
        value = str(time.time())

        if self.target in [WatcherTypes.CONFIG, WatcherTypes.FWK]:
            self.log_message('Testing configuration store...')
            try:
                Configuration.list('/')
            except Exception as ex:
                self.log_message(' Error during configuration store test: {0}'.format(ex), 2)
                return False

            # Build a client from the local file, then fetch the config as stored inside the cluster itself
            with open(CACC_LOCATION) as config_file:
                contents = config_file.read()
            config = ArakoonClusterConfig(cluster_id=ARAKOON_NAME, load_config=False)
            config.read_config(contents=contents)
            client = ArakoonInstaller.build_client(config)
            contents = client.get(ArakoonInstaller.INTERNAL_CONFIG_KEY, consistency=NoGuarantee())
            if Watcher.LOG_CONTENTS != contents:
                try:
                    config.read_config(contents=contents)  # Validate whether the contents are not corrupt
                except Exception as ex:
                    self.log_message(' Configuration stored in configuration store seems to be corrupt: {0}'.format(ex), 2)
                    return False
                # Write to a temporary file and rename it over the real one, so the file is replaced in one step
                temp_filename = '{0}~'.format(CACC_LOCATION)
                with open(temp_filename, 'w') as config_file:
                    config_file.write(contents)
                    config_file.flush()
                    os.fsync(config_file)
                os.rename(temp_filename, CACC_LOCATION)
                Watcher.LOG_CONTENTS = contents
            self.log_message(' Configuration store OK', 0)

        # NOTE(review): _test_store is defined outside this block and is assumed to report a failing store by
        # returning False - confirm. The explicit 'is False' comparison keeps helpers working which return
        # None/True on success or which raise on failure.
        if self.target == WatcherTypes.FWK:
            if self._test_store('volatile', key, value) is False:
                return False
            if self._test_store('persistent') is False:
                return False

        if self.target == WatcherTypes.VOLDRV:
            # Arakoon, voldrv cluster
            if self._test_store('arakoon_voldrv') is False:
                return False

        if self.target in [WatcherTypes.FWK, WatcherTypes.VOLDRV]:
            # RabbitMQ
            self.log_message('Test rabbitMQ...', 0)
            messagequeue = Configuration.get('/ovs/framework/messagequeue')
            rmq_servers = messagequeue['endpoints']
            good_node = False
            # Every endpoint is probed (no early exit) so each broken node gets logged; one working node suffices
            for server in rmq_servers:
                try:
                    connection_string = '{0}://{1}:{2}@{3}/%2F'.format(messagequeue['protocol'],
                                                                       messagequeue['user'],
                                                                       messagequeue['password'],
                                                                       server)
                    connection = pika.BlockingConnection(pika.URLParameters(connection_string))
                    try:
                        channel = connection.channel()
                        channel.basic_publish('', 'ovs-watcher', str(time.time()),
                                              pika.BasicProperties(content_type='text/plain', delivery_mode=1))
                    finally:
                        # Close the connection when opening the channel or publishing fails too, to not leak it
                        connection.close()
                    good_node = True
                except Exception as message:
                    self.log_message(' Error during rabbitMQ test on node {0}: {1}'.format(server, message), 2)
            if good_node is False:
                self.log_message(' No working rabbitMQ node could be found', 2)
                return False
            self.log_message(' RabbitMQ test OK')
        self.log_message('All tests OK')
        return True
    except Exception as ex:
        self.log_message('Unexpected exception: {0}'.format(ex), 2)
        return False