def install_plugins():
    """
    (Re)load plugins.

    If the ovs-watcher-framework service is running, 'ovs setup' was already executed,
    so every node's framework services are restarted to pick up the new plugin and the
    DAL migrations / postinstall hooks are run. Otherwise nothing is done here: the
    plugin will be loaded once 'ovs setup' is executed.
    :raises RuntimeError: when a StorageRouter is unreachable or a service refuses to
                          change state within 30 seconds
    """
    def _await_status(service_name, client, running):
        # Poll up to 30 seconds until the service reaches the requested state.
        # Extracted helper: the original duplicated this wait loop for stop and start.
        wait = 30
        while wait > 0:
            if ServiceManager.get_service_status(service_name, client=client) is running:
                return
            time.sleep(1)
            wait -= 1
        raise RuntimeError('Could not {0} service: {1}'.format('start' if running else 'stop', service_name))

    if ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
        # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
        # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        clients = []
        try:
            for storagerouter in StorageRouterList.get_storagerouters():
                clients.append(SSHClient(storagerouter, username='******'))
        except UnableToConnectException:
            raise RuntimeError('Not all StorageRouters are reachable')

        # Stop services on every node first, then start them again everywhere,
        # so memcached contents are rebuilt consistently across the cluster.
        for client in clients:
            for service_name in ['watcher-framework', 'memcached']:
                ServiceManager.stop_service(service_name, client=client)
                _await_status(service_name, client, running=False)
        for client in clients:
            for service_name in ['memcached', 'watcher-framework']:
                ServiceManager.start_service(service_name, client=client)
                _await_status(service_name, client, running=True)

        # NOTE(review): placement of the migration/hook section relative to the 'if' is
        # inferred from the collapsed source; it requires a set-up system, so it lives here.
        from ovs.dal.helpers import Migration
        Migration.migrate()
        from ovs.lib.helpers.toolbox import Toolbox
        ip = System.get_my_storagerouter().ip
        functions = Toolbox.fetch_hooks('plugin', 'postinstall')
        for function in functions:
            function(ip=ip)
def stop(cluster_name, client):
    """ Stops an arakoon service """
    service_name = 'arakoon-{0}'.format(cluster_name)
    # Only attempt a stop when the service exists and is actually running
    if ServiceManager.has_service(service_name, client=client) is True:
        if ServiceManager.get_service_status(service_name, client=client) is True:
            ServiceManager.stop_service(service_name, client=client)
def change_service_state(client, name, state, logger=None): """ Starts/stops/restarts a service :param client: SSHClient on which to connect and change service state :param name: Name of the service :param state: State to put the service in :param logger: LogHandler Object """ action = None status, _ = ServiceManager.get_service_status(name, client=client) if status is False and state in ['start', 'restart']: if logger is not None: logger.debug(' {0:<15} - Starting service {1}'.format(client.ip, name)) ServiceManager.start_service(name, client=client) action = 'started' elif status is True and state == 'stop': if logger is not None: logger.debug(' {0:<15} - Stopping service {1}'.format(client.ip, name)) ServiceManager.stop_service(name, client=client) action = 'stopped' elif status is True and state == 'restart': if logger is not None: logger.debug(' {0:<15} - Restarting service {1}'.format(client.ip, name)) ServiceManager.restart_service(name, client=client) action = 'restarted' if action is None: print ' [{0}] {1} already {2}'.format(client.ip, name, 'running' if status is True else 'halted') else: logger.debug(' {0:<15} - Service {1} {2}'.format(client.ip, name, action)) print ' [{0}] {1} {2}'.format(client.ip, name, action)
def _setup_proxy(initial_cluster, slave_client, cluster_name, force=False): base_name = 'ovs-etcd-proxy' target_name = 'ovs-etcd-{0}'.format(cluster_name) if force is False and ServiceManager.has_service(target_name, slave_client) and \ ServiceManager.get_service_status(target_name, slave_client) is True: logger.info('Service {0} already configured and running'.format(target_name)) return EtcdInstaller.stop(cluster_name, slave_client) data_dir = EtcdInstaller.DATA_DIR.format(EtcdInstaller.DB_DIR, cluster_name) wal_dir = EtcdInstaller.WAL_DIR.format(EtcdInstaller.DB_DIR, cluster_name) abs_paths = [data_dir, wal_dir] slave_client.dir_delete(abs_paths) slave_client.dir_create(data_dir) slave_client.dir_chmod(data_dir, 0755, recursive=True) slave_client.dir_chown(data_dir, 'ovs', 'ovs', recursive=True) ServiceManager.add_service(base_name, slave_client, params={'CLUSTER': cluster_name, 'DATA_DIR': data_dir, 'LOCAL_CLIENT_URL': EtcdInstaller.CLIENT_URL.format('127.0.0.1'), 'INITIAL_CLUSTER': initial_cluster}, target_name=target_name) EtcdInstaller.start(cluster_name, slave_client) EtcdInstaller.wait_for_cluster(cluster_name, slave_client)
def install_plugins():
    """
    (Re)load plugins.

    When the ovs-watcher-framework service is present, 'ovs setup' already ran: the
    framework services on every node are bounced so the new plugin gets loaded, after
    which DAL migrations and plugin postinstall hooks run. Otherwise nothing happens
    here; the plugin is loaded when 'ovs setup' is executed.
    :raises RuntimeError: when a StorageRouter is unreachable or a service does not
                          reach the requested state within 30 seconds
    """
    def _wait_until(service_name, client, running):
        # Poll the service state once a second, for at most 30 seconds.
        # Replaces the two copy-pasted wait loops from the original implementation.
        remaining = 30
        while remaining > 0:
            if ServiceManager.get_service_status(service_name, client=client) is running:
                return
            time.sleep(1)
            remaining -= 1
        raise RuntimeError('Could not {0} service: {1}'.format('start' if running else 'stop', service_name))

    if ServiceManager.has_service('ovs-watcher-framework', SSHClient('127.0.0.1', username='******')):
        # If the watcher is running, 'ovs setup' was executed and we need to restart everything to load
        # the plugin. In the other case, the plugin will be loaded once 'ovs setup' is executed
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        clients = []
        try:
            for storagerouter in StorageRouterList.get_storagerouters():
                clients.append(SSHClient(storagerouter, username='******'))
        except UnableToConnectException:
            raise RuntimeError('Not all StorageRouters are reachable')

        # Stop on all nodes first, then start on all nodes
        for client in clients:
            for service_name in ['watcher-framework', 'memcached']:
                ServiceManager.stop_service(service_name, client=client)
                _wait_until(service_name, client, running=False)
        for client in clients:
            for service_name in ['memcached', 'watcher-framework']:
                ServiceManager.start_service(service_name, client=client)
                _wait_until(service_name, client, running=True)

        # NOTE(review): the collapsed source does not show whether this section is inside
        # the 'if'; it depends on a set-up system, so it is kept inside the branch.
        from ovs.dal.helpers import Migration
        Migration.migrate()
        from ovs.lib.helpers.toolbox import Toolbox
        ip = System.get_my_storagerouter().ip
        functions = Toolbox.fetch_hooks('plugin', 'postinstall')
        for function in functions:
            function(ip=ip)
def start(cluster_name, client):
    """ Starts an arakoon cluster """
    service_name = 'arakoon-{0}'.format(cluster_name)
    # Only issue a start when the service is currently not running
    if ServiceManager.get_service_status(service_name, client=client) is False:
        ServiceManager.start_service(service_name, client=client)
def get_service_status(name, client):
    """
    Check the status of the service
    :param name: Name of the service
    :param client: SSHClient object
    :return: True if service is running
    """
    # Thin pass-through to the ServiceManager implementation
    status = ServiceManager.get_service_status(name, client)
    return status
def stop(cluster_name, client):
    """ Stops an arakoon service """
    name = 'arakoon-{0}'.format(cluster_name)
    present = ServiceManager.has_service(name, client=client) is True
    # Short-circuit keeps the status query from running for an absent service
    if present and ServiceManager.get_service_status(name, client=client) is True:
        ServiceManager.stop_service(name, client=client)
def stop(cluster_name, client):
    """
    Stops an arakoon service
    :param client: Client on which to stop the service
    :param cluster_name: The name of the cluster service to stop
    """
    service = 'arakoon-{0}'.format(cluster_name)
    # Guard clauses: nothing to do when the service is absent or not running
    if ServiceManager.has_service(service, client=client) is not True:
        return
    if ServiceManager.get_service_status(service, client=client) is not True:
        return
    ServiceManager.stop_service(service, client=client)
def is_running(cluster_name, client):
    """
    Checks if arakoon service is running
    :param client: Client on which to stop the service
    :param cluster_name: The name of the cluster service to stop
    """
    service = 'arakoon-{0}'.format(cluster_name)
    if not ServiceManager.has_service(service, client=client):
        return False
    return ServiceManager.get_service_status(service, client=client)
def stop(cluster_name, client):
    """ Stops an arakoon service """
    svc = "arakoon-{0}".format(cluster_name)
    should_stop = (
        ServiceManager.has_service(svc, client=client) is True
        and ServiceManager.get_service_status(svc, client=client) is True
    )
    if should_stop:
        ServiceManager.stop_service(svc, client=client)
def start(cluster_name, client):
    """
    Starts an etcd cluster
    :param client: Client on which to start the service
    :param cluster_name: The name of the cluster service to start
    """
    service = 'etcd-{0}'.format(cluster_name)
    # Nested checks preserve short-circuit order: never query status of an absent service
    if ServiceManager.has_service(service, client=client) is True:
        if ServiceManager.get_service_status(service, client=client) is False:
            ServiceManager.start_service(service, client=client)
def manage_running_tasks(tasklist, timesleep=10):
    """
    Manage a list of running celery task - discard PENDING tasks after a certain timeout
    - validate STARTED tasks are actually running
    :param tasklist: Dictionary of tasks to wait {IP address: AsyncResult}
    :type tasklist: dict
    :param timesleep: Sleep between checks - for long running tasks it's better to sleep for a longer
                      period of time to reduce number of ssh calls
    :type timesleep: int
    :return: results
    :rtype: dict
    """
    logger = LogHandler.get('lib', name='celery toolbox')
    ssh_clients = {}
    tasks_pending = {}
    tasks_pending_timeout = 1800  # 30 minutes
    results = {}
    failed_nodes = []
    while len(tasklist.keys()) > 0:
        # Iterate over a snapshot: entries are deleted from tasklist inside the loop
        for ip, task in list(tasklist.items()):
            if task.state in ('SUCCESS', 'FAILURE'):
                logger.info('Task {0} finished: {1}'.format(task.id, task.state))
                results[ip] = task.get(propagate=False)
                del tasklist[ip]
            elif task.state == 'PENDING':
                if task.id not in tasks_pending:
                    tasks_pending[task.id] = time.time()
                else:
                    task_pending_since = tasks_pending[task.id]
                    # Revoke tasks which have been pending for longer than the timeout
                    if time.time() - task_pending_since > tasks_pending_timeout:
                        logger.warning('Task {0} is pending since {1} on node {2}. Task will be revoked'.format(task.id, datetime.datetime.fromtimestamp(task_pending_since), ip))
                        revoke(task.id)
                        del tasklist[ip]
                        del tasks_pending[task.id]
                        failed_nodes.append(ip)
            elif task.state == 'STARTED':
                if ip not in ssh_clients:
                    ssh_clients[ip] = SSHClient(ip, username='******')
                client = ssh_clients[ip]
                if ServiceManager.get_service_status('workers', client) is False:
                    # BUG FIX: the message previously said 'PENDING' although this branch handles STARTED tasks
                    logger.error('Service ovs-workers on node {0} appears halted while there is a task STARTED for it {1}. Task will be revoked.'.format(ip, task.id))
                    revoke(task.id)
                    del tasklist[ip]
                    failed_nodes.append(ip)
                else:
                    # Workers run locally, but verify the node is still reachable through rabbitmq
                    ping_result = task.app.control.inspect().ping()
                    storage_router = StorageRouterList.get_by_ip(ip)
                    if "celery@{0}".format(storage_router.name) not in ping_result:
                        logger.error('Service ovs-workers on node {0} is not reachable via rabbitmq while there is a task STARTED for it {1}. Task will be revoked.'.format(ip, task.id))
                        revoke(task.id)
                        del tasklist[ip]
                        failed_nodes.append(ip)
        if len(tasklist.keys()) > 0:
            time.sleep(timesleep)
    return results, failed_nodes
def manage_running_tasks(tasklist, timesleep=10):
    """
    Manage a list of running celery task - discard PENDING tasks after a certain timeout
    - validate STARTED tasks are actually running
    :param tasklist: Dictionary of tasks to wait {IP address: AsyncResult}
    :type tasklist: dict
    :param timesleep: Sleep between checks - for long running tasks it's better to sleep for a longer
                      period of time to reduce number of ssh calls
    :type timesleep: int
    :return: results
    :rtype: dict
    """
    logger = LogHandler.get('lib', name='celery toolbox')
    ssh_clients = {}
    tasks_pending = {}
    tasks_pending_timeout = 1800  # 30 minutes
    results = {}
    failed_nodes = []
    while len(tasklist.keys()) > 0:
        # Snapshot the items: tasklist is mutated (del) while looping
        for ip, task in list(tasklist.items()):
            if task.state in ('SUCCESS', 'FAILURE'):
                logger.info('Task {0} finished: {1}'.format(task.id, task.state))
                results[ip] = task.get(propagate=False)
                del tasklist[ip]
            elif task.state == 'PENDING':
                if task.id not in tasks_pending:
                    tasks_pending[task.id] = time.time()
                else:
                    task_pending_since = tasks_pending[task.id]
                    if time.time() - task_pending_since > tasks_pending_timeout:
                        logger.warning('Task {0} is pending since {1} on node {2}. Task will be revoked'.format(task.id, datetime.datetime.fromtimestamp(task_pending_since), ip))
                        revoke(task.id)
                        del tasklist[ip]
                        del tasks_pending[task.id]
                        failed_nodes.append(ip)
            elif task.state == 'STARTED':
                if ip not in ssh_clients:
                    ssh_clients[ip] = SSHClient(ip, username='******')
                client = ssh_clients[ip]
                if ServiceManager.get_service_status('workers', client) is False:
                    # BUG FIX: message said 'PENDING' but this branch handles STARTED tasks
                    logger.error('Service ovs-workers on node {0} appears halted while there is a task STARTED for it {1}. Task will be revoked.'.format(ip, task.id))
                    revoke(task.id)
                    del tasklist[ip]
                    failed_nodes.append(ip)
                else:
                    ping_result = task.app.control.inspect().ping()
                    storage_router = StorageRouterList.get_by_ip(ip)
                    if "celery@{0}".format(storage_router.name) not in ping_result:
                        logger.error('Service ovs-workers on node {0} is not reachable via rabbitmq while there is a task STARTED for it {1}. Task will be revoked.'.format(ip, task.id))
                        revoke(task.id)
                        del tasklist[ip]
                        failed_nodes.append(ip)
        if len(tasklist.keys()) > 0:
            time.sleep(timesleep)
    return results, failed_nodes
def start(cluster_name, client):
    """
    Starts an arakoon cluster
    :param client: Client on which to start the service
    :param cluster_name: The name of the cluster service to start
    """
    service = 'arakoon-{0}'.format(cluster_name)
    # Nested checks preserve evaluation order: existence first, then status
    if ServiceManager.has_service(service, client=client) is True:
        if ServiceManager.get_service_status(service, client=client) is False:
            ServiceManager.start_service(service, client=client)
def wait_for_service(client, name, status, logger):
    """
    Wait for service to enter status
    :param client: SSHClient to run commands
    :param name: name of service
    :param status: True - running/False - not running
    :param logger: Logging object
    """
    # Back-off grows by one second per attempt (1s, 2s, ... 10s), 10 attempts max
    for attempt in range(1, 11):
        current, _ = ServiceManager.get_service_status(name, client)
        if current == status:
            break
        logger.debug('... waiting for service {0}'.format(name))
        time.sleep(attempt)
    current, output = ServiceManager.get_service_status(name, client)
    if current != status:
        raise RuntimeError('Service {0} does not have expected status: {1}'.format(name, output))
def stop(cluster_name, client):
    """
    Stops an etcd service
    :param client: Client on which to stop the service
    :param cluster_name: The name of the cluster service to stop
    """
    service = 'etcd-{0}'.format(cluster_name)
    # Only stop a service that exists and is running
    if ServiceManager.has_service(service, client=client) is True:
        if ServiceManager.get_service_status(service, client=client) is True:
            ServiceManager.stop_service(service, client=client)
def is_running(cluster_name, client):
    """
    Checks if arakoon service is running
    :param cluster_name: The name of the cluster service to check
    :type cluster_name: str
    :param client: Client on which to check the service
    :type client: SSHClient
    :return: None
    """
    service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
    if not ServiceManager.has_service(name=service_name, client=client):
        return False
    # get_service_status returns a (status, output) tuple here; only the status matters
    return ServiceManager.get_service_status(name=service_name, client=client)[0]
def _is_cinder_running(self):
    # DevStack deployment: look for the cinder-volume process directly
    if self.is_devstack:
        try:
            ps_output = self.client.run('ps aux | grep cinder-volume | grep -v grep')
            return 'cinder-volume' in str(ps_output)
        except SystemExit:
            return False
    # Regular OpenStack: ask the service manager about the cinder service
    if self.is_openstack:
        try:
            service_name = OSManager.get_openstack_cinder_service_name()
            return ServiceManager.get_service_status(service_name, self.client)
        except SystemExit:
            return False
    # Neither deployment type detected
    return False
def wait_for_service(client, name, status, logger):
    """
    Wait for service to enter status
    :param client: SSHClient to run commands
    :param name: name of service
    :param status: True - running/False - not running
    :param logger: Logging object
    """
    attempts = 0
    # Up to 10 polls with a sleep that grows one second per failed attempt
    while attempts < 10:
        state, _ = ServiceManager.get_service_status(name, client)
        if state == status:
            break
        logger.debug('... waiting for service {0}'.format(name))
        attempts += 1
        time.sleep(attempts)
    state, output = ServiceManager.get_service_status(name, client)
    if state != status:
        raise RuntimeError('Service {0} does not have expected status: {1}'.format(name, output))
def create_cluster(cluster_name, ip, server_port=DEFAULT_SERVER_PORT, client_port=DEFAULT_CLIENT_PORT): """ Creates a cluster :param cluster_name: Name of the cluster :type cluster_name: str :param ip: IP address of the first node of the new cluster :type ip: str :param server_port: Port to be used by server :type server_port: int :param client_port: Port to be used by client :type client_port: int :return: None """ EtcdInstaller._logger.debug('Creating cluster "{0}" on {1}'.format(cluster_name, ip)) client = SSHClient(ip, username='******') target_name = 'ovs-etcd-{0}'.format(cluster_name) if ServiceManager.has_service(target_name, client) and ServiceManager.get_service_status(target_name, client) is True: EtcdInstaller._logger.info('Service {0} already configured and running'.format(target_name)) return node_name = System.get_my_machine_id(client) data_dir = EtcdInstaller.DATA_DIR.format(cluster_name) wal_dir = EtcdInstaller.WAL_DIR.format(cluster_name) abs_paths = [data_dir, wal_dir] client.dir_delete(abs_paths) client.dir_create(abs_paths) client.dir_chmod(abs_paths, 0755, recursive=True) client.dir_chown(abs_paths, 'ovs', 'ovs', recursive=True) base_name = 'ovs-etcd' ServiceManager.add_service(base_name, client, params={'CLUSTER': cluster_name, 'NODE_ID': node_name, 'DATA_DIR': data_dir, 'WAL_DIR': wal_dir, 'SERVER_URL': EtcdInstaller.SERVER_URL.format(ip, server_port), 'CLIENT_URL': EtcdInstaller.CLIENT_URL.format(ip, client_port), 'LOCAL_CLIENT_URL': EtcdInstaller.CLIENT_URL.format('127.0.0.1', client_port), 'INITIAL_CLUSTER': '{0}={1}'.format(node_name, EtcdInstaller.SERVER_URL.format(ip, server_port)), 'INITIAL_STATE': 'new', 'INITIAL_PEERS': '-initial-advertise-peer-urls {0}'.format(EtcdInstaller.SERVER_URL.format(ip, server_port))}, target_name=target_name) EtcdInstaller.start(cluster_name, client) EtcdInstaller.wait_for_cluster(cluster_name, client, client_port=client_port) EtcdInstaller._logger.debug('Creating cluster "{0}" on {1} 
completed'.format(cluster_name, ip))
def create_cluster(cluster_name, ip, server_port=DEFAULT_SERVER_PORT, client_port=DEFAULT_CLIENT_PORT): """ Creates a cluster :param cluster_name: Name of the cluster :type cluster_name: str :param ip: IP address of the first node of the new cluster :type ip: str :param server_port: Port to be used by server :type server_port: int :param client_port: Port to be used by client :type client_port: int :return: None """ EtcdInstaller._logger.debug('Creating cluster "{0}" on {1}'.format(cluster_name, ip)) client = SSHClient(ip, username='******') target_name = 'ovs-etcd-{0}'.format(cluster_name) if ServiceManager.has_service(target_name, client) and ServiceManager.get_service_status(target_name, client) is True: EtcdInstaller._logger.info('Service {0} already configured and running'.format(target_name)) return node_name = System.get_my_machine_id(client) data_dir = EtcdInstaller.DATA_DIR.format(cluster_name) wal_dir = EtcdInstaller.WAL_DIR.format(cluster_name) abs_paths = [data_dir, wal_dir] client.dir_delete(abs_paths) client.dir_create(abs_paths) client.dir_chmod(abs_paths, 0755, recursive=True) client.dir_chown(abs_paths, 'ovs', 'ovs', recursive=True) base_name = 'ovs-etcd' ServiceManager.add_service(base_name, client, params={'CLUSTER': cluster_name, 'NODE_ID': node_name, 'DATA_DIR': data_dir, 'WAL_DIR': wal_dir, 'SERVER_URL': EtcdInstaller.SERVER_URL.format(ip, server_port), 'CLIENT_URL': EtcdInstaller.CLIENT_URL.format(ip, client_port), 'LOCAL_CLIENT_URL': EtcdInstaller.CLIENT_URL.format('127.0.0.1', client_port), 'INITIAL_CLUSTER': '{0}={1}'.format(node_name, EtcdInstaller.SERVER_URL.format(ip, server_port)), 'INITIAL_STATE': 'new', 'INITIAL_PEERS': '-initial-advertise-peer-urls {0}'.format(EtcdInstaller.SERVER_URL.format(ip, server_port))}, target_name=target_name) EtcdInstaller.start(cluster_name, client) EtcdInstaller.wait_for_cluster(cluster_name, client, client_port=client_port) EtcdInstaller._logger.debug('Creating cluster "{0}" on {1} 
completed'.format(cluster_name, ip))
def is_running(cluster_name, client):
    """
    Checks if arakoon service is running
    :param cluster_name: The name of the cluster service to check
    :type cluster_name: str
    :param client: Client on which to check the service
    :type client: SSHClient
    :return: None
    """
    arakoon_service = 'arakoon-{0}'.format(cluster_name)
    running = False
    if ServiceManager.has_service(arakoon_service, client=client):
        running = ServiceManager.get_service_status(arakoon_service, client=client)
    return running
def is_running(cluster_name, client):
    """
    Checks if arakoon service is running
    :param cluster_name: The name of the cluster service to check
    :type cluster_name: str
    :param client: Client on which to check the service
    :type client: SSHClient
    :return: None
    """
    service = 'arakoon-{0}'.format(cluster_name)
    if not ServiceManager.has_service(service, client=client):
        return False
    # Status call returns a sequence here; the first element is the running flag
    return ServiceManager.get_service_status(service, client=client)[0]
def _is_cinder_running(self):
    # On DevStack, grep the process table for a cinder-volume process
    if self.is_devstack:
        try:
            listing = self.client.run('ps aux | grep cinder-volume | grep -v grep')
        except SystemExit:
            return False
        return 'cinder-volume' in str(listing)
    # On OpenStack, query the cinder service through the service manager
    if self.is_openstack:
        try:
            cinder_service = OSManager.get_openstack_cinder_service_name()
        except SystemExit:
            return False
        try:
            return ServiceManager.get_service_status(cinder_service, self.client)
        except SystemExit:
            return False
    return False
def change_service_state(client, name, state, logger=None): """ Starts/stops/restarts a service :param client: SSHClient on which to connect and change service state :param name: Name of the service :param state: State to put the service in :param logger: LogHandler Object """ action = None # Enable service before changing the state status = ServiceManager.is_enabled(name, client=client) if status is False: if logger is not None: logger.debug(' {0:<15} - Enabling service {1}'.format( client.ip, name)) ServiceManager.enable_service(name, client=client) status = ServiceManager.get_service_status(name, client=client) if status is False and state in ['start', 'restart']: if logger is not None: logger.debug(' {0:<15} - Starting service {1}'.format( client.ip, name)) ServiceManager.start_service(name, client=client) action = 'started' elif status is True and state == 'stop': if logger is not None: logger.debug(' {0:<15} - Stopping service {1}'.format( client.ip, name)) ServiceManager.stop_service(name, client=client) action = 'stopped' elif status is True and state == 'restart': if logger is not None: logger.debug(' {0:<15} - Restarting service {1}'.format( client.ip, name)) ServiceManager.restart_service(name, client=client) action = 'restarted' if action is None: print ' [{0}] {1} already {2}'.format( client.ip, name, 'running' if status is True else 'halted') else: logger.debug(' {0:<15} - Service {1} {2}'.format( client.ip, name, action)) print ' [{0}] {1} {2}'.format(client.ip, name, action)
def _setup_proxy(initial_cluster, slave_client, cluster_name, force=False, client_port=DEFAULT_CLIENT_PORT): base_name = 'ovs-etcd-proxy' target_name = 'ovs-etcd-{0}'.format(cluster_name) if force is False and ServiceManager.has_service( target_name, slave_client) and ServiceManager.get_service_status( target_name, slave_client)[0] is True: EtcdInstaller._logger.info( 'Service {0} already configured and running'.format( target_name)) return EtcdInstaller.stop(cluster_name, slave_client) data_dir = EtcdInstaller.DATA_DIR.format(cluster_name) wal_dir = EtcdInstaller.WAL_DIR.format(cluster_name) abs_paths = [data_dir, wal_dir] slave_client.dir_delete(abs_paths) slave_client.dir_create(data_dir) slave_client.dir_chmod(data_dir, 0755, recursive=True) slave_client.dir_chown(data_dir, 'ovs', 'ovs', recursive=True) ServiceManager.add_service(base_name, slave_client, params={ 'CLUSTER': cluster_name, 'DATA_DIR': data_dir, 'LOCAL_CLIENT_URL': EtcdInstaller.CLIENT_URL.format( '127.0.0.1', client_port), 'INITIAL_CLUSTER': initial_cluster }, target_name=target_name) EtcdInstaller.start(cluster_name, slave_client) EtcdInstaller.wait_for_cluster(cluster_name, slave_client, client_port=client_port)
def validate_vpool_sanity(expected_settings):
    """
    Check if all requirements are met for a healthy vPool
    :param expected_settings: Parameters used to create a vPool, which will be verified
    :type expected_settings: dict
    :return: None
    """
    if not isinstance(expected_settings, dict) or len(expected_settings) == 0:
        raise ValueError("Cannot validate vpool when no settings are passed")

    # All entries share the generic settings; any entry will do as the reference
    generic_settings = expected_settings.values()[0]
    vpool_name = generic_settings["vpool_name"]
    mountpoint = "/mnt/{0}".format(vpool_name)
    backend_type = generic_settings["type"]
    rdma_enabled = (
        generic_settings["config_params"]["dtl_transport"] == StorageDriverClient.FRAMEWORK_DTL_TRANSPORT_RSOCKET
    )

    vpool = GeneralVPool.get_vpool_by_name(vpool_name=vpool_name)
    assert vpool is not None, "Could not find vPool with name {0}".format(vpool_name)
    vpool_config = GeneralVPool.get_configuration(vpool)

    # Verify some basic vPool attributes
    assert vpool.name == vpool_name, "Expected name {0} for vPool".format(vpool_name)
    assert vpool.status == VPool.STATUSES.RUNNING, "vPool does not have RUNNING status"
    assert vpool.rdma_enabled == rdma_enabled, "RDMA enabled setting is incorrect"
    assert set(expected_settings.keys()) == set(
        [sd.storagerouter for sd in vpool.storagedrivers]
    ), "vPool storagerouters don't match the expected Storage Routers"

    # Verify vPool Storage Driver configuration: every actual key must be expected
    # (a few derived keys are skipped), and no expected key may remain unmatched
    expected_vpool_config = copy.deepcopy(generic_settings["config_params"])
    for key, value in vpool_config.iteritems():
        if key == "dtl_enabled" or key == "tlog_multiplier" or key == "dtl_config_mode":
            continue
        if key not in expected_vpool_config:
            raise ValueError("Expected settings does not contain key {0}".format(key))
        if value != expected_vpool_config[key]:
            raise ValueError(
                "vPool does not have expected configuration {0} for key {1}".format(expected_vpool_config[key], key)
            )
        expected_vpool_config.pop(key)
    if len(expected_vpool_config) > 0:
        raise ValueError(
            "Actual vPool configuration does not contain keys: {0}".format(", ".join(expected_vpool_config.keys()))
        )

    # Prepare some fields to check
    config = generic_settings["config_params"]
    dtl_mode = config["dtl_mode"]
    sco_size = config["sco_size"]
    cluster_size = config["cluster_size"]
    write_buffer = config["write_buffer"]
    dtl_transport = config["dtl_transport"]
    # @TODO: Add more validations for other expected settings (instead of None)
    expected_config = {
        "backend_connection_manager": {
            "backend_interface_retries_on_error": 5,
            "backend_interface_retry_interval_secs": 1,
            "backend_interface_retry_backoff_multiplier": 2.0,
        },
        "content_addressed_cache": {
            "clustercache_mount_points": None,
            "read_cache_serialization_path": u"/var/rsp/{0}".format(vpool.name),
        },
        "distributed_lock_store": {
            "dls_arakoon_cluster_id": None,
            "dls_arakoon_cluster_nodes": None,
            "dls_type": u"Arakoon",
        },
        "distributed_transaction_log": {"dtl_path": None, "dtl_transport": dtl_transport.upper()},
        "event_publisher": {"events_amqp_routing_key": u"volumerouter", "events_amqp_uris": None},
        "file_driver": {"fd_cache_path": None, "fd_extent_cache_capacity": u"1024", "fd_namespace": None},
        "filesystem": {
            "fs_dtl_config_mode": u"Automatic",
            "fs_dtl_mode": u"{0}".format(StorageDriverClient.VPOOL_DTL_MODE_MAP[dtl_mode]),
            "fs_enable_shm_interface": 1,
            "fs_file_event_rules": None,
            "fs_metadata_backend_arakoon_cluster_nodes": None,
            "fs_metadata_backend_mds_nodes": None,
            "fs_metadata_backend_type": u"MDS",
            "fs_raw_disk_suffix": None,
            "fs_virtual_disk_format": None,
        },
        "metadata_server": {"mds_nodes": None},
        "scocache": {"backoff_gap": u"2GB", "scocache_mount_points": None, "trigger_gap": u"1GB"},
        "threadpool_component": {"num_threads": 16},
        "volume_manager": {
            "clean_interval": 1,
            "default_cluster_size": 1024 * cluster_size,
            "dtl_throttle_usecs": 4000,
            "metadata_path": None,
            "non_disposable_scos_factor": float(write_buffer)
            / StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size]
            / sco_size,
            "number_of_scos_in_tlog": StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size],
            "tlog_path": None,
        },
        "volume_registry": {"vregistry_arakoon_cluster_id": u"voldrv", "vregistry_arakoon_cluster_nodes": None},
        "volume_router": {
            "vrouter_backend_sync_timeout_ms": 5000,
            "vrouter_file_read_threshold": 1024,
            "vrouter_file_write_threshold": 1024,
            "vrouter_id": None,
            "vrouter_max_workers": 16,
            "vrouter_migrate_timeout_ms": 5000,
            "vrouter_min_workers": 4,
            "vrouter_redirect_timeout_ms": u"5000",
            "vrouter_routing_retries": 10,
            "vrouter_sco_multiplier": 1024,
            "vrouter_volume_read_threshold": 1024,
            "vrouter_volume_write_threshold": 1024,
        },
        "volume_router_cluster": {"vrouter_cluster_id": None},
    }
    vpool_services = {
        "all": [
            "ovs-watcher-volumedriver",
            "ovs-dtl_{0}".format(vpool.name),
            "ovs-volumedriver_{0}".format(vpool.name),
            "ovs-volumerouter-consumer",
        ],
        "extra": [],
        "master": ["ovs-arakoon-voldrv"],
    }
    # Expected partition sub-roles; entries are removed as they are found below
    sd_partitions = {"DB": ["MD", "MDS", "TLOG"], "WRITE": ["FD", "DTL", "SCO"]}

    assert Configuration.exists("/ovs/arakoon/voldrv/config", raw=True), "Volumedriver arakoon does not exist"

    # Do some verifications for all SDs
    storage_ip = None
    voldrv_config = GeneralArakoon.get_config("voldrv")
    all_files = GeneralVPool.get_related_files(vpool=vpool)
    all_directories = GeneralVPool.get_related_directories(vpool=vpool)
    for storagedriver in vpool.storagedrivers:
        storagerouter = storagedriver.storagerouter
        root_client = SSHClient(storagerouter, username="******")
        assert Configuration.exists(
            "/ovs/vpools/{0}/hosts/{1}/config".format(vpool.guid, storagedriver.storagedriver_id), raw=True
        ), "vPool config not found in configuration"
        # @todo: replace next lines with implementation defined in: http://jira.openvstorage.com/browse/OVS-4577
        # current_config_sections = set([item for item in Configuration.list('/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, storagedriver.storagedriver_id))])
        # assert not current_config_sections.difference(set(expected_config.keys())), 'New section appeared in the storage driver config in configuration'
        # assert not set(expected_config.keys()).difference(current_config_sections), 'Config section expected for storage driver, but not found in configuration'
        #
        # for key, values in expected_config.iteritems():
        #     current_config = Configuration.get('/ovs/vpools/{0}/hosts/{1}/config/{2}'.format(vpool.guid, storagedriver.storagedriver_id, key))
        #     assert set(current_config.keys()).union(set(values.keys())) == set(values.keys()), 'Not all expected keys match for key "{0}" on Storage Driver {1}'.format(key, storagedriver.name)
        #
        #     for sub_key, value in current_config.iteritems():
        #         expected_value = values[sub_key]
        #         if expected_value is None:
        #             continue
        #         assert value == expected_value, 'Key: {0} - Sub key: {1} - Value: {2} - Expected value: {3}'.format(key, sub_key, value, expected_value)

        # Check services
        # NOTE(review): get_service_status returns a (status, output) tuple here;
        # the first element is named 'exitcode' but is compared as a boolean
        if storagerouter.node_type == "MASTER":
            for service_name in vpool_services["all"] + vpool_services["master"]:
                if (
                    service_name == "ovs-arakoon-voldrv"
                    and GeneralStorageDriver.has_role(storagedriver, "DB") is False
                ):
                    continue
                exitcode, output = ServiceManager.get_service_status(name=service_name, client=root_client)
                if exitcode is not True:
                    raise ValueError(
                        "Service {0} is not running on node {1} - {2}".format(
                            service_name, storagerouter.ip, output
                        )
                    )
        else:
            for service_name in vpool_services["all"] + vpool_services["extra"]:
                exitcode, output = ServiceManager.get_service_status(name=service_name, client=root_client)
                if exitcode is not True:
                    raise ValueError(
                        "Service {0} is not running on node {1} - {2}".format(
                            service_name, storagerouter.ip, output
                        )
                    )

        # Check arakoon config
        if not voldrv_config.has_section(storagerouter.machine_id):
            raise ValueError("Voldrv arakoon cluster does not have section {0}".format(storagerouter.machine_id))

        # Basic SD checks
        assert (
            storagedriver.cluster_ip == storagerouter.ip
        ), "Incorrect cluster IP. Expected: {0} - Actual: {1}".format(storagerouter.ip, storagedriver.cluster_ip)
        assert storagedriver.mountpoint == "/mnt/{0}".format(
            vpool.name
        ), "Incorrect mountpoint. Expected: {0} - Actual: {1}".format(mountpoint, storagedriver.mountpoint)
        # All storagedrivers of a vPool must share the same storage IP
        if storage_ip is not None:
            assert (
                storagedriver.storage_ip == storage_ip
            ), "Incorrect storage IP. Expected: {0} - Actual: {1}".format(storage_ip, storagedriver.storage_ip)
        storage_ip = storagedriver.storage_ip

        # Check required directories and files
        if storagerouter.guid not in all_directories:
            raise ValueError("Could not find directory information for Storage Router {0}".format(storagerouter.ip))
        if storagerouter.guid not in all_files:
            raise ValueError("Could not find file information for Storage Router {0}".format(storagerouter.ip))

        for directory in all_directories[storagerouter.guid]:
            if root_client.dir_exists(directory) is False:
                raise ValueError(
                    "Directory {0} does not exist on Storage Router {1}".format(directory, storagerouter.ip)
                )
        for file_name in all_files[storagerouter.guid]:
            if root_client.file_exists(file_name) is False:
                raise ValueError(
                    "File {0} does not exist on Storage Router {1}".format(file_name, storagerouter.ip)
                )

        # @TODO: check roles and sub_roles for all storagedrivers and not just once
        for partition in storagedriver.partitions:
            if partition.role in sd_partitions and partition.sub_role in sd_partitions[partition.role]:
                sd_partitions[partition.role].remove(partition.sub_role)
            elif (
                partition.role in sd_partitions
                and partition.sub_role is None
                and len(sd_partitions[partition.role])
            ):
                sd_partitions[partition.role].remove("None")

        # Verify vPool writeable
        if GeneralHypervisor.get_hypervisor_type() == "VMWARE":
            GeneralVPool.mount_vpool(vpool=vpool, root_client=root_client)

        vdisk = GeneralVDisk.create_volume(size=10, vpool=vpool, root_client=root_client)
        GeneralVDisk.write_to_volume(
            vdisk=vdisk, vpool=vpool, root_client=root_client, count=10, bs="1M", input_type="random"
        )
        GeneralVDisk.delete_volume(vdisk=vdisk, vpool=vpool, root_client=root_client)

    # Any sub-role still listed here was never seen on any storagedriver
    for role, sub_roles in sd_partitions.iteritems():
        for sub_role in sub_roles:
            raise ValueError(
                "Not a single Storage Driver found with partition role {0} and sub-role {1}".format(role, sub_role)
            )
def start(cluster_name, client):
    """
    Starts an arakoon cluster
    :param cluster_name: Name of the arakoon cluster whose service should be started
    :param client: SSHClient on which to start the service
    """
    # Mirror the guard used by `stop`: only act when the service actually exists on
    # this node, otherwise get_service_status/start_service would act on a service
    # that was never configured here.
    if ServiceManager.has_service("arakoon-{0}".format(cluster_name), client=client) is True and \
            ServiceManager.get_service_status("arakoon-{0}".format(cluster_name), client=client) is False:
        ServiceManager.start_service("arakoon-{0}".format(cluster_name), client=client)
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled
    :type error_messages: list
    :return: a list of error messages
    :rtype: list
    """

    def _verify_mds_config(current_vdisk):
        # Re-read the vDisk's dynamic 'info' property and return its MDS backend config.
        # Raises when the volumedriver reports no MDS configuration at all.
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60  # Max seconds to wait for the albaproxy file mutex
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy
    # A dedicated ALBA proxy per (vPool, StorageRouter) is (re)used for scrubbing; the
    # file mutex serializes deployment so concurrent scrub jobs don't race on the service.
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and \
                    ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                # Proxy already deployed and running from a previous run: reuse its config
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Scrub config
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}

                # Backend config
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point the backend config for the scrubber at the freshly deployed local proxy
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        # Deployment failed: record the error and best-effort clean up the partially
        # deployed proxy service and its configuration key.
        # NOTE(review): execution falls through to the scrub loop below even after a
        # failed deploy — presumably the per-vDisk failures are then collected there; confirm.
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                vdisk_guid = queue.get(False)  # Non-blocking get: raises Empty when the queue is drained
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            # Handover did not make the MDS master local: skip this vDisk
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_directory,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    # Per-vDisk failure: log, collect the message and keep draining the queue
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)
    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
def gather_scrub_work():
    """
    Retrieve and execute scrub work
    Divides all scrub-eligible vDisks over the StorageRouters that have a SCRUB
    partition and reachable workers; remote shares run as celery tasks, the local
    share runs in-process. Work from failed nodes is rescheduled once.
    :return: None
    :raises RuntimeError: when no scrub locations exist or not every vDisk was processed
    """
    ScheduledTaskController._logger.info('Gather Scrub - Started')

    # Collect one SCRUB-partition path per reachable StorageRouter with running workers
    scrub_locations = {}
    for storage_driver in StorageDriverList.get_storagedrivers():
        for partition in storage_driver.partitions:
            if DiskPartition.ROLES.SCRUB == partition.role:
                ScheduledTaskController._logger.info('Gather Scrub - Storage Router {0:<15} has SCRUB partition at {1}'.format(storage_driver.storagerouter.ip, partition.path))
                if storage_driver.storagerouter not in scrub_locations:
                    try:
                        sshclient = SSHClient(storage_driver.storagerouter)
                        # Use ServiceManager(sshclient) to make sure ovs-workers are actually running
                        if ServiceManager.get_service_status('workers', sshclient) is False:
                            ScheduledTaskController._logger.warning('Gather Scrub - Storage Router {0:<15} - workers are not running'.format(storage_driver.storagerouter.ip))
                        else:
                            scrub_locations[storage_driver.storagerouter] = str(partition.path)
                    except UnableToConnectException:
                        ScheduledTaskController._logger.warning('Gather Scrub - Storage Router {0:<15} is not reachable'.format(storage_driver.storagerouter.ip))

    if len(scrub_locations) == 0:
        raise RuntimeError('No scrub locations found')

    # Only 'BASE' vDisks (no templates/clones intermediates) are scrub candidates
    vdisk_guids = set()
    for vmachine in VMachineList.get_customer_vmachines():
        for vdisk in vmachine.vdisks:
            if vdisk.info['object_type'] == 'BASE':
                vdisk_guids.add(vdisk.guid)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] == 'BASE':
            vdisk_guids.add(vdisk.guid)

    if len(vdisk_guids) == 0:
        # NOTE: the .format() argument here is unused by the message template
        ScheduledTaskController._logger.info('Gather Scrub - No scrub work needed'.format(len(vdisk_guids)))
        return

    ScheduledTaskController._logger.info('Gather Scrub - Checking {0} volumes for scrub work'.format(len(vdisk_guids)))
    local_machineid = System.get_my_machine_id()
    local_storage_router = None
    local_scrub_location = None
    local_vdisks_to_scrub = []
    result_set = {}
    storage_router_list = []
    scrub_map = {}  # Remote node IP -> guids assigned there, kept for rescheduling on failure

    # Evenly slice the guid set over the scrub locations (Python 2 integer division
    # keeps the slice bounds integral)
    for index, scrub_info in enumerate(scrub_locations.items()):
        start_index = index * len(vdisk_guids) / len(scrub_locations)
        end_index = (index + 1) * len(vdisk_guids) / len(scrub_locations)
        storage_router = scrub_info[0]
        vdisk_guids_to_scrub = list(vdisk_guids)[start_index:end_index]
        local = storage_router.machine_id == local_machineid
        ScheduledTaskController._logger.info('Gather Scrub - Storage Router {0:<15} ({1}) - Scrubbing {2} virtual disks'.format(storage_router.ip, 'local' if local is True else 'remote', len(vdisk_guids_to_scrub)))

        if local is True:
            local_storage_router = storage_router
            local_scrub_location = scrub_info[1]
            local_vdisks_to_scrub = vdisk_guids_to_scrub
        else:
            result_set[storage_router.ip] = ScheduledTaskController._execute_scrub_work.s(scrub_location=scrub_info[1], vdisk_guids=vdisk_guids_to_scrub).apply_async(routing_key='sr.{0}'.format(storage_router.machine_id))
            storage_router_list.append(storage_router)
            scrub_map[storage_router.ip] = vdisk_guids_to_scrub

    # Remote tasks have been launched, now start the local task and then wait for remote tasks to finish
    processed_guids = []
    if local_scrub_location is not None and len(local_vdisks_to_scrub) > 0:
        try:
            processed_guids = ScheduledTaskController._execute_scrub_work(scrub_location=local_scrub_location, vdisk_guids=local_vdisks_to_scrub)
        except Exception as ex:
            ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(local_storage_router.ip, ex))

    all_results, failed_nodes = CeleryToolbox.manage_running_tasks(result_set, timesleep=60)  # Check every 60 seconds if tasks are still running

    for ip, result in all_results.iteritems():
        if isinstance(result, list):
            processed_guids.extend(result)
        else:
            ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(ip, result))

    # Reschedule the work of each failed node on another (non-failed, remote) node,
    # falling back to local execution when no remote candidate remains
    result_set = {}
    for failed_node in failed_nodes:
        ScheduledTaskController._logger.warning('Scrubbing failed on node {0}. Will reschedule on another node.'.format(failed_node))
        vdisk_guids_to_scrub = scrub_map[failed_node]
        rescheduled_work = False
        for storage_router, scrub_location in scrub_locations.items():
            if storage_router.ip not in failed_nodes:
                if storage_router.machine_id != local_machineid:
                    ScheduledTaskController._logger.info('Rescheduled scrub work from node {0} to node {1}.'.format(failed_node, storage_router.ip))
                    result_set[storage_router.ip] = ScheduledTaskController._execute_scrub_work.s(scrub_location=scrub_location, vdisk_guids=vdisk_guids_to_scrub).apply_async(routing_key='sr.{0}'.format(storage_router.machine_id))
                    storage_router_list.append(storage_router)
                    rescheduled_work = True
                    break
        if rescheduled_work is False:
            if local_scrub_location is not None:
                try:
                    processed_guids.extend(ScheduledTaskController._execute_scrub_work(scrub_location=local_scrub_location, vdisk_guids=vdisk_guids_to_scrub))
                except Exception as ex:
                    ScheduledTaskController._logger.error('Gather Scrub - Storage Router Local - Scrubbing failed with error:\n - {0}'.format(ex))
            else:
                ScheduledTaskController._logger.warning('No nodes left to reschedule work from node {0}'.format(failed_node))

    # Wait for the rescheduled tasks as well
    # NOTE(review): nodes that fail a second time are not rescheduled again — the
    # final completeness check below catches their unprocessed guids
    if len(result_set) > 0:
        all_results2, failed_nodes = CeleryToolbox.manage_running_tasks(result_set, timesleep=60)  # Check every 60 seconds if tasks are still running
        for ip, result in all_results2.iteritems():
            if isinstance(result, list):
                processed_guids.extend(result)
            else:
                ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(ip, result))

    if len(set(processed_guids)) != len(vdisk_guids) or set(processed_guids).difference(vdisk_guids):
        raise RuntimeError('Scrubbing failed for 1 or more storagerouters')
    ScheduledTaskController._logger.info('Gather Scrub - Finished')
def getStatusOfService(self, service_name):
    """
    Return the status of the given service, as reported by the ServiceManager,
    using this object's SSH client.
    :param service_name: Name of the service to query (coerced to str)
    """
    normalized_name = str(service_name)
    return ServiceManager.get_service_status(normalized_name, self.client)
def is_host_configured(self, ip):
    """
    Verify whether this host is fully configured for the OVS OpenStack/DevStack
    integration: driver code present, users in the ovs group, nova/cinder patches
    applied, messaging drivers configured and the events consumer service running.
    :param ip: IP address of the host, used to read config files remotely
    :return: True when every check passes, False otherwise
    """
    if (self._is_devstack is False and self._is_openstack is False) or self._cinder_installed is False or self._nova_installed is False:
        self._logger.warning('Host configured: No OpenStack nor DevStack installation detected or Cinder and Nova plugins are not installed')
        return False

    # 1. Check driver code
    if self._is_devstack is True:
        if not self.client.file_exists(filename=self._devstack_driver):
            self._logger.info(' File "{0}" does not exist'.format(self._devstack_driver))
            return False
    else:
        if not self.client.file_exists(filename='{0}/cinder/volume/drivers/openvstorage.py'.format(self._driver_location)):
            self._logger.info(' File "{0}/cinder/volume/drivers/openvstorage.py" does not exist'.format(self._driver_location))
            return False

    # 2. Check configured users
    ovs_id = self.client.run('id -u ovs')
    if not ovs_id:
        self._logger.info('Failed to determine the OVS user group ID')
        return False
    users = ['libvirt-qemu', 'stack'] if self._is_devstack is True else self._openstack_users
    for user in users:
        if '{0}(ovs)'.format(ovs_id) not in self.client.run('id -a {0}'.format(user)):
            # FIX: the message previously lacked .format(user), so the literal
            # '{0}' ended up in the log instead of the user name
            self._logger.info('User "{0}" is not part of the OVS user group'.format(user))
            return False

    # 3. Check patches
    nova_base_path = self._get_base_path('nova')
    cinder_base_path = self._get_base_path('cinder')
    if self._stack_version in ('liberty', 'mitaka', 'newton'):
        try:
            import os_brick
            cinder_brick_initiator_file = "{0}/initiator/connector.py".format(os.path.dirname(os_brick.__file__))
        except ImportError:
            cinder_brick_initiator_file = ''
        if self._is_devstack is True:
            nova_volume_file = '{0}/virt/libvirt/volume/volume.py'.format(nova_base_path)
        else:
            nova_volume_file = '{0}/nova/virt/libvirt/volume/volume.py'.format(self._driver_location)
    else:
        if self._is_devstack is True:
            nova_volume_file = '{0}/virt/libvirt/volume.py'.format(nova_base_path)
        else:
            nova_volume_file = '{0}/nova/virt/libvirt/volume.py'.format(self._driver_location)
        cinder_brick_initiator_file = '{0}/brick/initiator/connector.py'.format(cinder_base_path)
    if self._is_devstack is True:
        nova_driver_file = '{0}/virt/libvirt/driver.py'.format(nova_base_path)
    else:
        nova_driver_file = '{0}/nova/virt/libvirt/driver.py'.format(self._driver_location)

    file_contents = self.client.file_read(nova_volume_file)
    if 'class LibvirtFileVolumeDriver(LibvirtBaseVolumeDriver):' not in file_contents:
        self._logger.info('File "{0}" is not configured properly'.format(nova_volume_file))
        return False

    # The volume driver moved one package deeper starting with newton
    if self._stack_version in ('liberty', 'mitaka'):
        check_line = 'file=nova.virt.libvirt.volume.volume.LibvirtFileVolumeDriver'
    else:
        check_line = 'file=nova.virt.libvirt.volume.LibvirtFileVolumeDriver'
    file_contents = self.client.file_read(nova_driver_file)
    if check_line not in file_contents:
        self._logger.info('File "{0}" is not configured properly'.format(nova_driver_file))
        return False

    if os.path.exists(cinder_brick_initiator_file):
        file_contents = self.client.file_read(cinder_brick_initiator_file)
        if self._stack_version in ('liberty', 'mitaka', 'newton'):
            if 'elif protocol in [LOCAL, "FILE"]:' not in file_contents:
                self._logger.info('File "{0}" is not configured properly'.format(cinder_brick_initiator_file))
                return False
        else:
            if 'elif protocol in ["LOCAL", "FILE"]:' not in file_contents:
                self._logger.info('File "{0}" is not configured properly'.format(cinder_brick_initiator_file))
                return False

    # 4. Check messaging driver configuration
    nova_messaging_driver = 'nova.openstack.common.notifier.rpc_notifier' if self._stack_version == 'juno' else 'messaging'
    cinder_messaging_driver = 'cinder.openstack.common.notifier.rpc_notifier' if self._stack_version == 'juno' else 'messaging'
    host_configured = True
    with remote(ip, [RawConfigParser], 'root') as rem:
        for config_file, driver in {self._NOVA_CONF: nova_messaging_driver,
                                    self._CINDER_CONF: cinder_messaging_driver}.iteritems():
            cfg = rem.RawConfigParser()
            cfg.read([config_file])
            host_configured &= cfg.get("DEFAULT", "notification_driver") == driver
            host_configured &= "notifications" in cfg.get("DEFAULT", "notification_topics")
            if config_file == self._NOVA_CONF:
                host_configured &= cfg.get("DEFAULT", "notify_on_any_change") == "True"
                host_configured &= cfg.get("DEFAULT", "notify_on_state_change") == "vm_and_task_state"
    if host_configured is False:
        self._logger.info('Nova and/or Cinder configuration files are not configured properly')
        return host_configured

    # 5. Check events consumer service
    service_name = 'ovs-openstack-events-consumer'
    if not (ServiceManager.has_service(service_name, self.client) and ServiceManager.get_service_status(service_name, self.client) is True):
        self._logger.info('Service "{0}" is not configured properly'.format(service_name))
        return False

    return True
def validate_vpool_sanity(expected_settings):
    """
    Check if all requirements are met for a healthy vPool
    :param expected_settings: Parameters used to create a vPool, which will be verified
    :type expected_settings: dict
    :return: None
    :raises ValueError: when a validation fails
    :raises AssertionError: when a basic vPool/StorageDriver attribute check fails
    """
    if not isinstance(expected_settings, dict) or len(expected_settings) == 0:
        raise ValueError('Cannot validate vpool when no settings are passed')

    generic_settings = expected_settings.values()[0]
    vpool_name = generic_settings['vpool_name']
    mountpoint = '/mnt/{0}'.format(vpool_name)
    backend_type = generic_settings['type']
    rdma_enabled = generic_settings['config_params']['dtl_transport'] == StorageDriverClient.FRAMEWORK_DTL_TRANSPORT_RSOCKET

    vpool = GeneralVPool.get_vpool_by_name(vpool_name=vpool_name)
    assert vpool is not None, 'Could not find vPool with name {0}'.format(vpool_name)
    vpool_config = GeneralVPool.get_configuration(vpool)

    # Verify some basic vPool attributes
    assert vpool.name == vpool_name, 'Expected name {0} for vPool'.format(vpool_name)
    assert vpool.backend_type.code == backend_type, 'Expected backend type {0}'.format(backend_type)
    assert vpool.status == VPool.STATUSES.RUNNING, 'vPool does not have RUNNING status'
    assert vpool.rdma_enabled == rdma_enabled, 'RDMA enabled setting is incorrect'
    assert set(expected_settings.keys()) == set([sd.storagerouter for sd in vpool.storagedrivers]), "vPool storagerouters don't match the expected Storage Routers"

    # Verify vPool Storage Driver configuration
    expected_vpool_config = copy.deepcopy(generic_settings['config_params'])
    for key, value in vpool_config.iteritems():
        if key == 'dtl_enabled' or key == 'tlog_multiplier':
            continue
        if key not in expected_vpool_config:
            raise ValueError('Expected settings does not contain key {0}'.format(key))
        if value != expected_vpool_config[key]:
            raise ValueError('vPool does not have expected configuration {0} for key {1}'.format(expected_vpool_config[key], key))
        expected_vpool_config.pop(key)

    if len(expected_vpool_config) > 0:
        raise ValueError('Actual vPool configuration does not contain keys: {0}'.format(', '.join(expected_vpool_config.keys())))

    # Prepare some fields to check
    config = generic_settings['config_params']
    dtl_mode = config['dtl_mode']
    sco_size = config['sco_size']
    dedupe_mode = config['dedupe_mode']
    cluster_size = config['cluster_size']
    write_buffer = config['write_buffer']
    dtl_transport = config['dtl_transport']
    cache_strategy = config['cache_strategy']
    # @TODO: Add more validations for other expected settings (instead of None)
    expected_config = {'backend_connection_manager': {'backend_interface_retries_on_error': 5,
                                                      'backend_interface_retry_interval_secs': 1,
                                                      'backend_interface_retry_backoff_multiplier': 2.0},
                       'content_addressed_cache': {'clustercache_mount_points': None,
                                                   'read_cache_serialization_path': u'/var/rsp/{0}'.format(vpool.name)},
                       'distributed_lock_store': {'dls_arakoon_cluster_id': None,
                                                  'dls_arakoon_cluster_nodes': None,
                                                  'dls_type': u'Arakoon'},
                       'distributed_transaction_log': {'dtl_path': None,
                                                       'dtl_transport': dtl_transport.upper()},
                       'event_publisher': {'events_amqp_routing_key': u'volumerouter',
                                           'events_amqp_uris': None},
                       'file_driver': {'fd_cache_path': None,
                                       'fd_extent_cache_capacity': u'1024',
                                       'fd_namespace': None},
                       'filesystem': {'fs_dtl_config_mode': u'Automatic',
                                      'fs_dtl_mode': u'{0}'.format(StorageDriverClient.VPOOL_DTL_MODE_MAP[dtl_mode]),
                                      'fs_enable_shm_interface': 1,
                                      'fs_file_event_rules': None,
                                      'fs_metadata_backend_arakoon_cluster_nodes': None,
                                      'fs_metadata_backend_mds_nodes': None,
                                      'fs_metadata_backend_type': u'MDS',
                                      'fs_raw_disk_suffix': None,
                                      'fs_virtual_disk_format': None},
                       'metadata_server': {'mds_nodes': None},
                       'scocache': {'backoff_gap': u'2GB',
                                    'scocache_mount_points': None,
                                    'trigger_gap': u'1GB'},
                       'threadpool_component': {'num_threads': 16},
                       'volume_manager': {'clean_interval': 1,
                                          'default_cluster_size': 1024 * cluster_size,
                                          'dtl_throttle_usecs': 4000,
                                          'metadata_path': None,
                                          'non_disposable_scos_factor': float(write_buffer) / StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size] / sco_size,
                                          'number_of_scos_in_tlog': StorageDriverClient.TLOG_MULTIPLIER_MAP[sco_size],
                                          'read_cache_default_behaviour': StorageDriverClient.VPOOL_CACHE_MAP[cache_strategy],
                                          'read_cache_default_mode': StorageDriverClient.VPOOL_DEDUPE_MAP[dedupe_mode],
                                          'tlog_path': None},
                       'volume_registry': {'vregistry_arakoon_cluster_id': u'voldrv',
                                           'vregistry_arakoon_cluster_nodes': None},
                       'volume_router': {'vrouter_backend_sync_timeout_ms': 5000,
                                         'vrouter_file_read_threshold': 1024,
                                         'vrouter_file_write_threshold': 1024,
                                         'vrouter_id': None,
                                         'vrouter_max_workers': 16,
                                         'vrouter_migrate_timeout_ms': 5000,
                                         'vrouter_min_workers': 4,
                                         'vrouter_redirect_timeout_ms': u'5000',
                                         'vrouter_routing_retries': 10,
                                         'vrouter_sco_multiplier': 1024,
                                         'vrouter_volume_read_threshold': 1024,
                                         'vrouter_volume_write_threshold': 1024},
                       'volume_router_cluster': {'vrouter_cluster_id': None}}
    vpool_services = {'all': ['ovs-watcher-volumedriver',
                              'ovs-dtl_{0}'.format(vpool.name),
                              'ovs-volumedriver_{0}'.format(vpool.name),
                              'ovs-volumerouter-consumer'],
                      'extra': [],
                      'master': ['ovs-arakoon-voldrv']}
    # Expected partition (role, sub_role) combinations; entries are consumed as
    # matching partitions are found, leftovers raise at the end
    sd_partitions = {'DB': ['MD', 'MDS', 'TLOG'],
                     'READ': ['None'],
                     'WRITE': ['FD', 'DTL', 'SCO'],
                     'SCRUB': ['None']}

    if backend_type == 'alba':
        backend_metadata = {'name': (str, None),
                            'preset': (str, Toolbox.regex_preset),
                            'backend_guid': (str, Toolbox.regex_guid),
                            'arakoon_config': (dict, None),
                            'connection': (dict, {'host': (str, Toolbox.regex_ip, False),
                                                  'port': (int, {'min': 1, 'max': 65535}),
                                                  'client_id': (str, Toolbox.regex_guid),
                                                  'client_secret': (str, None),
                                                  'local': (bool, None)}),
                            'backend_info': (dict, {'policies': (list, None),
                                                    'sco_size': (float, None),
                                                    'frag_size': (float, None),
                                                    'total_size': (float, None),
                                                    'nsm_partition_guids': (list, Toolbox.regex_guid)})}
        required = {'backend': (dict, backend_metadata),
                    'backend_aa': (dict, backend_metadata, False)}
        Toolbox.verify_required_params(required_params=required, actual_params=vpool.metadata)
        vpool_services['all'].append("ovs-albaproxy_{0}".format(vpool.name))
        sd_partitions['WRITE'].append('FCACHE')
        expected_config['backend_connection_manager'].update({'alba_connection_host': None,
                                                              'alba_connection_port': None,
                                                              'alba_connection_preset': None,
                                                              'alba_connection_timeout': 15,
                                                              'backend_type': u'{0}'.format(vpool.backend_type.code.upper())})
    elif backend_type == 'distributed':
        expected_config['backend_connection_manager'].update({'backend_type': u'LOCAL',
                                                              'local_connection_path': u'{0}'.format(generic_settings['distributed_mountpoint'])})

    assert EtcdConfiguration.exists('/ovs/arakoon/voldrv/config', raw=True), 'Volumedriver arakoon does not exist'

    # Do some verifications for all SDs
    storage_ip = None
    voldrv_config = GeneralArakoon.get_config('voldrv')
    all_files = GeneralVPool.get_related_files(vpool=vpool)
    all_directories = GeneralVPool.get_related_directories(vpool=vpool)
    for storagedriver in vpool.storagedrivers:
        storagerouter = storagedriver.storagerouter
        root_client = SSHClient(storagerouter, username='******')

        # Compare the storage driver config in etcd against the expected sections/keys
        assert EtcdConfiguration.exists('/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, storagedriver.storagedriver_id), raw=True), 'vPool config not found in etcd'
        current_config_sections = set([item for item in EtcdConfiguration.list('/ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, storagedriver.storagedriver_id))])
        assert not current_config_sections.difference(set(expected_config.keys())), 'New section appeared in the storage driver config in etcd'
        assert not set(expected_config.keys()).difference(current_config_sections), 'Config section expected for storage driver, but not found in etcd'
        for key, values in expected_config.iteritems():
            current_config = EtcdConfiguration.get('/ovs/vpools/{0}/hosts/{1}/config/{2}'.format(vpool.guid, storagedriver.storagedriver_id, key))
            assert set(current_config.keys()).union(set(values.keys())) == set(values.keys()), 'Not all expected keys match for key "{0}" on Storage Driver {1}'.format(key, storagedriver.name)
            for sub_key, value in current_config.iteritems():
                expected_value = values[sub_key]
                if expected_value is None:  # None acts as a wildcard (see @TODO above)
                    continue
                assert value == expected_value, 'Key: {0} - Sub key: {1} - Value: {2} - Expected value: {3}'.format(key, sub_key, value, expected_value)

        # Check services
        if storagerouter.node_type == 'MASTER':
            for service_name in vpool_services['all'] + vpool_services['master']:
                if service_name == 'ovs-arakoon-voldrv' and GeneralStorageDriver.has_role(storagedriver, 'DB') is False:
                    continue
                if ServiceManager.get_service_status(name=service_name, client=root_client) is not True:
                    raise ValueError('Service {0} is not running on node {1}'.format(service_name, storagerouter.ip))
        else:
            for service_name in vpool_services['all'] + vpool_services['extra']:
                if ServiceManager.get_service_status(name=service_name, client=root_client) is not True:
                    raise ValueError('Service {0} is not running on node {1}'.format(service_name, storagerouter.ip))

        # Check arakoon config
        if not voldrv_config.has_section(storagerouter.machine_id):
            raise ValueError('Voldrv arakoon cluster does not have section {0}'.format(storagerouter.machine_id))

        # Basic SD checks
        assert storagedriver.cluster_ip == storagerouter.ip, 'Incorrect cluster IP. Expected: {0} - Actual: {1}'.format(storagerouter.ip, storagedriver.cluster_ip)
        assert storagedriver.mountpoint == '/mnt/{0}'.format(vpool.name), 'Incorrect mountpoint. Expected: {0} - Actual: {1}'.format(mountpoint, storagedriver.mountpoint)
        if storage_ip is not None:
            assert storagedriver.storage_ip == storage_ip, 'Incorrect storage IP. Expected: {0} - Actual: {1}'.format(storage_ip, storagedriver.storage_ip)
        storage_ip = storagedriver.storage_ip

        # Check required directories and files
        if storagerouter.guid not in all_directories:
            raise ValueError('Could not find directory information for Storage Router {0}'.format(storagerouter.ip))
        if storagerouter.guid not in all_files:
            raise ValueError('Could not find file information for Storage Router {0}'.format(storagerouter.ip))

        for directory in all_directories[storagerouter.guid]:
            if root_client.dir_exists(directory) is False:
                raise ValueError('Directory {0} does not exist on Storage Router {1}'.format(directory, storagerouter.ip))
        for file_name in all_files[storagerouter.guid]:
            if root_client.file_exists(file_name) is False:
                raise ValueError('File {0} does not exist on Storage Router {1}'.format(file_name, storagerouter.ip))

        for partition in storagedriver.partitions:
            if partition.role in sd_partitions and partition.sub_role in sd_partitions[partition.role]:
                sd_partitions[partition.role].remove(partition.sub_role)
            # FIX: guard the remove - a second partition with the same role and a None
            # sub-role previously crashed with "list.remove(x): x not in list" because
            # 'None' had already been consumed
            elif partition.role in sd_partitions and partition.sub_role is None and 'None' in sd_partitions[partition.role]:
                sd_partitions[partition.role].remove('None')

        # Verify vPool writeable
        if storagerouter.pmachine.hvtype == 'VMWARE':
            GeneralVPool.mount_vpool(vpool=vpool, root_client=root_client)

        vdisk = GeneralVDisk.create_volume(size=10, vpool=vpool, root_client=root_client)
        GeneralVDisk.write_to_volume(vdisk=vdisk, vpool=vpool, root_client=root_client, count=10, bs='1M', input_type='random')
        GeneralVDisk.delete_volume(vdisk=vdisk, vpool=vpool, root_client=root_client)

    # Any (role, sub_role) combination never seen on a Storage Driver is an error
    for role, sub_roles in sd_partitions.iteritems():
        for sub_role in sub_roles:
            raise ValueError('Not a single Storage Driver found with partition role {0} and sub-role {1}'.format(role, sub_role))
def dtl_checkup(vpool_guid=None, vdisk_guid=None, storagerouters_to_exclude=None):
    """
    Check DTL for all volumes
    Verifies (and when required, repairs) the Distributed Transaction Log target of
    every vDisk in scope: all vDisks, all vDisks of one vPool, or a single vDisk.
    :param vpool_guid: vPool to check the DTL configuration of all its disks
    :type vpool_guid: String
    :param vdisk_guid: Virtual Disk to check its DTL configuration
    :type vdisk_guid: String
    :param storagerouters_to_exclude: Storage Routers to exclude from possible targets
    :type storagerouters_to_exclude: List
    :return: None
    """
    # vpool_guid and vdisk_guid both select the scope of the checkup - only one may be given
    if vpool_guid is not None and vdisk_guid is not None:
        raise ValueError('vpool and vdisk are mutually exclusive')
    if storagerouters_to_exclude is None:
        storagerouters_to_exclude = []

    # Local import - presumably avoids a circular import at module load time; TODO confirm
    from ovs.lib.vpool import VPoolController

    logger.info('DTL checkup started')
    required_params = {'dtl_mode': (str, StorageDriverClient.VPOOL_DTL_MODE_MAP.keys()),
                       'dtl_enabled': (bool, None)}
    vdisk = VDisk(vdisk_guid) if vdisk_guid else None
    vpool = VPool(vpool_guid) if vpool_guid else None
    errors_found = False
    root_client_map = {}  # Cache of SSHClient per Storage Router, shared across all vDisk iterations
    vpool_dtl_config_cache = {}  # Cache of DTL configuration per vPool guid
    # Scope selection: everything / one vPool's disks / one disk
    vdisks = VDiskList.get_vdisks() if vdisk is None and vpool is None else vpool.vdisks if vpool is not None else [vdisk]
    for vdisk in vdisks:
        logger.info(' Verifying vDisk {0} with guid {1}'.format(vdisk.name, vdisk.guid))
        vdisk.invalidate_dynamics(['storagedriver_client', 'storagerouter_guid'])
        # No storagedriver client means the volume is not reachable right now - skip it
        if vdisk.storagedriver_client is None:
            continue

        vpool = vdisk.vpool
        if vpool.guid not in vpool_dtl_config_cache:
            vpool_config = VPoolController.get_configuration(vpool.guid)  # Config on vPool is permanent for DTL settings
            vpool_dtl_config_cache[vpool.guid] = vpool_config
            Toolbox.verify_required_params(required_params, vpool_config)

        volume_id = str(vdisk.volume_id)
        vpool_config = vpool_dtl_config_cache[vpool.guid]
        dtl_vpool_enabled = vpool_config['dtl_enabled']
        try:
            current_dtl_config = vdisk.storagedriver_client.get_dtl_config(volume_id)
            current_dtl_config_mode = vdisk.storagedriver_client.get_dtl_config_mode(volume_id)
        except RuntimeError as rte:
            # Can occur when a volume has not been stolen yet from a dead node
            logger.error('Retrieving DTL configuration from storage driver failed with error: {0}'.format(rte))
            errors_found = True
            continue

        # DTL disabled globally (vPool level) or per disk (manual, no host) - nothing to reconfigure
        if dtl_vpool_enabled is False and (current_dtl_config is None or current_dtl_config.host == 'null'):
            logger.info(' DTL is globally disabled for vPool {0} with guid {1}'.format(vpool.name, vpool.guid))
            vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
            continue
        elif current_dtl_config_mode == DTLConfigMode.MANUAL and (current_dtl_config is None or current_dtl_config.host == 'null'):
            logger.info(' DTL is disabled for virtual disk {0} with guid {1}'.format(vdisk.name, vdisk.guid))
            continue

        storage_router = StorageRouter(vdisk.storagerouter_guid)
        available_storagerouters = []
        # 1. Check available storage routers in the backup failure domain
        #    A router qualifies when it serves this vPool, is reachable over SSH and its DTL service is running
        if storage_router.secondary_failure_domain is not None:
            for storagerouter in storage_router.secondary_failure_domain.primary_storagerouters:
                if vpool.guid not in storagerouter.vpools_guids:
                    continue
                if storagerouter not in root_client_map:
                    try:
                        root_client = SSHClient(storagerouter, username='******')
                    except UnableToConnectException:
                        logger.warning(' Storage Router with IP {0} of vDisk {1} is not reachable'.format(storagerouter.ip, vdisk.name))
                        continue
                    root_client_map[storagerouter] = root_client
                else:
                    root_client = root_client_map[storagerouter]
                if ServiceManager.get_service_status('dtl_{0}'.format(vpool.name), client=root_client) is True:
                    available_storagerouters.append(storagerouter)
        # 2. Check available storage routers in the same failure domain as current storage router
        #    Fallback only when the backup failure domain yielded no candidates; the disk's own router is excluded
        if len(available_storagerouters) == 0:
            for storagerouter in storage_router.primary_failure_domain.primary_storagerouters:
                if vpool.guid not in storagerouter.vpools_guids or storagerouter == storage_router:
                    continue
                if storagerouter not in root_client_map:
                    try:
                        root_client = SSHClient(storagerouter, username='******')
                    except UnableToConnectException:
                        logger.warning(' Storage Router with IP {0} of vDisk {1} is not reachable'.format(storagerouter.ip, vdisk.name))
                        continue
                    root_client_map[storagerouter] = root_client
                else:
                    root_client = root_client_map[storagerouter]
                if ServiceManager.get_service_status('dtl_{0}'.format(vpool.name), client=root_client) is True:
                    available_storagerouters.append(storagerouter)

        # Remove storage routers to exclude
        for sr_guid in storagerouters_to_exclude:
            sr_to_exclude = StorageRouter(sr_guid)
            if sr_to_exclude in available_storagerouters:
                available_storagerouters.remove(sr_to_exclude)

        if len(available_storagerouters) == 0:
            logger.info(' No Storage Routers could be found as valid DTL target')
            vdisk.storagedriver_client.set_manual_dtl_config(volume_id, None)
            continue

        # Check whether reconfiguration is required
        reconfigure_required = False
        if current_dtl_config is None:
            logger.info(' No DTL configuration found, but there are Storage Routers available')
            reconfigure_required = True
        elif current_dtl_config_mode == DTLConfigMode.AUTOMATIC:
            logger.info(' DTL configuration set to AUTOMATIC, switching to manual')
            reconfigure_required = True
        else:
            dtl_host = current_dtl_config.host
            dtl_port = current_dtl_config.port
            storage_drivers = [sd for sd in vpool.storagedrivers if sd.storagerouter.ip == dtl_host]
            logger.info(' DTL host: {0}'.format(dtl_host or '-'))
            logger.info(' DTL port: {0}'.format(dtl_port or '-'))
            if dtl_host not in [sr.ip for sr in available_storagerouters]:
                logger.info(' Host not in available Storage Routers')
                reconfigure_required = True
            # NOTE(review): storage_drivers can be empty when dtl_host no longer matches any
            # storagedriver of this vPool; this branch would then raise IndexError - confirm
            elif dtl_port != storage_drivers[0].ports[2]:
                logger.info(' Configured port does not match expected port ({0} vs {1})'.format(dtl_port, storage_drivers[0].ports[2]))
                reconfigure_required = True

        # Perform the reconfiguration
        if reconfigure_required is True:
            logger.info(' Reconfigure required')
            # Pick a random target among the valid candidates to spread DTL load
            index = random.randint(0, len(available_storagerouters) - 1)
            dtl_target = available_storagerouters[index]
            storage_drivers = [sd for sd in vpool.storagedrivers if sd.storagerouter == dtl_target]
            if len(storage_drivers) == 0:
                raise ValueError('Could not retrieve related storagedriver')

            port = storage_drivers[0].ports[2]  # ports[2] is presumably the DTL port - TODO confirm
            vpool_dtl_mode = vpool_config.get('dtl_mode', StorageDriverClient.FRAMEWORK_DTL_ASYNC)
            logger.info(' DTL config that will be set --> Host: {0}, Port: {1}, Mode: {2}'.format(dtl_target.ip, port, vpool_dtl_mode))
            dtl_config = DTLConfig(str(dtl_target.ip), port, StorageDriverClient.VDISK_DTL_MODE_MAP[vpool_dtl_mode])
            vdisk.storagedriver_client.set_manual_dtl_config(volume_id, dtl_config)
    if errors_found is True:
        logger.error('DTL checkup ended with errors')
        raise Exception('DTL checkup failed with errors. Please check /var/log/ovs/lib.log for more information')
    logger.info('DTL checkup ended')
def gather_scrub_work():
    """
    Retrieve and execute scrub work
    Scrub work is divided evenly across every Storage Router that exposes a
    partition with the SCRUB role, is reachable and has its ovs-workers running.
    Remote shares are dispatched as celery tasks; the local share (if any) runs
    in-process. Work of failed nodes is rescheduled once on another node, or
    executed locally as a last resort.
    :raises RuntimeError: when no scrub location is found or when scrubbing did
                          not complete for all virtual disks
    :return: None
    """
    ScheduledTaskController._logger.info('Gather Scrub - Started')

    # Collect one SCRUB-role partition path per usable Storage Router
    scrub_locations = {}
    for storage_driver in StorageDriverList.get_storagedrivers():
        for partition in storage_driver.partitions:
            if DiskPartition.ROLES.SCRUB == partition.role:
                ScheduledTaskController._logger.info('Gather Scrub - Storage Router {0:<15} has SCRUB partition at {1}'.format(storage_driver.storagerouter.ip, partition.path))
                if storage_driver.storagerouter not in scrub_locations:
                    try:
                        sshclient = SSHClient(storage_driver.storagerouter)
                        # Use ServiceManager(sshclient) to make sure ovs-workers are actually running
                        if ServiceManager.get_service_status('workers', sshclient) is False:
                            ScheduledTaskController._logger.warning('Gather Scrub - Storage Router {0:<15} - workers are not running'.format(storage_driver.storagerouter.ip))
                        else:
                            scrub_locations[storage_driver.storagerouter] = str(partition.path)
                    except UnableToConnectException:
                        ScheduledTaskController._logger.warning('Gather Scrub - Storage Router {0:<15} is not reachable'.format(storage_driver.storagerouter.ip))

    if len(scrub_locations) == 0:
        raise RuntimeError('No scrub locations found')

    # Only 'BASE' volumes require scrubbing (as reported by vdisk.info)
    vdisk_guids = set()
    for vmachine in VMachineList.get_customer_vmachines():
        for vdisk in vmachine.vdisks:
            if vdisk.info['object_type'] == 'BASE':
                vdisk_guids.add(vdisk.guid)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] == 'BASE':
            vdisk_guids.add(vdisk.guid)

    if len(vdisk_guids) == 0:
        # Fixed: message has no placeholders, so the stray .format(...) call was removed
        ScheduledTaskController._logger.info('Gather Scrub - No scrub work needed')
        return

    ScheduledTaskController._logger.info('Gather Scrub - Checking {0} volumes for scrub work'.format(len(vdisk_guids)))
    local_machineid = System.get_my_machine_id()
    local_storage_router = None
    local_scrub_location = None
    local_vdisks_to_scrub = []
    result_set = {}   # ip -> async celery result of remote scrub tasks
    scrub_map = {}    # ip -> guids assigned to that remote node (for rescheduling on failure)

    # Divide the vDisk guids evenly across all scrub locations
    for index, scrub_info in enumerate(scrub_locations.items()):
        # '//' keeps integer slicing semantics identical on Python 2 and correct on Python 3
        start_index = index * len(vdisk_guids) // len(scrub_locations)
        end_index = (index + 1) * len(vdisk_guids) // len(scrub_locations)
        storage_router = scrub_info[0]
        vdisk_guids_to_scrub = list(vdisk_guids)[start_index:end_index]
        local = storage_router.machine_id == local_machineid
        ScheduledTaskController._logger.info('Gather Scrub - Storage Router {0:<15} ({1}) - Scrubbing {2} virtual disks'.format(storage_router.ip, 'local' if local is True else 'remote', len(vdisk_guids_to_scrub)))

        if local is True:
            local_storage_router = storage_router
            local_scrub_location = scrub_info[1]
            local_vdisks_to_scrub = vdisk_guids_to_scrub
        else:
            result_set[storage_router.ip] = ScheduledTaskController._execute_scrub_work.s(scrub_location=scrub_info[1], vdisk_guids=vdisk_guids_to_scrub).apply_async(routing_key='sr.{0}'.format(storage_router.machine_id))
            scrub_map[storage_router.ip] = vdisk_guids_to_scrub

    # Remote tasks have been launched, now start the local task and then wait for remote tasks to finish
    processed_guids = []
    if local_scrub_location is not None and len(local_vdisks_to_scrub) > 0:
        try:
            processed_guids = ScheduledTaskController._execute_scrub_work(scrub_location=local_scrub_location, vdisk_guids=local_vdisks_to_scrub)
        except Exception as ex:
            ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(local_storage_router.ip, ex))

    all_results, failed_nodes = CeleryToolbox.manage_running_tasks(result_set, timesleep=60)  # Check every 60 seconds if tasks are still running

    for ip, result in all_results.iteritems():
        if isinstance(result, list):
            processed_guids.extend(result)
        else:
            ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(ip, result))

    # Reschedule the work of each failed node once: preferably on another healthy
    # remote node, otherwise on the local scrub location
    result_set = {}
    for failed_node in failed_nodes:
        ScheduledTaskController._logger.warning('Scrubbing failed on node {0}. Will reschedule on another node.'.format(failed_node))
        vdisk_guids_to_scrub = scrub_map[failed_node]
        rescheduled_work = False
        for storage_router, scrub_location in scrub_locations.items():
            if storage_router.ip not in failed_nodes:
                if storage_router.machine_id != local_machineid:
                    ScheduledTaskController._logger.info('Rescheduled scrub work from node {0} to node {1}.'.format(failed_node, storage_router.ip))
                    result_set[storage_router.ip] = ScheduledTaskController._execute_scrub_work.s(scrub_location=scrub_location, vdisk_guids=vdisk_guids_to_scrub).apply_async(routing_key='sr.{0}'.format(storage_router.machine_id))
                    rescheduled_work = True
                    break
        if rescheduled_work is False:
            if local_scrub_location is not None:
                try:
                    processed_guids.extend(ScheduledTaskController._execute_scrub_work(scrub_location=local_scrub_location, vdisk_guids=vdisk_guids_to_scrub))
                except Exception as ex:
                    ScheduledTaskController._logger.error('Gather Scrub - Storage Router Local - Scrubbing failed with error:\n - {0}'.format(ex))
            else:
                ScheduledTaskController._logger.warning('No nodes left to reschedule work from node {0}'.format(failed_node))

    # Wait for the rescheduled remote tasks (failures here are final - no second reschedule)
    if len(result_set) > 0:
        all_results2, failed_nodes = CeleryToolbox.manage_running_tasks(result_set, timesleep=60)  # Check every 60 seconds if tasks are still running
        for ip, result in all_results2.iteritems():
            if isinstance(result, list):
                processed_guids.extend(result)
            else:
                ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(ip, result))

    # Every requested guid must be reported back exactly by the workers
    if len(set(processed_guids)) != len(vdisk_guids) or set(processed_guids).difference(vdisk_guids):
        raise RuntimeError('Scrubbing failed for 1 or more storagerouters')
    ScheduledTaskController._logger.info('Gather Scrub - Finished')
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    Deploys (or re-uses) a scrub-dedicated ALBA proxy on the target Storage Router,
    drains the queue of vDisk guids (scrubbing each one), and removes the proxy again.
    Failures are collected in error_messages rather than raised.
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information:
                       `scrub_path` with the path where to scrub
                       `storage_router` with the StorageRouter that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (shared across parallel workers)
    :type error_messages: list
    :return: None (errors are appended to the error_messages parameter)
    """
    def _verify_mds_config(current_vdisk):
        # Re-read the MDS backend configuration for this vDisk; entry 0 is the master
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60  # Max seconds to wait for the proxy deploy/remove file mutex
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy
    # The file mutex serializes proxy deployment/removal across concurrent scrub workers on this host
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                # A running scrub proxy from a previous/parallel run can be shared
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Scrub config
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}

                # Backend config
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point the scrub backend config at the local proxy that was just deployed/re-used
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        # Best-effort cleanup of a half-deployed proxy
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)
        # NOTE(review): execution continues into the scrub loop even when proxy deployment
        # failed; the per-vDisk scrub calls will then fail individually - confirm intended

    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                vdisk_guid = queue.get(False)  # Non-blocking: raises Empty when drained
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_directory,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    # Per-vDisk failures are recorded and the loop continues with the next guid
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)
    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
def is_host_configured(self, ip):
    """
    Verify whether this host's OpenStack/DevStack installation is fully configured
    for Open vStorage: driver file present, users in the OVS group, nova/cinder
    patches applied, messaging drivers configured and the events-consumer running.
    :param ip: IP of the host to inspect remotely for nova/cinder config files
    :return: True when every check passes, False otherwise
    """
    if (self._is_devstack is False and self._is_openstack is False) or self._cinder_installed is False or self._nova_installed is False:
        self._logger.warning('Host configured: No OpenStack nor DevStack installation detected or Cinder and Nova plugins are not installed')
        return False

    # 1. Check driver code
    if self._is_devstack is True:
        if not self.client.file_exists(filename=self._devstack_driver):
            self._logger.info(' File "{0}" does not exist'.format(self._devstack_driver))
            return False
    else:
        if not self.client.file_exists(filename='{0}/cinder/volume/drivers/openvstorage.py'.format(self._driver_location)):
            self._logger.info(' File "{0}/cinder/volume/drivers/openvstorage.py" does not exist'.format(self._driver_location))
            return False

    # 2. Check configured users
    ovs_id = self.client.run('id -u ovs')
    if not ovs_id:
        self._logger.info('Failed to determine the OVS user group ID')
        return False

    users = ['libvirt-qemu', 'stack'] if self._is_devstack is True else self._openstack_users
    for user in users:
        if '{0}(ovs)'.format(ovs_id) not in self.client.run('id -a {0}'.format(user)):
            # Fixed: the .format(user) call was missing, the placeholder was logged literally
            self._logger.info('User "{0}" is not part of the OVS user group'.format(user))
            return False

    # 3. Check patches
    nova_base_path = self._get_base_path('nova')
    cinder_base_path = self._get_base_path('cinder')

    # From Liberty onwards the brick initiator lives in the separate os_brick package
    # and the nova volume driver moved into the volume/ sub-package
    if self._stack_version in ('liberty', 'mitaka', 'newton'):
        try:
            import os_brick
            cinder_brick_initiator_file = "{0}/initiator/connector.py".format(os.path.dirname(os_brick.__file__))
        except ImportError:
            cinder_brick_initiator_file = ''
        if self._is_devstack is True:
            nova_volume_file = '{0}/virt/libvirt/volume/volume.py'.format(nova_base_path)
        else:
            nova_volume_file = '{0}/nova/virt/libvirt/volume/volume.py'.format(self._driver_location)
    else:
        if self._is_devstack is True:
            nova_volume_file = '{0}/virt/libvirt/volume.py'.format(nova_base_path)
        else:
            nova_volume_file = '{0}/nova/virt/libvirt/volume.py'.format(self._driver_location)
        cinder_brick_initiator_file = '{0}/brick/initiator/connector.py'.format(cinder_base_path)

    if self._is_devstack is True:
        nova_driver_file = '{0}/virt/libvirt/driver.py'.format(nova_base_path)
    else:
        nova_driver_file = '{0}/nova/virt/libvirt/driver.py'.format(self._driver_location)

    file_contents = self.client.file_read(nova_volume_file)
    if 'class LibvirtFileVolumeDriver(LibvirtBaseVolumeDriver):' not in file_contents:
        self._logger.info('File "{0}" is not configured properly'.format(nova_volume_file))
        return False

    # The registration line's module path differs per OpenStack release
    if self._stack_version in ('liberty', 'mitaka'):
        check_line = 'file=nova.virt.libvirt.volume.volume.LibvirtFileVolumeDriver'
    else:
        check_line = 'file=nova.virt.libvirt.volume.LibvirtFileVolumeDriver'
    file_contents = self.client.file_read(nova_driver_file)
    if check_line not in file_contents:
        self._logger.info('File "{0}" is not configured properly'.format(nova_driver_file))
        return False

    if os.path.exists(cinder_brick_initiator_file):
        file_contents = self.client.file_read(cinder_brick_initiator_file)
        if self._stack_version in ('liberty', 'mitaka', 'newton'):
            if 'elif protocol in [LOCAL, "FILE"]:' not in file_contents:
                self._logger.info('File "{0}" is not configured properly'.format(cinder_brick_initiator_file))
                return False
        else:
            if 'elif protocol in ["LOCAL", "FILE"]:' not in file_contents:
                self._logger.info('File "{0}" is not configured properly'.format(cinder_brick_initiator_file))
                return False

    # 4. Check messaging driver configuration
    nova_messaging_driver = 'nova.openstack.common.notifier.rpc_notifier' if self._stack_version == 'juno' else 'messaging'
    cinder_messaging_driver = 'cinder.openstack.common.notifier.rpc_notifier' if self._stack_version == 'juno' else 'messaging'
    host_configured = True
    with remote(ip, [RawConfigParser], 'root') as rem:
        for config_file, driver in {self._NOVA_CONF: nova_messaging_driver,
                                    self._CINDER_CONF: cinder_messaging_driver}.iteritems():
            cfg = rem.RawConfigParser()
            cfg.read([config_file])
            host_configured &= cfg.get("DEFAULT", "notification_driver") == driver
            host_configured &= "notifications" in cfg.get("DEFAULT", "notification_topics")
            if config_file == self._NOVA_CONF:
                host_configured &= cfg.get("DEFAULT", "notify_on_any_change") == "True"
                host_configured &= cfg.get("DEFAULT", "notify_on_state_change") == "vm_and_task_state"
    if host_configured is False:
        self._logger.info('Nova and/or Cinder configuration files are not configured properly')
        return host_configured

    # 5. Check events consumer service
    service_name = 'ovs-openstack-events-consumer'
    if not (ServiceManager.has_service(service_name, self.client) and ServiceManager.get_service_status(service_name, self.client) is True):
        self._logger.info('Service "{0}" is not configured properly'.format(service_name))
        return False

    return True