Example #1
    def extend_cluster(master_ip, new_ip, cluster_name, base_dir, locked=True, filesystem=False, ports=None):
        """
        Extends a cluster to a given new node
        :param master_ip: IP of one of the already existing nodes
        :type master_ip: str
        :param new_ip: IP address of the node to be added
        :type new_ip: str
        :param cluster_name: Name of the cluster to be extended
        :type cluster_name: str
        :param base_dir: Base directory that will hold the db and tlogs
        :type base_dir: str
        :param locked: Indicates whether the extend should run in a locked context (e.g. to prevent port conflicts)
        :type locked: bool
        :param filesystem: Indicates whether the configuration should be on the filesystem or in a configuration cluster
        :type filesystem: bool
        :param ports: A list of ports to be used for this cluster's node
        :type ports: list
        :return: Ports used by arakoon cluster
        :rtype: dict
        """
        ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2}'.format(cluster_name, master_ip, new_ip))
        base_dir = base_dir.rstrip('/')

        config = ArakoonClusterConfig(cluster_name, filesystem)
        config.load_config(master_ip)

        client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
        node_name = System.get_my_machine_id(client)

        home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
        tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
        ArakoonInstaller.clean_leftover_arakoon_data(new_ip, [home_dir, tlog_dir])

        port_mutex = None
        try:
            if locked is True:
                from ovs.extensions.generic.volatilemutex import volatile_mutex
                port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(new_ip))
                port_mutex.acquire(wait=60)
            if ports is None:
                ports = ArakoonInstaller._get_free_ports(client)
            if node_name not in [node.name for node in config.nodes]:
                config.nodes.append(ArakoonNodeConfig(name=node_name,
                                                      ip=new_ip,
                                                      client_port=ports[0],
                                                      messaging_port=ports[1],
                                                      log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                                      crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                                      home=home_dir,
                                                      tlog_dir=tlog_dir))
            ArakoonInstaller._deploy(config, filesystem=filesystem)
        finally:
            if port_mutex is not None:
                port_mutex.release()

        ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2} completed'.format(cluster_name, master_ip, new_ip))
        return {'client_port': ports[0],
                'messaging_port': ports[1],
                'ips': [node.ip for node in config.nodes]}
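
A minimal usage sketch for the `extend_cluster` variant above, assuming the OpenvStorage `ovs` package is importable; the import path, IPs, cluster name and base directory are illustrative assumptions, not values taken from the example.

    # Hypothetical invocation of the extend_cluster shown above; the import path and all
    # argument values are illustrative assumptions.
    from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller

    ports_info = ArakoonInstaller.extend_cluster(master_ip='10.100.1.1',   # an already existing node
                                                 new_ip='10.100.1.2',      # node being added
                                                 cluster_name='ovsdb',
                                                 base_dir='/mnt/ssd1',     # will hold the db and tlogs
                                                 locked=True,              # serialize port allocation on the new node
                                                 filesystem=False,         # keep the config in the configuration cluster
                                                 ports=None)               # let the installer pick two free ports
    client_port = ports_info['client_port']
    messaging_port = ports_info['messaging_port']
    member_ips = ports_info['ips']  # IPs of all nodes now part of the cluster config
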
Example #2
    def extend_cluster(master_ip, new_ip, cluster_name, base_dir, locked=True):
        """
        Extends a cluster to a given new node
        :param master_ip: IP of one of the already existing nodes
        :type master_ip: str

        :param new_ip: IP address of the node to be added
        :type new_ip: str

        :param cluster_name: Name of the cluster to be extended
        :type cluster_name: str

        :param base_dir: Base directory that will hold the db and tlogs
        :type base_dir: str

        :param locked: Indicates whether the extend should run in a locked context (e.g. to prevent port conflicts)
        :type locked: bool

        :return: Ports used by arakoon cluster
        :rtype: dict
        """
        ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2}'.format(cluster_name, master_ip, new_ip))
        base_dir = base_dir.rstrip('/')

        config = ArakoonClusterConfig(cluster_name)
        config.load_config()

        client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
        node_name = System.get_my_machine_id(client)

        home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
        tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
        ArakoonInstaller.clean_leftover_arakoon_data(new_ip, [home_dir, tlog_dir])

        port_mutex = None
        try:
            if locked is True:
                from ovs.extensions.generic.volatilemutex import volatile_mutex
                port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(new_ip))
                port_mutex.acquire(wait=60)
            ports = ArakoonInstaller._get_free_ports(client)
            if node_name not in [node.name for node in config.nodes]:
                config.nodes.append(ArakoonNodeConfig(name=node_name,
                                                      ip=new_ip,
                                                      client_port=ports[0],
                                                      messaging_port=ports[1],
                                                      log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                                      crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                                      home=home_dir,
                                                      tlog_dir=tlog_dir))
            ArakoonInstaller._deploy(config)
        finally:
            if port_mutex is not None:
                port_mutex.release()

        ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2} completed'.format(cluster_name, master_ip, new_ip))
        return {'client_port': ports[0],
                'messaging_port': ports[1]}
Example #3
    def create_cluster(cluster_name, cluster_type, ip, base_dir, plugins=None, locked=True, internal=True, claim=False):
        """
        Always creates a cluster but marks its usage according to the internal flag
        :param cluster_name: Name of the cluster
        :type cluster_name: str

        :param cluster_type: Type of the cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
        :type cluster_type: str

        :param ip: IP address of the first node of the new cluster
        :type ip: str

        :param base_dir: Base directory that should contain the data and tlogs
        :type base_dir: str

        :param plugins: Plugins that should be added to the configuration file
        :type plugins: list

        :param locked: Indicates whether the create should run in a locked context (e.g. to prevent port conflicts)
        :type locked: bool

        :param internal: Is cluster internally managed by OVS
        :type internal: bool

        :param claim: Claim the cluster right away
        :type claim: bool

        :return: Ports used by arakoon cluster
        :rtype: dict
        """
        if cluster_type not in ServiceType.ARAKOON_CLUSTER_TYPES:
            raise ValueError('Cluster type {0} is not supported. Please choose from {1}'.format(cluster_type, ', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)))

        if EtcdConfiguration.dir_exists('/ovs/arakoon/{0}'.format(cluster_name)):
            raise ValueError('An Arakoon cluster with name "{0}" already exists'.format(cluster_name))

        ArakoonInstaller._logger.debug('Creating cluster {0} on {1}'.format(cluster_name, ip))
        base_dir = base_dir.rstrip('/')

        client = SSHClient(ip, username=ArakoonInstaller.SSHCLIENT_USER)
        if ArakoonInstaller.is_running(cluster_name, client):
            ArakoonInstaller._logger.info('Arakoon service running for cluster {0}'.format(cluster_name))
            config = ArakoonClusterConfig(cluster_name, plugins)
            config.load_config()
            for node in config.nodes:
                if node.ip == ip:
                    return {'client_port': node.client_port,
                            'messaging_port': node.messaging_port}

        node_name = System.get_my_machine_id(client)

        home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
        tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
        ArakoonInstaller.clean_leftover_arakoon_data(ip, [home_dir, tlog_dir])

        port_mutex = None
        try:
            if locked is True:
                from ovs.extensions.generic.volatilemutex import volatile_mutex
                port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(ip))
                port_mutex.acquire(wait=60)
            ports = ArakoonInstaller._get_free_ports(client)
            config = ArakoonClusterConfig(cluster_name, plugins)
            config.nodes.append(ArakoonNodeConfig(name=node_name,
                                                  ip=ip,
                                                  client_port=ports[0],
                                                  messaging_port=ports[1],
                                                  log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                                  crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                                  home=home_dir,
                                                  tlog_dir=tlog_dir))
            ArakoonInstaller._deploy(config)

            metadata = ArakoonClusterMetadata(cluster_id=cluster_name)
            metadata.internal = internal
            metadata.cluster_type = cluster_type.upper()
            metadata.write()
            if claim is True:
                metadata.claim()
        finally:
            if port_mutex is not None:
                port_mutex.release()

        ArakoonInstaller._logger.debug('Creating cluster {0} on {1} completed'.format(cluster_name, ip))
        return {'metadata': metadata,
                'client_port': ports[0],
                'messaging_port': ports[1]}
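
A sketch of how this `create_cluster` variant might be invoked, assuming the `ovs` package is available; the import paths and all argument values are assumptions for illustration. The `SD` cluster type is the same constant used elsewhere in these examples.

    # Hypothetical call to the create_cluster shown above; import paths and values are illustrative assumptions.
    from ovs.dal.hybrids.servicetype import ServiceType
    from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller

    result = ArakoonInstaller.create_cluster(cluster_name='voldrv',
                                             cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                                             ip='10.100.1.1',
                                             base_dir='/mnt/ssd1',
                                             plugins=None,   # optional list of plugins for the config file
                                             locked=True,
                                             internal=True,  # cluster is managed by OVS itself
                                             claim=False)    # metadata can be claimed later via result['metadata'].claim()
    # result holds 'metadata', 'client_port' and 'messaging_port'
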
Example #4
    def _voldrv_arakoon_checkup(create_cluster):
        """
        Makes sure the volumedriver ('voldrv') Arakoon cluster exists and is extended to every
        master StorageRouter with a DB role
        :param create_cluster: Create a new cluster if none is registered yet
        :type create_cluster: bool
        """
        def _add_service(service_storagerouter, arakoon_ports, service_name):
            """ Add a service to the storage router """
            new_service = Service()
            new_service.name = service_name
            new_service.type = service_type
            new_service.ports = arakoon_ports
            new_service.storagerouter = service_storagerouter
            new_service.save()
            return new_service

        current_ips = []
        current_services = []
        service_type = ServiceTypeList.get_by_name(
            ServiceType.SERVICE_TYPES.ARAKOON)
        cluster_name = Configuration.get(
            '/ovs/framework/arakoon_clusters').get('voldrv')
        if cluster_name is not None:
            arakoon_service_name = ArakoonInstaller.get_service_name_for_cluster(
                cluster_name=cluster_name)
            for service in service_type.services:
                if service.name == arakoon_service_name:
                    current_services.append(service)
                    if service.is_internal is True:
                        current_ips.append(service.storagerouter.ip)

        all_sr_ips = [
            storagerouter.ip
            for storagerouter in StorageRouterList.get_slaves()
        ]
        available_storagerouters = {}
        for storagerouter in StorageRouterList.get_masters():
            storagerouter.invalidate_dynamics(['partition_config'])
            if len(storagerouter.partition_config[DiskPartition.ROLES.DB]) > 0:
                available_storagerouters[storagerouter] = DiskPartition(
                    storagerouter.partition_config[DiskPartition.ROLES.DB][0])
            all_sr_ips.append(storagerouter.ip)

        if create_cluster is True and len(current_services) == 0:  # Create new cluster
            metadata = ArakoonInstaller.get_unused_arakoon_metadata_and_claim(
                cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD)
            if metadata is None:  # No externally managed cluster found, we create 1 ourselves
                if not available_storagerouters:
                    raise RuntimeError(
                        'Could not find any Storage Router with a DB role')

                storagerouter, partition = available_storagerouters.items()[0]
                arakoon_voldrv_cluster = 'voldrv'
                arakoon_installer = ArakoonInstaller(
                    cluster_name=arakoon_voldrv_cluster)
                arakoon_installer.create_cluster(
                    cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                    ip=storagerouter.ip,
                    base_dir=partition.folder,
                    log_sinks=LogHandler.get_sink_path(
                        'arakoon-server_{0}'.format(arakoon_voldrv_cluster)),
                    crash_log_sinks=LogHandler.get_sink_path(
                        'arakoon-server-crash_{0}'.format(
                            arakoon_voldrv_cluster)))
                arakoon_installer.start_cluster()
                ports = arakoon_installer.ports[storagerouter.ip]
                metadata = arakoon_installer.metadata
                current_ips.append(storagerouter.ip)
            else:
                ports = []
                storagerouter = None

            cluster_name = metadata['cluster_name']
            Configuration.set('/ovs/framework/arakoon_clusters|voldrv',
                              cluster_name)
            StorageDriverController._logger.info(
                'Claiming {0} managed arakoon cluster: {1}'.format(
                    'externally' if storagerouter is None else 'internally',
                    cluster_name))
            StorageDriverController._configure_arakoon_to_volumedriver(
                cluster_name=cluster_name)
            current_services.append(
                _add_service(
                    service_storagerouter=storagerouter,
                    arakoon_ports=ports,
                    service_name=ArakoonInstaller.get_service_name_for_cluster(
                        cluster_name=cluster_name)))

        cluster_name = Configuration.get(
            '/ovs/framework/arakoon_clusters').get('voldrv')
        if cluster_name is None:
            return
        metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=cluster_name)
        if 0 < len(current_services) < len(available_storagerouters) and metadata['internal'] is True:
            for storagerouter, partition in available_storagerouters.iteritems():
                if storagerouter.ip in current_ips:
                    continue
                arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
                arakoon_installer.load()
                arakoon_installer.extend_cluster(
                    new_ip=storagerouter.ip,
                    base_dir=partition.folder,
                    log_sinks=LogHandler.get_sink_path(
                        'arakoon-server_{0}'.format(cluster_name)),
                    crash_log_sinks=LogHandler.get_sink_path(
                        'arakoon-server-crash_{0}'.format(cluster_name)))
                _add_service(
                    service_storagerouter=storagerouter,
                    arakoon_ports=arakoon_installer.ports[storagerouter.ip],
                    service_name=ArakoonInstaller.get_service_name_for_cluster(
                        cluster_name=cluster_name))
                current_ips.append(storagerouter.ip)
                arakoon_installer.restart_cluster_after_extending(
                    new_ip=storagerouter.ip)
            StorageDriverController._configure_arakoon_to_volumedriver(
                cluster_name=cluster_name)
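
A minimal sketch of how the checkup above might be triggered; only the method name and the `create_cluster` keyword come from the code, the import path is an assumption.

    # Hypothetical trigger of the checkup above; the import path is an assumption.
    from ovs.lib.storagedriver import StorageDriverController

    # Create the 'voldrv' Arakoon cluster if none is registered yet, then extend it
    # to every master StorageRouter exposing a DB role partition.
    StorageDriverController._voldrv_arakoon_checkup(create_cluster=True)

    # Without create_cluster the checkup only extends an already registered cluster.
    StorageDriverController._voldrv_arakoon_checkup(create_cluster=False)
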
Example #5
    def execute_scrub_work(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should all belong to a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter
                           that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled
        :type error_messages: list
        :return: None
        :rtype: NoneType
        """

        def _verify_mds_config(current_vdisk):
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
                if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                    port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                    # Scrub config
                    # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                    #  u'fragment_cache': [u'none'],
                    #  u'ips': [u'127.0.0.1'],
                    #  u'log_level': u'info',
                    #  u'manifest_cache_size': 17179869184,
                    #  u'port': 0,
                    #  u'transport': u'tcp'}

                    # Backend config
                    # {u'alba_connection_host': u'10.100.193.155',
                    #  u'alba_connection_port': 26204,
                    #  u'alba_connection_preset': u'preset',
                    #  u'alba_connection_timeout': 15,
                    #  u'alba_connection_transport': u'TCP',
                    #  u'backend_interface_retries_on_error': 5,
                    #  u'backend_interface_retry_backoff_multiplier': 2.0,
                    #  u'backend_interface_retry_interval_secs': 1,
                    #  u'backend_type': u'ALBA'}
                    scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                    params = {'VPOOL_NAME': vpool.name,
                              'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                              'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                    ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                    ServiceManager.start_service(name=alba_proxy_service, client=client)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

                backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
                Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
            if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
                if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                    ServiceManager.stop_service(name=alba_proxy_service, client=client)
                ServiceManager.remove_service(name=alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)

        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
                    vdisk_guid = queue.get(False)
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                            MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get('ip') != storagedriver.storagerouter.ip:
                                ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                                continue

                        # Do the actual scrubbing
                        with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits()
                            for work_unit in work_units:
                                res = locked_client.scrub(work_unit=work_unit,
                                                          scratch_dir=scrub_directory,
                                                          log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                          backend_config=Configuration.get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                            if work_units:
                                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                            else:
                                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                    except Exception:
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        ScheduledTaskController._logger.exception(message)

        except Empty:  # Raised when all items have been fetched from the queue
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)

        # Delete the proxy again
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if ServiceManager.has_service(alba_proxy_service, client=client):
                    ServiceManager.stop_service(alba_proxy_service, client=client)
                    ServiceManager.remove_service(alba_proxy_service, client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
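
A sketch of how the scrub worker above might be driven; only the two documented `scrub_info` keys (`scrub_path`, `storage_router`) come from the docstring, while the import paths, the DAL lookup helpers and all concrete values are assumptions for illustration.

    # Hypothetical driver for execute_scrub_work; import paths and values are illustrative assumptions.
    from Queue import Queue  # Python 2, matching the code above
    from ovs.dal.lists.vpoollist import VPoolList
    from ovs.lib.scheduledtask import ScheduledTaskController

    vpool = VPoolList.get_vpool_by_name('vpool01')           # assumed vPool name and lookup helper
    storagerouter = vpool.storagedrivers[0].storagerouter    # StorageRouter that should do the work

    vdisk_queue = Queue()
    for vdisk in vpool.vdisks:                               # vDisks of this single vPool only
        vdisk_queue.put(vdisk.guid)

    scrub_info = {'scrub_path': '/mnt/hdd1/scrub',           # where to scrub
                  'storage_router': storagerouter}
    error_messages = []
    ScheduledTaskController.execute_scrub_work(queue=vdisk_queue,
                                               vpool=vpool,
                                               scrub_info=scrub_info,
                                               error_messages=error_messages)
    if error_messages:
        raise Exception('Errors occurred while scrubbing:\n - {0}'.format('\n - '.join(error_messages)))
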
Example #6
    def _deploy_stack_and_scrub(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should all belong to a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information:
                           `scrub_path` with the path where to scrub
                           `storage_router` with the StorageRouter that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled (by reference)
        :type error_messages: list
        :return: None
        :rtype: NoneType
        """
        if len(vpool.storagedrivers) == 0 or not vpool.storagedrivers[0].storagedriver_id:
            error_messages.append('vPool {0} does not have any valid StorageDrivers configured'.format(vpool.name))
            return

        service_manager = ServiceFactory.get_manager()
        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        partition_guid = scrub_info['partition_guid']
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_{2}_scrub'.format(
            vpool.name, storagerouter.name, partition_guid)
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(
            scrub_info['scrub_path'], vpool.name, partition_guid)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(
            vpool.guid, partition_guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, partition_guid)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
                if service_manager.has_service(name=alba_proxy_service, client=client) is True \
                        and service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                    GenericController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get(
                        '/ovs/framework/hosts/{0}/ports|storagedriver'.format(
                            machine_id))
                    with volatile_mutex('deploy_proxy_for_scrub_{0}'.format(
                            storagerouter.guid),
                                        wait=30):
                        port = System.get_free_ports(selected_range=port_range,
                                                     nr=1,
                                                     client=client)[0]
                    scrub_config = Configuration.get(
                        'ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(
                            vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key,
                                      json.dumps(scrub_config, indent=4),
                                      raw=True)

                    params = {
                        'VPOOL_NAME':
                        vpool.name,
                        'LOG_SINK':
                        LogHandler.get_sink_path(alba_proxy_service),
                        'CONFIG_PATH':
                        Configuration.get_configuration_path(scrub_config_key)
                    }
                    service_manager.add_service(name='ovs-albaproxy',
                                                params=params,
                                                client=client,
                                                target_name=alba_proxy_service)
                    service_manager.start_service(name=alba_proxy_service,
                                                  client=client)
                    GenericController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))

                backend_config = Configuration.get(
                    'ovs/vpools/{0}/hosts/{1}/config'.format(
                        vpool.guid, vpool.storagedrivers[0].storagedriver_id
                    ))['backend_connection_manager']
                if backend_config.get('backend_type') != 'MULTI':
                    backend_config['alba_connection_host'] = '127.0.0.1'
                    backend_config['alba_connection_port'] = scrub_config['port']
                else:
                    for value in backend_config.itervalues():
                        if isinstance(value, dict):
                            value['alba_connection_host'] = '127.0.0.1'
                            value['alba_connection_port'] = scrub_config['port']
                # Copy backend connection manager information in separate key
                Configuration.set(
                    backend_config_key,
                    json.dumps({"backend_connection_manager": backend_config},
                               indent=4),
                    raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            GenericController._logger.exception(message)
            if client is not None and service_manager.has_service(
                    name=alba_proxy_service, client=client) is True:
                if service_manager.get_service_status(
                        name=alba_proxy_service, client=client) == 'active':
                    service_manager.stop_service(name=alba_proxy_service,
                                                 client=client)
                service_manager.remove_service(name=alba_proxy_service,
                                               client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)

        # Execute the actual scrubbing
        threads = []
        threads_key = '/ovs/framework/hosts/{0}/config|scrub_stack_threads'.format(
            storagerouter.machine_id)
        amount_threads = Configuration.get(
            key=threads_key) if Configuration.exists(key=threads_key) else 2
        if not isinstance(amount_threads, int):
            error_messages.append(
                'Amount of threads to spawn must be an integer for StorageRouter with ID {0}'
                .format(storagerouter.machine_id))
            return

        amount_threads = max(amount_threads, 1)  # Make sure amount_threads is at least 1
        amount_threads = min(min(queue.qsize(), amount_threads), 20)  # Make sure amount_threads is at most 20
        GenericController._logger.info(
            'Scrubber - vPool {0} - StorageRouter {1} - Spawning {2} threads for proxy service {3}'
            .format(vpool.name, storagerouter.name, amount_threads,
                    alba_proxy_service))
        for index in range(amount_threads):
            thread = Thread(name='execute_scrub_{0}_{1}_{2}'.format(
                vpool.guid, partition_guid, index),
                            target=GenericController._execute_scrub,
                            args=(queue, vpool, scrub_info, scrub_directory,
                                  error_messages))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        # Delete the proxy again
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if service_manager.has_service(alba_proxy_service,
                                               client=client):
                    service_manager.stop_service(alba_proxy_service,
                                                 client=client)
                    service_manager.remove_service(alba_proxy_service,
                                                   client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                GenericController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            GenericController._logger.exception(message)
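
The thread-count clamping in the example above is easy to miss; below is a tiny worked illustration of just that arithmetic, with hypothetical numbers.

    # Worked illustration of the clamping applied above (numbers are hypothetical).
    amount_threads = 6   # e.g. value read from .../config|scrub_stack_threads
    queue_size = 3       # vDisk guids still waiting in the queue

    amount_threads = max(amount_threads, 1)                    # at least 1 thread
    amount_threads = min(min(queue_size, amount_threads), 20)  # never more threads than work items, capped at 20
    assert amount_threads == 3
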
Example #7
    def _execute_scrub(queue, vpool, scrub_info, scrub_dir, error_messages):
        """
        Drains the vDisk queue and applies the actual scrub work for each vDisk of the given vPool
        """
        def _verify_mds_config(current_vdisk):
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        storagerouter = scrub_info['storage_router']
        partition_guid = scrub_info['partition_guid']
        volatile_client = VolatileFactory.get_client()
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, partition_guid)
        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
                    vdisk_guid = queue.get(False)  # Raises Empty Exception when queue is empty, so breaking the while True loop
                    volatile_key = 'ovs_scrubbing_vdisk_{0}'.format(vdisk_guid)
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        GenericController._logger.info(
                            'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'
                            .format(vpool.name, storagerouter.name, vdisk.name,
                                    scrub_dir))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            GenericController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get('ip') != storagedriver.storagerouter.ip:
                                GenericController._logger.warning(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                                continue

                        # Check if vDisk is already being scrubbed
                        if volatile_client.add(key=volatile_key,
                                               value=volatile_key,
                                               time=24 * 60 * 60) is False:
                            GenericController._logger.warning(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because vDisk is already being scrubbed'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            continue

                        # Do the actual scrubbing
                        with vdisk.storagedriver_client.make_locked_client(
                                str(vdisk.volume_id)) as locked_client:
                            GenericController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits()
                            for work_unit in work_units:
                                res = locked_client.scrub(
                                    work_unit=work_unit,
                                    scratch_dir=scrub_dir,
                                    log_sinks=[
                                        LogHandler.get_sink_path(
                                            'scrubber_{0}'.format(vpool.name),
                                            allow_override=True,
                                            forced_target_type='file')
                                    ],
                                    backend_config=Configuration.get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(
                                    scrubbing_work_result=res)
                            if work_units:
                                GenericController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name, len(work_units)))
                            else:
                                GenericController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                    except Exception:
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(
                                vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(
                                vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        GenericController._logger.exception(message)
                    finally:
                        # Remove vDisk from volatile memory
                        volatile_client.delete(volatile_key)

        except Empty:  # Raised when all items have been fetched from the queue
            GenericController._logger.info(
                'Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'
                .format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(
                vpool.name, storagerouter.name)
            error_messages.append(message)
            GenericController._logger.exception(message)
Example #8
    def test_node_config_checkup(self):
        """
        Validates correct working of cluster registry checkup
        """
        base_structure = {'1': {'vrouter_id': '1',
                                'message_host': '10.0.1.1',
                                'message_port': 1,
                                'xmlrpc_host': '10.0.0.1',
                                'xmlrpc_port': 2,
                                'failovercache_host': '10.0.1.1',
                                'failovercache_port': 3,
                                'network_server_uri': 'tcp://10.0.1.1:4',
                                'node_distance_map': None},
                          '2': {'vrouter_id': '2',
                                'message_host': '10.0.1.2',
                                'message_port': 1,
                                'xmlrpc_host': '10.0.0.2',
                                'xmlrpc_port': 2,
                                'failovercache_host': '10.0.1.2',
                                'failovercache_port': 3,
                                'network_server_uri': 'tcp://10.0.1.2:4',
                                'node_distance_map': None}}

        def _validate_node_config(_config, _expected_map):
            expected = copy.deepcopy(base_structure[_config.vrouter_id])
            expected['node_distance_map'] = _expected_map[_config.vrouter_id]
            self.assertDictEqual(expected, {'vrouter_id': _config.vrouter_id,
                                            'message_host': _config.message_host,
                                            'message_port': _config.message_port,
                                            'xmlrpc_host': _config.xmlrpc_host,
                                            'xmlrpc_port': _config.xmlrpc_port,
                                            'failovercache_host': _config.failovercache_host,
                                            'failovercache_port': _config.failovercache_port,
                                            'network_server_uri': _config.network_server_uri,
                                            'node_distance_map': _config.node_distance_map})

        structure = DalHelper.build_dal_structure(
            {'vpools': [1],
             'domains': [1, 2],
             'storagerouters': [1, 2],
             'storagedrivers': [(1, 1, 1), (2, 1, 2)],  # (<id>, <vpool_id>, <storagerouter_id>)
             'storagerouter_domains': [(1, 1, 1, False), (2, 2, 1, False)]}  # (<id>, <storagerouter_id>, <domain_id>, <backup>)
        )
        storagerouters = structure['storagerouters']
        vpool = structure['vpools'][1]
        arakoon_installer = ArakoonInstaller(cluster_name='voldrv')
        arakoon_installer.create_cluster(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                                         ip=storagerouters[1].ip,
                                         base_dir='/tmp',
                                         log_sinks=LogHandler.get_sink_path('arakoon-server_voldrv'),
                                         crash_log_sinks=LogHandler.get_sink_path('arakoon-server-crash_voldrv'))

        # Initial run, it will now be configured
        StorageRouterClient.node_config_recordings = []
        result = StorageDriverController.cluster_registry_checkup()
        self.assertDictEqual(result, {vpool.guid: {'success': True,
                                                   'changes': True}})
        self.assertListEqual(sorted(StorageRouterClient.node_config_recordings), ['1', '2'])
        expected_map = {'1': {'2': StorageDriver.DISTANCES.NEAR},
                        '2': {'1': StorageDriver.DISTANCES.NEAR}}
        configs = vpool.clusterregistry_client.get_node_configs()
        for config in configs:
            _validate_node_config(config, expected_map)

        # Running it again should not change anything
        StorageRouterClient.node_config_recordings = []
        result = StorageDriverController.cluster_registry_checkup()
        self.assertDictEqual(result, {vpool.guid: {'success': True,
                                                   'changes': False}})
        self.assertListEqual(sorted(StorageRouterClient.node_config_recordings), [])
        expected_map = {'1': {'2': StorageDriver.DISTANCES.NEAR},
                        '2': {'1': StorageDriver.DISTANCES.NEAR}}
        configs = vpool.clusterregistry_client.get_node_configs()
        for config in configs:
            _validate_node_config(config, expected_map)

        # Validate some error paths
        domain = structure['domains'][2]
        junction = structure['storagerouters'][1].domains[0]
        junction.domain = domain
        junction.save()
        vpool_config_path = 'file://opt/OpenvStorage/config/framework.json?key=/ovs/vpools/{0}/hosts/1/config'.format(vpool.guid)
        StorageRouterClient.exceptions['server_revision'] = {vpool_config_path: Exception('ClusterNotReachableException')}
        StorageRouterClient.node_config_recordings = []
        result = StorageDriverController.cluster_registry_checkup()
        self.assertDictEqual(result, {vpool.guid: {'success': True,
                                                   'changes': True}})
        self.assertListEqual(sorted(StorageRouterClient.node_config_recordings), ['2'])
        expected_map = {'1': {'2': StorageDriver.DISTANCES.INFINITE},
                        '2': {'1': StorageDriver.DISTANCES.INFINITE}}
        configs = vpool.clusterregistry_client.get_node_configs()
        for config in configs:
            _validate_node_config(config, expected_map)
Example #9
    def create_cluster(cluster_name, cluster_type, ip, base_dir, plugins=None, locked=True, internal=True, filesystem=False, ports=None):
        """
        Always creates a cluster but marks its usage according to the internal flag
        :param cluster_name: Name of the cluster
        :type cluster_name: str
        :param cluster_type: Type of the cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
        :type cluster_type: str
        :param ip: IP address of the first node of the new cluster
        :type ip: str
        :param base_dir: Base directory that should contain the data and tlogs
        :type base_dir: str
        :param plugins: Plugins that should be added to the configuration file
        :type plugins: dict
        :param locked: Indicates whether the create should run in a locked context (e.g. to prevent port conflicts)
        :type locked: bool
        :param internal: Is cluster internally managed by OVS
        :type internal: bool
        :param filesystem: Indicates whether the configuration should be on the filesystem or in a configuration cluster
        :type filesystem: bool
        :param ports: A list of ports to be used for this cluster's node
        :type ports: list
        :return: Ports used by arakoon cluster
        :rtype: dict
        """
        if cluster_type not in ServiceType.ARAKOON_CLUSTER_TYPES:
            raise ValueError('Cluster type {0} is not supported. Please choose from {1}'.format(cluster_type, ', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)))

        client = SSHClient(ip, username=ArakoonInstaller.SSHCLIENT_USER)
        if filesystem is True:
            exists = client.file_exists(ArakoonClusterConfig.CONFIG_FILE.format(cluster_name))
        else:
            exists = Configuration.dir_exists('/ovs/arakoon/{0}'.format(cluster_name))
        if exists is True:
            raise ValueError('An Arakoon cluster with name "{0}" already exists'.format(cluster_name))

        ArakoonInstaller._logger.debug('Creating cluster {0} on {1}'.format(cluster_name, ip))

        node_name = System.get_my_machine_id(client)
        base_dir = base_dir.rstrip('/')
        home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
        tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
        ArakoonInstaller.clean_leftover_arakoon_data(ip, [home_dir, tlog_dir])

        port_mutex = None
        try:
            if locked is True:
                from ovs.extensions.generic.volatilemutex import volatile_mutex
                port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(ip))
                port_mutex.acquire(wait=60)
            if ports is None:
                ports = ArakoonInstaller._get_free_ports(client)
            config = ArakoonClusterConfig(cluster_name, filesystem, plugins.keys() if plugins is not None else None)
            config.nodes.append(ArakoonNodeConfig(name=node_name,
                                                  ip=ip,
                                                  client_port=ports[0],
                                                  messaging_port=ports[1],
                                                  log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                                  crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                                  home=home_dir,
                                                  tlog_dir=tlog_dir))
            metadata = {'internal': internal,
                        'cluster_name': cluster_name,
                        'cluster_type': cluster_type.upper(),
                        'in_use': False}
            service_metadata = ArakoonInstaller._deploy(config=config,
                                                        filesystem=filesystem,
                                                        plugins=plugins.values() if plugins is not None else None,
                                                        delay_service_registration=cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.CFG)[ip]
        finally:
            if port_mutex is not None:
                port_mutex.release()

        ArakoonInstaller._logger.debug('Creating cluster {0} on {1} completed'.format(cluster_name, ip))
        return {'metadata': metadata,
                'client_port': ports[0],
                'messaging_port': ports[1],
                'service_metadata': service_metadata}
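
A usage sketch for this newer `create_cluster` signature; unlike the earlier variant, `plugins` is a dict (its keys end up in the Arakoon config, its values are handed to `_deploy`) and pre-selected `ports` can be passed in. Import paths and all values are assumptions for illustration.

    # Hypothetical call to the create_cluster variant above; import paths and values are illustrative assumptions.
    from ovs.dal.hybrids.servicetype import ServiceType
    from ovs.extensions.db.arakoon.ArakoonInstaller import ArakoonInstaller

    result = ArakoonInstaller.create_cluster(cluster_name='voldrv',
                                             cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                                             ip='10.100.1.1',
                                             base_dir='/mnt/ssd1',
                                             plugins=None,          # or a dict: keys go into the config, values are passed to _deploy
                                             locked=True,
                                             internal=True,
                                             filesystem=False,      # keep the config in the configuration cluster
                                             ports=[26400, 26401])  # [client_port, messaging_port]; skips the free-port lookup
    # result holds 'metadata', 'client_port', 'messaging_port' and 'service_metadata'
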
Example #10
    def promote_node(cluster_ip, master_ip, ip_client_map, unique_id,
                     configure_memcached, configure_rabbitmq):
        """
        Promotes a given node
        """
        from ovs.dal.lists.storagerouterlist import StorageRouterList
        from ovs.dal.lists.servicetypelist import ServiceTypeList
        from ovs.dal.lists.servicelist import ServiceList
        from ovs.dal.hybrids.service import Service

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Promoting node',
                    title=True)
        service_manager = ServiceFactory.get_manager()
        if configure_memcached is True:
            if NodeTypeController._validate_local_memcache_servers(
                    ip_client_map) is False:
                raise RuntimeError(
                    'Not all memcache nodes can be reached which is required for promoting a node.'
                )

        target_client = ip_client_map[cluster_ip]
        machine_id = System.get_my_machine_id(target_client)
        node_name, _ = target_client.get_hostname()
        master_client = ip_client_map[master_ip]

        storagerouter = StorageRouterList.get_by_machine_id(unique_id)
        storagerouter.node_type = 'MASTER'
        storagerouter.save()

        external_config = Configuration.get('/ovs/framework/external_config')
        if external_config is None:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Joining Arakoon configuration cluster')
            arakoon_installer = ArakoonInstaller(cluster_name='config')
            arakoon_installer.load(ip=master_ip)
            arakoon_installer.extend_cluster(
                new_ip=cluster_ip,
                base_dir=Configuration.get('/ovs/framework/paths|ovsdb'),
                log_sinks=LogHandler.get_sink_path('arakoon-server_config'),
                crash_log_sinks=LogHandler.get_sink_path(
                    'arakoon-server-crash_config'))
            arakoon_installer.restart_cluster_after_extending(
                new_ip=cluster_ip)
            service_manager.register_service(
                node_name=machine_id,
                service_metadata=arakoon_installer.service_metadata[cluster_ip]
            )

        # Find other (arakoon) master nodes
        arakoon_cluster_name = str(
            Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
        arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=arakoon_cluster_name)
        config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
        master_node_ips = [node.ip for node in config.nodes]
        if cluster_ip in master_node_ips:
            master_node_ips.remove(cluster_ip)
        if len(master_node_ips) == 0:
            raise RuntimeError(
                'There should be at least one other master node')

        arakoon_ports = []
        if arakoon_metadata['internal'] is True:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Joining Arakoon OVS DB cluster')
            arakoon_installer = ArakoonInstaller(
                cluster_name=arakoon_cluster_name)
            arakoon_installer.load()
            arakoon_installer.extend_cluster(
                new_ip=cluster_ip,
                base_dir=Configuration.get('/ovs/framework/paths|ovsdb'),
                log_sinks=LogHandler.get_sink_path(
                    'arakoon-server_{0}'.format(arakoon_cluster_name)),
                crash_log_sinks=LogHandler.get_sink_path(
                    'arakoon-server-crash_{0}'.format(arakoon_cluster_name)))
            arakoon_installer.restart_cluster_after_extending(
                new_ip=cluster_ip)
            arakoon_ports = arakoon_installer.ports[cluster_ip]

        if configure_memcached is True:
            NodeTypeController.configure_memcached(
                client=target_client, logger=NodeTypeController._logger)
        NodeTypeController.add_services(client=target_client,
                                        node_type='master',
                                        logger=NodeTypeController._logger)

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Update configurations')
        if configure_memcached is True:
            endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
            endpoint = '{0}:11211'.format(cluster_ip)
            if endpoint not in endpoints:
                endpoints.append(endpoint)
                Configuration.set('/ovs/framework/memcache|endpoints',
                                  endpoints)
        if configure_rabbitmq is True:
            endpoints = Configuration.get(
                '/ovs/framework/messagequeue|endpoints')
            endpoint = '{0}:5672'.format(cluster_ip)
            if endpoint not in endpoints:
                endpoints.append(endpoint)
                Configuration.set('/ovs/framework/messagequeue|endpoints',
                                  endpoints)

        if arakoon_metadata['internal'] is True:
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Restarting master node services')
            PersistentFactory.store = None
            VolatileFactory.store = None

            if 'arakoon-ovsdb' not in [s.name for s in ServiceList.get_services() if s.is_internal is False or s.storagerouter.ip == cluster_ip]:
                service = Service()
                service.name = 'arakoon-ovsdb'
                service.type = ServiceTypeList.get_by_name(
                    ServiceType.SERVICE_TYPES.ARAKOON)
                service.ports = arakoon_ports
                service.storagerouter = storagerouter
                service.save()

        if configure_rabbitmq is True:
            NodeTypeController.configure_rabbitmq(
                client=target_client, logger=NodeTypeController._logger)
            # Copy rabbitmq cookie
            rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie'

            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Copying RabbitMQ cookie')
            contents = master_client.file_read(rabbitmq_cookie_file)
            master_hostname, _ = master_client.get_hostname()
            target_client.dir_create(os.path.dirname(rabbitmq_cookie_file))
            target_client.file_write(rabbitmq_cookie_file, contents)
            target_client.file_chmod(rabbitmq_cookie_file, mode=0400)
            target_client.run(['rabbitmq-server', '-detached'])
            time.sleep(5)
            target_client.run(['rabbitmqctl', 'stop_app'])
            time.sleep(5)
            target_client.run([
                'rabbitmqctl', 'join_cluster',
                'rabbit@{0}'.format(master_hostname)
            ])
            time.sleep(5)
            target_client.run(['rabbitmqctl', 'stop'])
            time.sleep(5)

            # Enable HA for the RabbitMQ queues
            Toolbox.change_service_state(target_client, 'rabbitmq-server',
                                         'start', NodeTypeController._logger)
            NodeTypeController.check_rabbitmq_and_enable_ha_mode(
                client=target_client, logger=NodeTypeController._logger)

        NodeTypeController._configure_amqp_to_volumedriver()

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Starting services')
        services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server']
        if arakoon_metadata['internal'] is True:
            services.remove('arakoon-ovsdb')
        for service in services:
            if service_manager.has_service(service, client=target_client):
                Toolbox.change_service_state(target_client, service, 'start',
                                             NodeTypeController._logger)

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(
            clients=ip_client_map, logger=NodeTypeController._logger)

        if Toolbox.run_hooks(component='nodetype',
                             sub_component='promote',
                             logger=NodeTypeController._logger,
                             cluster_ip=cluster_ip,
                             master_ip=master_ip):
            Toolbox.log(logger=NodeTypeController._logger,
                        messages='Restarting services')
            NodeTypeController.restart_framework_and_memcache_services(
                clients=ip_client_map, logger=NodeTypeController._logger)

        if NodeTypeController.avahi_installed(
                client=target_client,
                logger=NodeTypeController._logger) is True:
            NodeTypeController.configure_avahi(
                client=target_client,
                node_name=node_name,
                node_type='master',
                logger=NodeTypeController._logger)
        Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id),
                          'MASTER')
        target_client.run(
            ['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config'])
        Configuration.set(
            '/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id),
            True)

        if target_client.file_exists('/tmp/ovs_rollback'):
            target_client.file_delete('/tmp/ovs_rollback')

        Toolbox.log(logger=NodeTypeController._logger,
                    messages='Promote complete')
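
promote_node registers the new master's memcached and RabbitMQ endpoints with a read-modify-write pattern: fetch the endpoint list from Configuration, append '<ip>:<port>' only if it is missing, and write the list back. A standalone sketch of that idempotent registration, with an in-memory dict standing in for the OVS Configuration store (the register_endpoint helper is made up for illustration):

# In-memory stand-in for the Configuration key/value store used above.
_config = {'/ovs/framework/memcache|endpoints': ['10.0.0.1:11211']}

def register_endpoint(key, ip, port):
    """Append '<ip>:<port>' to the list stored under key, only if not already present."""
    endpoints = list(_config.get(key, []))
    endpoint = '{0}:{1}'.format(ip, port)
    if endpoint not in endpoints:
        endpoints.append(endpoint)
        _config[key] = endpoints
    return endpoints

if __name__ == '__main__':
    # Idempotent: registering the same endpoint twice leaves the list unchanged.
    print(register_endpoint('/ovs/framework/memcache|endpoints', '10.0.0.2', 11211))
    print(register_endpoint('/ovs/framework/memcache|endpoints', '10.0.0.2', 11211))
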
Example #11
    def execute_scrub_work(queue, vpool, scrub_info, error_messages):
        """
        Executes scrub work for a given vDisk queue and vPool, based on scrub_info
        :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be members of a single vPool)
        :type queue: Queue
        :param vpool: the vPool object of the vDisks
        :type vpool: VPool
        :param scrub_info: A dict containing scrub information: `scrub_path` with the path to scrub in and `storage_router` with the
                           StorageRouter that needs to do the work
        :type scrub_info: dict
        :param error_messages: A list of error messages to be filled
        :type error_messages: list
        :return: a list of error messages
        :rtype: list
        """
        def _verify_mds_config(current_vdisk):
            current_vdisk.invalidate_dynamics('info')
            vdisk_configs = current_vdisk.info['metadata_backend_config']
            if len(vdisk_configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            return vdisk_configs

        client = None
        lock_time = 5 * 60
        storagerouter = scrub_info['storage_router']
        scrub_directory = '{0}/scrub_work_{1}_{2}'.format(
            scrub_info['scrub_path'], vpool.name, storagerouter.name)
        scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(
            vpool.guid, storagerouter.guid)
        backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(
            vpool.guid, storagerouter.guid)
        alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(
            vpool.name, storagerouter.name)

        # Deploy a proxy
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_create(scrub_directory)
                client.dir_chmod(scrub_directory, 0777)  # Celery task executed by 'ovs' user and should be able to write in it
                if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                    ScheduledTaskController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))
                    scrub_config = Configuration.get(scrub_config_key)
                else:
                    machine_id = System.get_my_machine_id(client)
                    port_range = Configuration.get(
                        '/ovs/framework/hosts/{0}/ports|storagedriver'.format(
                            machine_id))
                    port = System.get_free_ports(selected_range=port_range,
                                                 nr=1,
                                                 client=client)[0]
                    # Scrub config
                    # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                    #  u'fragment_cache': [u'none'],
                    #  u'ips': [u'127.0.0.1'],
                    #  u'log_level': u'info',
                    #  u'manifest_cache_size': 17179869184,
                    #  u'port': 0,
                    #  u'transport': u'tcp'}

                    # Backend config
                    # {u'alba_connection_host': u'10.100.193.155',
                    #  u'alba_connection_port': 26204,
                    #  u'alba_connection_preset': u'preset',
                    #  u'alba_connection_timeout': 15,
                    #  u'alba_connection_transport': u'TCP',
                    #  u'backend_interface_retries_on_error': 5,
                    #  u'backend_interface_retry_backoff_multiplier': 2.0,
                    #  u'backend_interface_retry_interval_secs': 1,
                    #  u'backend_type': u'ALBA'}
                    scrub_config = Configuration.get(
                        'ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(
                            vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key,
                                      json.dumps(scrub_config, indent=4),
                                      raw=True)

                    params = {'VPOOL_NAME': vpool.name,
                              'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                              'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                    ServiceManager.add_service(name='ovs-albaproxy',
                                               params=params,
                                               client=client,
                                               target_name=alba_proxy_service)
                    ServiceManager.start_service(name=alba_proxy_service,
                                                 client=client)
                    ScheduledTaskController._logger.info(
                        'Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'
                        .format(vpool.name, storagerouter.name,
                                alba_proxy_service))

                backend_config = Configuration.get(
                    'ovs/vpools/{0}/hosts/{1}/config'.format(
                        vpool.guid, vpool.storagedrivers[0].storagedriver_id
                    ))['backend_connection_manager']
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
                Configuration.set(
                    backend_config_key,
                    json.dumps({"backend_connection_manager": backend_config},
                               indent=4),
                    raw=True)
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
            if client is not None and ServiceManager.has_service(
                    name=alba_proxy_service, client=client) is True:
                if ServiceManager.get_service_status(name=alba_proxy_service,
                                                     client=client) is True:
                    ServiceManager.stop_service(name=alba_proxy_service,
                                                client=client)
                ServiceManager.remove_service(name=alba_proxy_service,
                                              client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)

        try:
            # Empty the queue with vDisks to scrub
            with remote(storagerouter.ip, [VDisk]) as rem:
                while True:
                    vdisk = None
                    vdisk_guid = queue.get(False)
                    try:
                        # Check MDS master is local. Trigger MDS handover if necessary
                        vdisk = rem.VDisk(vdisk_guid)
                        ScheduledTaskController._logger.info(
                            'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'
                            .format(vpool.name, storagerouter.name, vdisk.name,
                                    scrub_directory))
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        storagedriver = StorageDriverList.get_by_storagedriver_id(
                            vdisk.storagedriver_id)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                            configs = _verify_mds_config(current_vdisk=vdisk)
                            if configs[0].get('ip') != storagedriver.storagerouter.ip:
                                ScheduledTaskController._logger.warning(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                                continue

                        # Do the actual scrubbing
                        with vdisk.storagedriver_client.make_locked_client(
                                str(vdisk.volume_id)) as locked_client:
                            ScheduledTaskController._logger.info(
                                'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'
                                .format(vpool.name, storagerouter.name,
                                        vdisk.name))
                            work_units = locked_client.get_scrubbing_workunits()
                            for work_unit in work_units:
                                res = locked_client.scrub(
                                    work_unit=work_unit,
                                    scratch_dir=scrub_directory,
                                    log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                    backend_config=Configuration.get_configuration_path(backend_config_key))
                                locked_client.apply_scrubbing_result(
                                    scrubbing_work_result=res)
                            if work_units:
                                ScheduledTaskController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name, len(work_units)))
                            else:
                                ScheduledTaskController._logger.info(
                                    'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'
                                    .format(vpool.name, storagerouter.name,
                                            vdisk.name))
                    except Exception:
                        if vdisk is None:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(
                                vpool.name, storagerouter.name, vdisk_guid)
                        else:
                            message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(
                                vpool.name, storagerouter.name, vdisk.name)
                        error_messages.append(message)
                        ScheduledTaskController._logger.exception(message)

        except Empty:  # Raised when all items have been fetched from the queue
            ScheduledTaskController._logger.info(
                'Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'
                .format(vpool.name, storagerouter.name))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(
                vpool.name, storagerouter.name)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)

        # Delete the proxy again
        try:
            with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
                ScheduledTaskController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
                client = SSHClient(storagerouter, 'root')
                client.dir_delete(scrub_directory)
                if ServiceManager.has_service(alba_proxy_service,
                                              client=client):
                    ServiceManager.stop_service(alba_proxy_service,
                                                client=client)
                    ServiceManager.remove_service(alba_proxy_service,
                                                  client=client)
                if Configuration.exists(scrub_config_key):
                    Configuration.delete(scrub_config_key)
                ScheduledTaskController._logger.info(
                    'Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'
                    .format(vpool.name, storagerouter.name,
                            alba_proxy_service))
        except Exception:
            message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(
                vpool.name, storagerouter.name, alba_proxy_service)
            error_messages.append(message)
            ScheduledTaskController._logger.exception(message)
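
The scrub worker drains the shared queue with non-blocking queue.get(False) calls until Empty is raised, and collects per-vDisk failures in error_messages instead of aborting the whole run. A minimal sketch of that drain-and-collect pattern with generic work items in place of vDisk guids (the drain_queue helper is hypothetical):

from Queue import Queue, Empty  # Python 2, as in the listings above (the queue module in Python 3)

def drain_queue(queue, handler, error_messages):
    """Process every item in the queue; collect failures instead of raising."""
    try:
        while True:
            item = queue.get(False)  # Non-blocking; raises Empty when the queue is drained
            try:
                handler(item)
            except Exception as ex:
                error_messages.append('Item {0} failed: {1}'.format(item, ex))
    except Empty:  # Raised when all items have been fetched from the queue
        pass
    return error_messages

if __name__ == '__main__':
    work = Queue()
    for guid in ['disk-1', 'disk-2', 'disk-3']:
        work.put(guid)

    def handler(guid):
        if guid == 'disk-2':
            raise RuntimeError('simulated scrub failure')

    errors = []
    drain_queue(work, handler, errors)
    print(errors)  # ['Item disk-2 failed: simulated scrub failure']
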