Example #1
0
    def get_mds_storagedriver_config_set(vpool):
        """
        Builds a per-StorageRouter MDS configuration for a given VPool, with following goals:
        * The primary (first) MDS entry is the one local to the StorageRouter
        * All slave entries live on different hosts
        * At most `mds.safety` entries are returned per StorageRouter
        """

        # Preferred MDS endpoint per StorageRouter guid, and StorageRouter guids grouped by MDS load
        endpoint_per_sr = {}
        srs_per_load = {}
        for storagedriver in vpool.storagedrivers:
            storagerouter = storagedriver.storagerouter
            preferred_mds, current_load = MDSServiceController.get_preferred_mds(storagerouter, vpool, include_load=True)
            endpoint_per_sr[storagerouter.guid] = {'host': storagerouter.ip, 'port': preferred_mds.service.ports[0]}
            srs_per_load.setdefault(current_load, []).append(storagerouter.guid)

        safety = Configuration.getInt('ovs.storagedriver.mds.safety')
        config_set = {}
        for sr_guid in endpoint_per_sr:
            # Local MDS always comes first; fill the remaining slots with the least-loaded others
            entries = [endpoint_per_sr[sr_guid]]
            config_set[sr_guid] = entries
            for current_load in sorted(srs_per_load):
                if len(entries) >= safety:
                    break
                candidates = srs_per_load[current_load]
                random.shuffle(candidates)  # Spread slave selection across equally-loaded routers
                for candidate_guid in candidates:
                    if len(entries) >= safety:
                        break
                    if candidate_guid != sr_guid:
                        entries.append(endpoint_per_sr[candidate_guid])
        return config_set
 def test_single_node(self):
     """Creating a single-node cluster must write the expected Arakoon config on that node."""
     arakoon_port = Configuration.getInt('ovs.ports.arakoon')
     cluster_name = 'one'
     first_node = sorted(TestArakoonInstaller.nodes.keys())[0]
     ArakoonInstaller.create_cluster(cluster_name, first_node, [])
     actual = SSHClient.load(first_node).file_read(self._get_config_path(cluster_name))
     expected = TestArakoonInstaller.expected_global.format(cluster_name, TestArakoonInstaller.nodes[first_node])
     expected += TestArakoonInstaller.expected_base.format(TestArakoonInstaller.nodes[first_node], first_node, arakoon_port, arakoon_port + 1)
     self.assertEqual(actual.strip(), expected.strip())
Example #3
0
 def mds_checkup():
     """
     Validates the current MDS setup/configuration and takes actions where required
     """
     # Group every MDS service per vpool, per StorageRouter
     mds_dict = {}
     for vpool in VPoolList.get_vpools():
         for mds_service in vpool.mds_services:
             storagerouter = mds_service.service.storagerouter
             mds_dict.setdefault(vpool, {}).setdefault(storagerouter, []).append(mds_service)
     for vpool, mds_per_storagerouter in mds_dict.items():
         # 1. First, make sure there's at least one MDS on every StorageRouter that's not overloaded
         # If not, create an extra MDS for that StorageRouter
         for storagerouter, mds_services in mds_per_storagerouter.items():
             has_room = False
             for mds_service in mds_services:
                 current_load, _ = MDSServiceController.get_mds_load(mds_service)
                 if current_load < Configuration.getInt('ovs.storagedriver.mds.maxload'):
                     has_room = True
                     break
             if has_room is False:
                 client = SSHClient.load(storagerouter.ip)
                 new_mds_service = MDSServiceController.prepare_mds_service(client, storagerouter, vpool,
                                                                           fresh_only=False, start=True)
                 if new_mds_service is None:
                     raise RuntimeError('Could not add MDS node')
                 mds_services.append(new_mds_service)
         # Push the refreshed MDS node set into every StorageDriver's filesystem config
         mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool)
         for storagerouter in mds_per_storagerouter:
             client = SSHClient.load(storagerouter.ip)
             storagedriver_config = StorageDriverConfiguration('storagedriver', vpool.name)
             storagedriver_config.load(client)
             if storagedriver_config.is_new is False:
                 storagedriver_config.clean()  # Clean out obsolete values
                 storagedriver_config.configure_filesystem(
                     fs_metadata_backend_mds_nodes=mds_config_set[storagerouter.guid]
                 )
                 storagedriver_config.save(client)
         # 2. Per VPool, execute a safety check, making sure the master/slave configuration is optimal.
         for vdisk in vpool.vdisks:
             MDSServiceController.ensure_safety(vdisk)
 def test_multi_node(self):
     """Extending then shrinking a cluster must keep every remaining node's config consistent."""
     arakoon_port = Configuration.getInt('ovs.ports.arakoon')
     cluster_name = 'one'
     all_nodes = sorted(TestArakoonInstaller.nodes.keys())
     ArakoonInstaller.create_cluster(cluster_name, all_nodes[0], [])
     for extra_node in all_nodes[1:]:
         ArakoonInstaller.extend_cluster(all_nodes[0], extra_node, cluster_name, [])
     # After extending, every node must hold the same full-cluster config
     expected = TestArakoonInstaller.expected_global.format(cluster_name, ','.join(TestArakoonInstaller.nodes[n] for n in all_nodes))
     for n in all_nodes:
         expected += TestArakoonInstaller.expected_base.format(TestArakoonInstaller.nodes[n], n, arakoon_port, arakoon_port + 1)
     expected = expected.strip()
     for n in all_nodes:
         contents = SSHClient.load(n).file_read(self._get_config_path(cluster_name))
         self.assertEqual(contents.strip(), expected)
     # Remove the first node and verify the remaining nodes' configs
     removed_node = all_nodes[0]
     ArakoonInstaller.shrink_cluster(all_nodes[1], removed_node, cluster_name)
     remaining = all_nodes[1:]
     expected = TestArakoonInstaller.expected_global.format(cluster_name, ','.join(TestArakoonInstaller.nodes[n] for n in remaining))
     for n in remaining:
         expected += TestArakoonInstaller.expected_base.format(TestArakoonInstaller.nodes[n], n, arakoon_port, arakoon_port + 1)
     expected = expected.strip()
     for n in remaining:
         contents = SSHClient.load(n).file_read(self._get_config_path(cluster_name))
         self.assertEqual(contents.strip(), expected)
Example #5
0
    def ensure_safety(vdisk, excluded_storagerouters=None):
        """
        Ensures (or tries to ensure) the safety of a given vdisk (except hypervisor).
        Assumptions:
        * A local overloaded master is better than a non-local non-overloaded master
        * Prefer master/services to be on different hosts, a subsequent slave on the same node doesn't add safety
        * Don't actively overload services (e.g. configure an MDS as slave causing it to get overloaded)
        * Too much safety is not wanted (it adds loads to nodes while not required)

        :param vdisk: The VDisk whose MDS master/slave configuration should be validated/repaired
        :param excluded_storagerouters: Optional list of StorageRouters whose MDS services may not be used
        """

        logger.debug('Ensuring MDS safety for vdisk {0}'.format(vdisk.guid))
        vdisk.reload_client()
        if excluded_storagerouters is None:
            excluded_storagerouters = []
        # Tunables: maximum load per MDS, desired number of distinct hosts, tlog catch-up threshold
        maxload = Configuration.getInt('ovs.storagedriver.mds.maxload')
        safety = Configuration.getInt('ovs.storagedriver.mds.safety')
        tlogs = Configuration.getInt('ovs.storagedriver.mds.tlogs')
        # All candidate MDS services of the vpool, excluding those on excluded StorageRouters
        services = [mds_service.service for mds_service in vdisk.vpool.mds_services
                    if mds_service.service.storagerouter not in excluded_storagerouters]
        nodes = set(service.storagerouter.ip for service in services)
        services_load = {}
        service_per_key = {}
        for service in services:
            # get_mds_load returns two figures; load_plus presumably is the load including one
            # extra tenant (this vdisk) - TODO confirm against get_mds_load
            load, load_plus = MDSServiceController.get_mds_load(service.mds_service)
            services_load[service.guid] = load, load_plus
            # 'ip:port' keys match the entries of the vdisk's metadata backend config below
            service_per_key['{0}:{1}'.format(service.storagerouter.ip, service.ports[0])] = service

        # List current configuration and filter out excluded services
        reconfigure_required = False
        reconfigure_reasons = []
        vdisk.invalidate_dynamics(['info', 'storagedriver_id', 'storagerouter_guid'])
        configs = vdisk.info['metadata_backend_config']
        for config in configs:
            config['key'] = '{0}:{1}'.format(config['ip'], config['port'])
        # The first config entry (if any) is the current master; the rest are slaves
        master_service = None
        if len(configs) > 0:
            config = configs[0]
            if config['key'] in service_per_key:
                master_service = service_per_key.get(config['key'])
                configs.remove(config)
            else:
                reconfigure_required = True
                reconfigure_reasons.append('Master ({0}:{1}) cannot be used anymore'.format(config['ip'], config['port']))
        slave_services = []
        for config in configs:
            if config['key'] in service_per_key:
                slave_services.append(service_per_key[config['key']])
            else:
                reconfigure_required = True
                reconfigure_reasons.append('Slave ({0}:{1}) cannot be used anymore'.format(config['ip'], config['port']))

        # Fix services_load
        # Collapse the (load, load_plus) tuple to a single figure: services already serving this
        # vdisk keep their current load; all others count at their load_plus figure
        services_per_load = {}
        for service in services:
            if service == master_service or service in slave_services:
                load = services_load[service.guid][0]
            else:
                load = services_load[service.guid][1]
            services_load[service.guid] = load
            if load not in services_per_load:
                services_per_load[load] = []
            services_per_load[load].append(service)

        # Further checks if a reconfiguration is required.
        service_nodes = []
        if master_service is not None:
            service_nodes.append(master_service.storagerouter.ip)
        for service in slave_services:
            ip = service.storagerouter.ip
            if ip in service_nodes:
                reconfigure_required = True
                reconfigure_reasons.append('Multiple MDS services on the same node')
            else:
                service_nodes.append(ip)
        if len(service_nodes) > safety:
            # Too much safety
            reconfigure_required = True
            reconfigure_reasons.append('Too much safety')
        if len(service_nodes) < safety and len(service_nodes) < len(nodes):
            # Insufficient MDS services configured while there should be sufficient nodes available
            reconfigure_required = True
            reconfigure_reasons.append('Not enough safety')
        if master_service is not None and services_load[master_service.guid] > maxload:
            # The master service is overloaded
            reconfigure_required = True
            reconfigure_reasons.append('Master overloaded')
        if master_service is not None and master_service.storagerouter_guid != vdisk.storagerouter_guid:
            # The master is not local
            reconfigure_required = True
            reconfigure_reasons.append('Master is not local')
        if any(service for service in slave_services if services_load[service.guid] > maxload):
            # There's a slave service overloaded
            reconfigure_required = True
            reconfigure_reasons.append('One or more slaves overloaded')

        if reconfigure_required is False:
            logger.debug('No reconfiguration required for vdisk {0}'.format(vdisk.guid))
            MDSServiceController.sync_vdisk_to_reality(vdisk)
            return

        logger.debug('Reconfiguration required for vdisk {0}:'.format(vdisk.guid))
        for reason in reconfigure_reasons:
            logger.debug('Reason: {0} - vdisk {1}'.format(reason, vdisk.guid))
        # Prepare fresh configuration
        new_services = []

        # Check whether the master (if available) is non-local to the vdisk and/or is overloaded
        master_ok = master_service is not None
        if master_ok is True:
            master_ok = master_service.storagerouter_guid == vdisk.storagerouter_guid and services_load[master_service.guid] <= maxload

        if master_ok:
            # Add this master to the fresh configuration
            new_services.append(master_service)
        else:
            # Try to find the best non-overloaded local MDS (slave)
            candidate_master = None
            candidate_master_load = 0
            local_mds = None
            local_mds_load = 0
            for service in services:
                load = services_load[service.guid]
                if load <= maxload and service.storagerouter_guid == vdisk.storagerouter_guid:
                    if local_mds is None or local_mds_load > load:
                        # This service is a non-overloaded local MDS
                        local_mds = service
                        local_mds_load = load
                    if service in slave_services:
                        if candidate_master is None or candidate_master_load > load:
                            # This service is a non-overloaded local slave
                            candidate_master = service
                            candidate_master_load = load
            if candidate_master is not None:
                # A non-overloaded local slave was found.
                client = MetadataServerClient.load(candidate_master)
                # True presumably means dry-run: only report the amount of tlogs the slave is
                # behind - TODO confirm against MetadataServerClient.catch_up
                amount_of_tlogs = client.catch_up(str(vdisk.volume_id), True)
                if amount_of_tlogs < tlogs:
                    # Almost there. Catching up right now, and continue as soon as it's up-to-date
                    start = time.time()
                    client.catch_up(str(vdisk.volume_id), False)
                    logger.debug('MDS catch up for vdisk {0} took {1}s'.format(vdisk.guid, round(time.time() - start, 2)))
                    # It's up to date, so add it as a new master
                    new_services.append(candidate_master)
                    if master_service is not None:
                        # The current master (if available) is now candidate for become one of the slaves
                        slave_services.append(master_service)
                else:
                    # It's not up to date, keep the previous master (if available) and give the local slave
                    # some more time to catch up
                    if master_service is not None:
                        new_services.append(master_service)
                    new_services.append(candidate_master)
                if candidate_master in slave_services:
                    slave_services.remove(candidate_master)
            else:
                # There's no non-overloaded local slave found. Keep the current master (if available) and add
                # a local MDS (if available) as slave
                if master_service is not None:
                    new_services.append(master_service)
                if local_mds is not None:
                    new_services.append(local_mds)
                    if local_mds in slave_services:
                        slave_services.remove(local_mds)

        # At this point, there might (or might not) be a (new) master, and a (catching up) slave. The rest of the non-local
        # MDS nodes must now be added to the configuration until the safety is reached. There's always one extra
        # slave recycled to make sure there's always an (almost) up-to-date slave ready for failover
        loads = sorted(load for load in services_per_load.keys() if load <= maxload)
        # NOTE: 'nodes' is rebound here from "all eligible ips" to "ips already in the new config"
        nodes = set(service.storagerouter.ip for service in new_services)
        slave_added = False
        # First pass: recycle at most one existing slave (already almost up-to-date) for fast failover
        if len(nodes) < safety:
            for load in loads:
                for service in services_per_load[load]:
                    if slave_added is False and service in slave_services and service.storagerouter.ip not in nodes:
                        new_services.append(service)
                        slave_services.remove(service)
                        nodes.add(service.storagerouter.ip)
                        slave_added = True
        # Second pass: fill the remaining slots with the least-loaded services on new hosts
        if len(nodes) < safety:
            for load in loads:
                for service in services_per_load[load]:
                    if len(nodes) < safety and service.storagerouter.ip not in nodes:
                        new_services.append(service)
                        nodes.add(service.storagerouter.ip)

        # Build the new configuration and update the vdisk
        configs = []
        for service in new_services:
            client = MetadataServerClient.load(service)
            client.create_namespace(str(vdisk.volume_id))
            configs.append(MDSNodeConfig(address=str(service.storagerouter.ip),
                                         port=service.ports[0]))
        vdisk.storagedriver_client.update_metadata_backend_config(
            volume_id=str(vdisk.volume_id),
            metadata_backend_config=MDSMetaDataBackendConfig(configs)
        )
        MDSServiceController.sync_vdisk_to_reality(vdisk)
        logger.debug('Ensuring MDS safety for vdisk {0} completed'.format(vdisk.guid))