예제 #1
0
    def test_alba_arakoon_checkup(self):
        """
        Validates whether the ALBA Arakoon checkup works (Manual and Scheduled)

        Scenario:
        * Scheduled checkup on 1 StorageRouter: no clusters are added
        * Scheduled checkup after adding 2 StorageRouters: ABM grows from 1 to 3 services, NSM stays at 1
        * Scheduled checkup with 1 unreachable StorageRouter: the unreachable node is logged
        * Manual checkup without clusters specified: nothing changes
        * Manual checkup error paths: invalid argument combinations raise a ValueError
        * Manual checkup with unclaimed external clusters: exactly the requested clusters get claimed
        """
        def _current_arakoon_clusters():
            # Always re-read the configuration: comparing against a list captured before the
            # checkup ran would make the 'did not change' assertions pass unconditionally
            return sorted(Configuration.list('/ovs/arakoon'))

        ovs_structure = DalHelper.build_dal_structure(
            structure={'storagerouters': [1]})
        alba_structure = AlbaDalHelper.build_dal_structure(
            structure={'alba_backends': [[1, 'LOCAL']]})

        #############################
        # SCHEDULED_ARAKOON_CHECKUP #
        #############################
        # Create an ABM and NSM cluster for ALBA Backend 1 and do some basic validations
        sr_1 = ovs_structure['storagerouters'][1]
        ab_1 = alba_structure['alba_backends'][1]
        MockedSSHClient._run_returns[sr_1.ip] = {}
        MockedSSHClient._run_returns[sr_1.ip][
            'ln -s /usr/lib/alba/albamgr_plugin.cmxs /tmp/unittest/sr_1/disk_1/partition_1/arakoon/backend_1-abm/db'] = None
        MockedSSHClient._run_returns[sr_1.ip][
            'ln -s /usr/lib/alba/nsm_host_plugin.cmxs /tmp/unittest/sr_1/disk_1/partition_1/arakoon/backend_1-nsm_0/db'] = None
        AlbaController.add_cluster(ab_1.guid)

        abm_cluster_name = '{0}-abm'.format(ab_1.name)
        nsm_cluster_name = '{0}-nsm_0'.format(ab_1.name)
        expected_clusters = [abm_cluster_name, nsm_cluster_name]
        self.assertListEqual(list1=expected_clusters,
                             list2=_current_arakoon_clusters())

        abm_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=abm_cluster_name)
        nsm_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
            cluster_name=nsm_cluster_name)
        self.assertTrue(expr=abm_metadata['in_use'])
        self.assertTrue(expr=nsm_metadata['in_use'])

        # Run scheduled Arakoon checkup and validate amount of Arakoon clusters did not change
        AlbaArakoonController.scheduled_alba_arakoon_checkup()
        self.assertListEqual(list1=expected_clusters,
                             list2=_current_arakoon_clusters())
        self.assertEqual(first=len(ab_1.abm_cluster.abm_services), second=1)
        self.assertEqual(first=len(ab_1.nsm_clusters), second=1)
        self.assertEqual(first=len(ab_1.nsm_clusters[0].nsm_services),
                         second=1)

        # Create 2 additional StorageRouters
        srs = DalHelper.build_dal_structure(
            structure={'storagerouters': [2, 3]},
            previous_structure=ovs_structure)['storagerouters']
        sr_2 = srs[2]
        sr_3 = srs[3]

        # Run scheduled checkup again and do some validations
        MockedSSHClient._run_returns[sr_2.ip] = {}
        MockedSSHClient._run_returns[sr_3.ip] = {}
        MockedSSHClient._run_returns[sr_2.ip][
            'ln -s /usr/lib/alba/albamgr_plugin.cmxs /tmp/unittest/sr_2/disk_1/partition_1/arakoon/backend_1-abm/db'] = None
        MockedSSHClient._run_returns[sr_3.ip][
            'ln -s /usr/lib/alba/albamgr_plugin.cmxs /tmp/unittest/sr_3/disk_1/partition_1/arakoon/backend_1-abm/db'] = None
        MockedSSHClient._run_returns[sr_2.ip][
            'arakoon --node {0} -config file://opt/OpenvStorage/config/framework.json?key=/ovs/arakoon/backend_1-abm/config -catchup-only'
            .format(sr_2.machine_id)] = None
        MockedSSHClient._run_returns[sr_3.ip][
            'arakoon --node {0} -config file://opt/OpenvStorage/config/framework.json?key=/ovs/arakoon/backend_1-abm/config -catchup-only'
            .format(sr_3.machine_id)] = None
        AlbaArakoonController.scheduled_alba_arakoon_checkup()
        self.assertListEqual(list1=expected_clusters,
                             list2=_current_arakoon_clusters())
        self.assertEqual(first=len(ab_1.abm_cluster.abm_services),
                         second=3)  # Gone up from 1 to 3
        self.assertEqual(first=len(ab_1.nsm_clusters), second=1)
        self.assertEqual(first=len(ab_1.nsm_clusters[0].nsm_services),
                         second=1)  # Still 1 since NSM checkup hasn't ran yet

        # Make sure 1 StorageRouter is unreachable
        SSHClient._raise_exceptions[sr_3.ip] = {
            'users': ['ovs'],
            'exception': UnableToConnectException('No route to host')
        }
        AlbaArakoonController.scheduled_alba_arakoon_checkup()
        alba_logs = Logger._logs.get('lib', [])
        self.assertIn(
            member='Storage Router with IP {0} is not reachable'.format(
                sr_3.ip),
            container=alba_logs)

        ##########################
        # MANUAL_ARAKOON_CHECKUP #
        ##########################
        AlbaDalHelper.setup()  # Clear everything
        ovs_structure = DalHelper.build_dal_structure(
            structure={'storagerouters': [1]})
        alba_structure = AlbaDalHelper.build_dal_structure(
            structure={'alba_backends': [[1, 'LOCAL']]})
        sr_1 = ovs_structure['storagerouters'][1]
        ab_1 = alba_structure['alba_backends'][1]
        MockedSSHClient._run_returns[sr_1.ip] = {}
        MockedSSHClient._run_returns[sr_1.ip][
            'ln -s /usr/lib/alba/albamgr_plugin.cmxs /tmp/unittest/sr_1/disk_1/partition_1/arakoon/backend_1-abm/db'] = None
        MockedSSHClient._run_returns[sr_1.ip][
            'ln -s /usr/lib/alba/nsm_host_plugin.cmxs /tmp/unittest/sr_1/disk_1/partition_1/arakoon/backend_1-nsm_0/db'] = None
        AlbaController.add_cluster(ab_1.guid)

        # Run manual Arakoon checkup and validate amount of Arakoon clusters did not change
        AlbaArakoonController.manual_alba_arakoon_checkup(
            alba_backend_guid=ab_1.guid, nsm_clusters=[], abm_cluster=None)
        self.assertListEqual(list1=expected_clusters,
                             list2=_current_arakoon_clusters())
        self.assertEqual(first=len(ab_1.abm_cluster.abm_services), second=1)
        self.assertEqual(first=len(ab_1.nsm_clusters), second=1)
        self.assertEqual(first=len(ab_1.nsm_clusters[0].nsm_services),
                         second=1)

        # Test some error paths
        # NSM clusters passed without an ABM cluster
        with self.assertRaises(ValueError) as raise_info:
            AlbaArakoonController.manual_alba_arakoon_checkup(
                alba_backend_guid=ab_1.guid,
                nsm_clusters=['no_abm_cluster_passed'])
        self.assertEqual(
            first=raise_info.exception.message,
            second='Both ABM cluster and NSM clusters must be provided')
        # ABM cluster passed without NSM clusters
        with self.assertRaises(ValueError) as raise_info:
            AlbaArakoonController.manual_alba_arakoon_checkup(
                alba_backend_guid=ab_1.guid,
                nsm_clusters=[],
                abm_cluster='no_nsm_clusters_passed')
        self.assertEqual(
            first=raise_info.exception.message,
            second='Both ABM cluster and NSM clusters must be provided')
        # Clusters which are already in use by this ALBA Backend
        with self.assertRaises(ValueError) as raise_info:
            AlbaArakoonController.manual_alba_arakoon_checkup(
                alba_backend_guid=ab_1.guid,
                nsm_clusters=[nsm_cluster_name],
                abm_cluster=abm_cluster_name)
        self.assertEqual(first=raise_info.exception.message,
                         second='Cluster {0} has already been claimed'.format(
                             abm_cluster_name))
        # Clusters which do not exist at all
        with self.assertRaises(ValueError) as raise_info:
            AlbaArakoonController.manual_alba_arakoon_checkup(
                alba_backend_guid=ab_1.guid,
                nsm_clusters=['non-existing-nsm-cluster'],
                abm_cluster='non-existing-abm-cluster')
        self.assertEqual(
            first=raise_info.exception.message,
            second='Could not find an Arakoon cluster with name: non-existing-abm-cluster')

        # Recreate ALBA Backend with Arakoon clusters
        AlbaDalHelper.setup()  # Clear everything
        ovs_structure = DalHelper.build_dal_structure(
            structure={'storagerouters': [1]})
        alba_structure = AlbaDalHelper.build_dal_structure(
            structure={'alba_backends': [[1, 'LOCAL']]})
        sr_1 = ovs_structure['storagerouters'][1]
        ab_1 = alba_structure['alba_backends'][1]

        # Create some Arakoon clusters to be claimed by the manual checkup
        for cluster_name, cluster_type in {
                'manual-abm-1': ServiceType.ARAKOON_CLUSTER_TYPES.ABM,
                'manual-abm-2': ServiceType.ARAKOON_CLUSTER_TYPES.ABM,
                'manual-nsm-1': ServiceType.ARAKOON_CLUSTER_TYPES.NSM,
                'manual-nsm-2': ServiceType.ARAKOON_CLUSTER_TYPES.NSM,
                'manual-nsm-3': ServiceType.ARAKOON_CLUSTER_TYPES.NSM
        }.iteritems():
            arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
            arakoon_installer.create_cluster(
                cluster_type=cluster_type,
                ip=sr_1.ip,
                base_dir=DalHelper.CLUSTER_DIR.format(cluster_name),
                internal=False)
            arakoon_installer.start_cluster()
            # Release the cluster again so the manual checkup is able to claim it
            arakoon_installer.unclaim_cluster()
        AlbaArakoonController.manual_alba_arakoon_checkup(
            alba_backend_guid=ab_1.guid,
            nsm_clusters=['manual-nsm-1', 'manual-nsm-3'],
            abm_cluster='manual-abm-2')

        # Validate the correct clusters have been claimed by the manual checkup
        unused_abms = ArakoonInstaller.get_unused_arakoon_clusters(
            cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.ABM)
        unused_nsms = ArakoonInstaller.get_unused_arakoon_clusters(
            cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM)
        self.assertEqual(first=len(unused_abms), second=1)
        self.assertEqual(first=len(unused_nsms), second=1)
        self.assertEqual(first=unused_abms[0]['cluster_name'],
                         second='manual-abm-1')
        self.assertEqual(first=unused_nsms[0]['cluster_name'],
                         second='manual-nsm-2')
    def nsm_checkup(alba_backend_guid=None,
                    min_internal_nsms=1,
                    external_nsm_cluster_names=None):
        # type: (Optional[str], Optional[int], Optional[List[str]]) -> None
        """
        Validates the current NSM setup/configuration and takes actions where required.
        Assumptions:
        * A 2 node NSM is considered safer than a 1 node NSM.
        * When adding an NSM, the nodes with the least amount of NSM participation are preferred

        The checkup first validates the arguments and the state of every targeted ALBA Backend, then opens
        an SSH connection to every StorageRouter it depends on and finally processes the ALBA Backends one by one.
        Errors while processing a single ALBA Backend are logged and the name of that ALBA Backend is recorded,
        after which the next ALBA Backend is processed.

        :param alba_backend_guid: Run for a specific ALBA Backend
                                  When None, all ALBA Backends whose Backend has status 'RUNNING' are checked
        :type alba_backend_guid: str
        :param min_internal_nsms: Minimum amount of NSM hosts that need to be provided
        :type min_internal_nsms: int
        :param external_nsm_cluster_names: Information about the additional clusters to claim (only for externally managed Arakoon clusters)
        :type external_nsm_cluster_names: list
        :raises ValueError: When the arguments are invalid or conflicting, when a specified cluster does not exist or
                            has already been claimed, when an ALBA Backend has no ABM cluster / ABM services or
                            when the maximum amount of NSM clusters would be exceeded
        :raises RuntimeError: When one of the required StorageRouters is not reachable
        :return: None
        :rtype: NoneType
        """
        ###############
        # Validations #
        ###############
        if external_nsm_cluster_names is None:
            external_nsm_cluster_names = []
        AlbaArakoonController._logger.info('NSM checkup started')
        if min_internal_nsms < 1:
            raise ValueError(
                'Minimum amount of NSM clusters must be 1 or more')

        if not isinstance(external_nsm_cluster_names, list):
            raise ValueError(
                "'external_nsm_cluster_names' must be of type 'list'")

        if len(external_nsm_cluster_names) > 0:
            if alba_backend_guid is None:
                raise ValueError(
                    'Additional NSMs can only be configured for a specific ALBA Backend'
                )
            if min_internal_nsms > 1:
                raise ValueError(
                    "'min_internal_nsms' and 'external_nsm_cluster_names' are mutually exclusive"
                )

            external_nsm_cluster_names = list(set(
                external_nsm_cluster_names))  # Remove duplicate cluster names
            # Every specified cluster must be known, translate a failed lookup into a ValueError
            for cluster_name in external_nsm_cluster_names:
                try:
                    ArakoonInstaller.get_arakoon_metadata_by_cluster_name(
                        cluster_name=cluster_name)
                except NotFoundException:
                    raise ValueError(
                        'Arakoon cluster with name {0} does not exist'.format(
                            cluster_name))

        # Without a specific ALBA Backend, only the ALBA Backends with a running Backend are checked
        if alba_backend_guid is None:
            alba_backends = [
                alba_backend
                for alba_backend in AlbaBackendList.get_albabackends()
                if alba_backend.backend.status == 'RUNNING'
            ]
        else:
            alba_backends = [AlbaBackend(alba_backend_guid)]

        # Validate each ALBA Backend and collect the StorageRouters an SSH connection is required to
        masters = StorageRouterList.get_masters()
        storagerouters = set()
        for alba_backend in alba_backends:
            if alba_backend.abm_cluster is None:
                raise ValueError(
                    'No ABM cluster found for ALBA Backend {0}'.format(
                        alba_backend.name))
            if len(alba_backend.abm_cluster.abm_services) == 0:
                raise ValueError(
                    'ALBA Backend {0} does not have any registered ABM services'
                    .format(alba_backend.name))
            if len(alba_backend.nsm_clusters) + len(
                    external_nsm_cluster_names) > MAX_NSM_AMOUNT:
                raise ValueError(
                    'The maximum of {0} NSM Arakoon clusters will be exceeded. Amount of clusters that can be deployed for this ALBA Backend: {1}'
                    .format(MAX_NSM_AMOUNT,
                            MAX_NSM_AMOUNT - len(alba_backend.nsm_clusters)))
            # Validate enough externally managed Arakoon clusters are available
            # The first ABM service determines whether the ALBA Backend is treated as externally managed here
            if alba_backend.abm_cluster.abm_services[
                    0].service.is_internal is False:
                unused_cluster_names = set([
                    cluster_info['cluster_name'] for cluster_info in
                    ArakoonInstaller.get_unused_arakoon_clusters(
                        cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.NSM)
                ])
                if set(external_nsm_cluster_names).difference(
                        unused_cluster_names):
                    raise ValueError(
                        'Some of the provided cluster_names have already been claimed before'
                    )
                storagerouters.update(
                    set(masters)
                )  # For externally managed we need an available master node
            else:
                for abm_service in alba_backend.abm_cluster.abm_services:  # For internally managed we need all StorageRouters online
                    storagerouters.add(abm_service.service.storagerouter)
                for nsm_cluster in alba_backend.nsm_clusters:  # For internally managed we need all StorageRouters online
                    for nsm_service in nsm_cluster.nsm_services:
                        storagerouters.add(nsm_service.service.storagerouter)

        # Open a connection to every collected StorageRouter, a single unreachable one aborts the entire checkup
        ssh_clients = {}
        for storagerouter in storagerouters:
            try:
                ssh_clients[storagerouter] = SSHClient(endpoint=storagerouter)
            except UnableToConnectException:
                raise RuntimeError(
                    'StorageRouter {0} with IP {1} is not reachable'.format(
                        storagerouter.name, storagerouter.ip))

        version_str = AlbaArakoonInstaller.get_alba_version_string()
        nsm_installer = NSMInstaller(version_str=version_str,
                                     ssh_clients=ssh_clients)

        ##################
        # Check Clusters #
        ##################
        # NOTE(review): 'safety' and 'maxload' are only used for logging in the code visible here
        safety = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.safety')
        maxload = Configuration.get(
            '/ovs/framework/plugins/alba/config|nsm.maxload')

        AlbaArakoonController._logger.debug(
            'NSM safety is configured at: {0}'.format(safety))
        AlbaArakoonController._logger.debug(
            'NSM max load is configured at: {0}'.format(maxload))

        # 'master_client' is initialized once and not reset per ALBA Backend, so once found it stays set
        master_client = None
        # Names of the ALBA Backends for which the checkup raised
        # NOTE(review): only appended to in the code visible here, presumably consumed after the loop - confirm
        failed_backends = []
        for alba_backend in alba_backends:
            try:
                # Gather information
                AlbaArakoonController._logger.info(
                    'ALBA Backend {0} - Ensuring NSM safety'.format(
                        alba_backend.name))

                internal = AlbaArakoonInstaller.is_internally_managed(
                    alba_backend)
                nsm_loads = AlbaArakoonController.get_nsm_loads(alba_backend)
                nsm_storagerouters = AlbaArakoonController.get_nsms_per_storagerouter(
                    alba_backend)
                sorted_nsm_clusters = sorted(alba_backend.nsm_clusters,
                                             key=lambda k: k.number)

                # Claiming externally managed clusters requires a connection to a MASTER node
                if not internal and len(external_nsm_cluster_names) > 0:
                    for sr, cl in ssh_clients.iteritems():
                        if sr.node_type == 'MASTER':
                            master_client = cl
                            break
                    if master_client is None:
                        # Internal is False and we specified the NSM clusters to claim, but no MASTER nodes online
                        raise ValueError(
                            'Could not find an online master node')

                AlbaArakoonController._logger.debug(
                    'ALBA Backend {0} - Arakoon clusters are {1} managed'.
                    format(alba_backend.name,
                           'internally' if internal is True else 'externally'))
                for nsm_number, nsm_load in nsm_loads.iteritems():
                    AlbaArakoonController._logger.debug(
                        'ALBA Backend {0} - NSM Cluster {1} - Load {2}'.format(
                            alba_backend.name, nsm_number, nsm_load))
                for sr, count in nsm_storagerouters.iteritems():
                    AlbaArakoonController._logger.debug(
                        'ALBA Backend {0} - StorageRouter {1} - NSM Services {2}'
                        .format(alba_backend.name, sr.name, count))

                if internal:
                    # Extend existing NSM clusters if safety not met
                    for nsm_cluster in sorted_nsm_clusters:
                        AlbaArakoonController._logger.debug(
                            'ALBA Backend {0} - Processing NSM {1} - Expected safety {2} - Current safety {3}'
                            .format(alba_backend.name, nsm_cluster.number,
                                    safety, len(nsm_cluster.nsm_services)))
                        AlbaArakoonController.ensure_nsm_cluster_safety(
                            nsm_cluster,
                            nsm_storagerouters,
                            nsm_installer=nsm_installer)
                # The load check runs for both internally and externally managed ALBA Backends
                AlbaArakoonController.ensure_nsm_clusters_load(
                    alba_backend,
                    nsms_per_storagerouter=nsm_storagerouters,
                    ssh_clients=ssh_clients,
                    version_str=version_str,
                    min_internal_nsms=min_internal_nsms,
                    external_nsm_cluster_names=external_nsm_cluster_names)
            except Exception:
                # Deliberately broad: a failure for 1 ALBA Backend must not prevent the others from being checked
                AlbaArakoonController._logger.exception(
                    'NSM Checkup failed for Backend {0}'.format(
                        alba_backend.name))
                failed_backends.append(alba_backend.name)