def test_pvc_multiple_clone_performance(
        self,
        interface_iterate,
        teardown_factory,
        storageclass_factory,
        pvc_factory,
        pod_factory,
    ):
        """
        1. Create a PVC
           The PVC size is calculated in the test and depends on the storage capacity, but is not less than 1 GiB;
           it will use ~75% of the storage capacity (minimum storage capacity is 1 TiB)
        2. Fill the PVC with 70% of data
        3. Take a clone of the PVC and measure the creation time and speed by reading the start and end
            creation times from the relevant logs
        4. Repeat the previous step a number of times (the maximal num_of_clones is 512)
        5. Print all measured statistics for all the clones.

        Raises:
            StorageNotSufficientException: in case of not enough capacity on the cluster

        """
        num_of_clones = 512

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = int(ceph_cluster.get_ceph_capacity())

        # Use 70% of the storage capacity in the test
        capacity_to_use = int(ceph_capacity * 0.7)

        # Since we do not want to use more than 65%, we add 35% to the needed
        # capacity, and the minimum PVC size is 1 GiB
        need_capacity = int((num_of_clones + 2) * 1.35)
        # Test will run only on system with enough capacity
        if capacity_to_use < need_capacity:
            err_msg = (f"The system have only {ceph_capacity} GiB, "
                       f"we want to use only {capacity_to_use} GiB, "
                       f"and we need {need_capacity} GiB to run the test")
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        pvc_size = int(capacity_to_use / (num_of_clones + 2))
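        # Worked example (assuming a hypothetical 1 TiB cluster): with
        # ceph_capacity = 1024 GiB, capacity_to_use = int(1024 * 0.7) = 716 GiB
        # and need_capacity = int((512 + 2) * 1.35) = 693 GiB, so the test can run,
        # and pvc_size = int(716 / 514) = 1 GiB - the documented minimum.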

        self.interface = interface_iterate
        self.sc_obj = storageclass_factory(self.interface)

        if self.interface == constants.CEPHFILESYSTEM:
            sc = "CephFS"
        elif self.interface == constants.CEPHBLOCKPOOL:
            sc = "RBD"

        self.full_log_path = get_full_test_logs_path(cname=self)
        self.full_log_path += f"-{sc}"

        self.pvc_obj = pvc_factory(interface=self.interface,
                                   size=pvc_size,
                                   status=constants.STATUS_BOUND)

        self.pod_obj = pod_factory(interface=self.interface,
                                   pvc=self.pvc_obj,
                                   status=constants.STATUS_RUNNING)

        # Calculating the file size as 70% of the PVC size
        filesize = self.pvc_obj.size * 0.70
        # Change the file size to MB for the FIO function
        file_size = f"{int(filesize * constants.GB2MB)}M"
        file_name = self.pod_obj.name

        log.info(f"Total capacity size is : {ceph_capacity} GiB, "
                 f"Going to use {need_capacity} GiB, "
                 f"With {num_of_clones} clones to {pvc_size} GiB PVC. "
                 f"File size to be written is : {file_size} "
                 f"with the name of {file_name}")
        self.params = {}
        self.params["clonenum"] = f"{num_of_clones}"
        self.params["filesize"] = file_size
        self.params["ERRMSG"] = "Error in command"

        clone_yaml = self.build_params()
        performance_lib.write_fio_on_pod(self.pod_obj, file_size)

        # Running the test
        results = []
        for test_num in range(1, int(self.params["clonenum"]) + 1):
            log.info(f"Starting test number {test_num}")
            ct = self.create_clone(test_num, clone_yaml)
            speed = self.params["datasize"] / ct
            results.append({"Clone Num": test_num, "time": ct, "speed": speed})
            log.info(
                f"Results for clone number {test_num} are : "
                f"Creation time is {ct} secs, Creation speed {speed} MB/sec")

        for r in results:
            log.info(
                f"Clone number {r['Clone Num']} creation time is {r['time']} secs."
            )
            log.info(
                f"Clone number {r['Clone Num']} creation speed is {r['speed']} MB/sec."
            )

        creation_time_list = [r["time"] for r in results]
        average_creation_time = statistics.mean(creation_time_list)
        log.info(f"Average creation time is  {average_creation_time} secs.")

        creation_speed_list = [r["speed"] for r in results]
        average_creation_speed = statistics.mean(creation_speed_list)
        log.info(f"Average creation speed is  {average_creation_time} MB/sec.")

        self.results_path = get_full_test_logs_path(cname=self)
        # Produce ES report
        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid,
                self.crd_data,
                self.full_log_path,
                "pvc_multiple_clone_measurement",
            ))

        full_results.add_key("interface", self.interface)
        full_results.add_key("clones_num", num_of_clones)
        full_results.add_key("clone_size", pvc_size)
        full_results.add_key("multi_clone_creation_time", creation_time_list)
        full_results.add_key("multi_clone_creation_time_average",
                             average_creation_time)
        full_results.add_key("multi_clone_creation_speed", creation_speed_list)
        full_results.add_key("multi_clone_creation_speed_average",
                             average_creation_speed)

        # Write the test results into the ES server
        if full_results.es_write():
            res_link = full_results.results_link()
            log.info(f"The Result can be found at : {res_link}")

            # Create a text file with the results of all sub-tests (4 - according to the parameters)
            self.write_result_to_file(res_link)
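
# The clone measurement above relies on a create_clone() helper that is not shown
# in this snippet; the docstring says it reads the start/end creation times from
# the relevant logs. A minimal, hypothetical sketch of that measurement idea is
# below - the log-line format and function name are assumptions, not the real
# ocs-ci implementation.
from datetime import datetime


def creation_seconds_from_logs(start_log_line, end_log_line,
                               time_format="%Y-%m-%dT%H:%M:%SZ"):
    """Return the clone creation time in seconds from two log timestamps."""
    start = datetime.strptime(start_log_line.split()[0], time_format)
    end = datetime.strptime(end_log_line.split()[0], time_format)
    return (end - start).total_seconds()


# Example with assumed log lines:
# creation_seconds_from_logs("2021-01-01T10:00:00Z clone create started",
#                            "2021-01-01T10:00:42Z clone reached Bound")  # -> 42.0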
Example #2
class Sanity:
    """
    Class for cluster health and functional validations
    """
    def __init__(self):
        """
        Initializer for Sanity class - Init CephCluster() in order to
        set the cluster status before starting the tests
        """
        self.pvc_objs = list()
        self.pod_objs = list()
        self.obj_data = ""
        self.ceph_cluster = CephCluster()

    def health_check(self, cluster_check=True, tries=20):
        """
        Perform Ceph and cluster health checks
        """
        wait_for_cluster_connectivity(tries=400)
        logger.info("Checking cluster and Ceph health")
        node.wait_for_nodes_status(timeout=300)

        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'],
                          tries=tries)
        if cluster_check:
            self.ceph_cluster.cluster_health_check(timeout=60)

    def create_resources(self, pvc_factory, pod_factory, run_io=True):
        """
        Sanity validation - Create resources (FS and RBD) and run IO

        Args:
            pvc_factory (function): A call to pvc_factory function
            pod_factory (function): A call to pod_factory function
            run_io (bool): True for run IO, False otherwise

        """
        logger.info(
            "Creating resources and running IO as a sanity functional validation"
        )

        for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]:
            pvc_obj = pvc_factory(interface)
            self.pvc_objs.append(pvc_obj)
            self.pod_objs.append(pod_factory(pvc=pvc_obj, interface=interface))
        if run_io:
            for pod in self.pod_objs:
                pod.run_io('fs', '1G', runtime=30)
            for pod in self.pod_objs:
                get_fio_rw_iops(pod)
        self.create_obc()
        self.verify_obc()

    def create_obc(self):
        """
        OBC creation for RGW and NooBaa

        """
        if config.ENV_DATA['platform'] in constants.ON_PREM_PLATFORMS:
            obc_rgw = templating.load_yaml(constants.RGW_OBC_YAML)
            obc_rgw_data_yaml = tempfile.NamedTemporaryFile(
                mode='w+', prefix='obc_rgw_data', delete=False)
            templating.dump_data_to_temp_yaml(obc_rgw, obc_rgw_data_yaml.name)
            logger.info("Creating OBC for rgw")
            run_cmd(f"oc create -f {obc_rgw_data_yaml.name}", timeout=2400)
            self.obc_rgw = obc_rgw['metadata']['name']

        obc_nooba = templating.load_yaml(constants.MCG_OBC_YAML)
        obc_mcg_data_yaml = tempfile.NamedTemporaryFile(mode='w+',
                                                        prefix='obc_mcg_data',
                                                        delete=False)
        templating.dump_data_to_temp_yaml(obc_nooba, obc_mcg_data_yaml.name)
        logger.info("create OBC for mcg")
        run_cmd(f"oc create -f {obc_mcg_data_yaml.name}", timeout=2400)
        self.obc_mcg = obc_nooba['metadata']['name']

    def delete_obc(self):
        """
        Cleanup OBC resources created above

        """
        if config.ENV_DATA['platform'] in constants.ON_PREM_PLATFORMS:
            logger.info(f"Deleting rgw obc {self.obc_rgw}")
            obcrgw = OCP(kind='ObjectBucketClaim',
                         resource_name=f'{self.obc_rgw}')
            run_cmd(f"oc delete obc/{self.obc_rgw}")
            obcrgw.wait_for_delete(resource_name=f'{self.obc_rgw}',
                                   timeout=300)

        logger.info(f"Deleting mcg obc {self.obc_mcg}")
        obcmcg = OCP(kind='ObjectBucketClaim', resource_name=f'{self.obc_mcg}')
        run_cmd(f"oc delete obc/{self.obc_mcg} -n "
                f"{defaults.ROOK_CLUSTER_NAMESPACE}")
        obcmcg.wait_for_delete(resource_name=f'{self.obc_mcg}', timeout=300)

    def verify_obc(self):
        """
        OBC verification from an external cluster perspective -
        we will check 2 OBCs

        """
        sample = TimeoutSampler(300, 5, self.ceph_cluster.noobaa_health_check)
        sample.wait_for_func_status(True)

    def delete_resources(self):
        """
        Sanity validation - Delete resources (FS and RBD)

        """
        logger.info("Deleting resources as a sanity functional validation")

        self.delete_obc()

        for pod_obj in self.pod_objs:
            pod_obj.delete()
        for pod_obj in self.pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        for pvc_obj in self.pvc_objs:
            pvc_obj.delete()
        for pvc_obj in self.pvc_objs:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)

    @ignore_leftovers
    def create_pvc_delete(self, multi_pvc_factory, project=None):
        """
        Creates and deletes all types of PVCs

        """
        # Create rbd pvcs
        pvc_objs_rbd = create_pvcs(multi_pvc_factory=multi_pvc_factory,
                                   interface='CephBlockPool',
                                   project=project,
                                   status="",
                                   storageclass=None)

        # Create cephfs pvcs
        pvc_objs_cephfs = create_pvcs(multi_pvc_factory=multi_pvc_factory,
                                      interface='CephFileSystem',
                                      project=project,
                                      status="",
                                      storageclass=None)

        all_pvc_to_delete = pvc_objs_rbd + pvc_objs_cephfs

        # Check pvc status
        for pvc_obj in all_pvc_to_delete:
            helpers.wait_for_resource_state(resource=pvc_obj,
                                            state=constants.STATUS_BOUND,
                                            timeout=300)

        # Start deleting PVC
        delete_pvcs(all_pvc_to_delete)

        # Check PVCs are deleted
        for pvc_obj in all_pvc_to_delete:
            pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name)

        logger.info("All PVCs are deleted as expected")

    def obc_put_obj_create_delete(self, mcg_obj, bucket_factory):
        """
        Creates bucket then writes, reads and deletes objects

        """
        bucket_name = bucket_factory(amount=1, interface='OC')[0].name
        self.obj_data = "A string data"

        for i in range(0, 30):
            key = 'Object-key-' + f"{i}"
            logger.info(f"Write, read and delete object with key: {key}")
            assert s3_put_object(mcg_obj, bucket_name, key,
                                 self.obj_data), f"Failed: Put object, {key}"
            assert s3_get_object(mcg_obj, bucket_name,
                                 key), f"Failed: Get object, {key}"
            assert s3_delete_object(mcg_obj, bucket_name,
                                    key), f"Failed: Delete object, {key}"
Example #3
    def test_respin_mcg_pod_and_check_data_integrity_crd(
        self,
        mcg_obj,
        cld_mgr,
        awscli_pod_session,
        namespace_store_factory,
        bucket_factory,
        test_directory_setup,
        mcg_pod,
    ):
        """
        Test writing to a namespace bucket using CRDs and reading directly from AWS.
        Respin one of the MCG pods while the data is being uploaded.
        """

        logger.info("Create the namespace resources and verify health")
        nss_tup = ("oc", {"aws": [(1, self.DEFAULT_REGION)]})
        ns_store = namespace_store_factory(*nss_tup)[0]

        logger.info(
            "Create the namespace bucket on top of the namespace stores")
        bucketclass_dict = {
            "interface": "OC",
            "namespace_policy_dict": {
                "type": "Single",
                "namespacestores": [ns_store],
            },
        }
        logger.info(
            "Create the namespace bucket on top of the namespace resource")
        ns_bucket = bucket_factory(
            amount=1,
            interface=bucketclass_dict["interface"],
            bucketclass=bucketclass_dict,
        )[0].name
        s3_creds = {
            "access_key_id": cld_mgr.aws_client.access_key,
            "access_key": cld_mgr.aws_client.secret_key,
            "endpoint": constants.MCG_NS_AWS_ENDPOINT,
            "region": self.DEFAULT_REGION,
        }
        original_folder = test_directory_setup.origin_dir
        result_folder = test_directory_setup.result_dir
        logger.info("Upload files to NS bucket")
        self.write_files_to_pod_and_upload(
            mcg_obj,
            awscli_pod_session,
            bucket_to_write=ns_bucket,
            original_dir=original_folder,
            amount=3,
        )

        logger.info(f"Respin mcg resource {mcg_pod}")
        noobaa_pods = pod.get_noobaa_pods()
        pod_obj = [noobaa_pod for noobaa_pod in noobaa_pods
                   if noobaa_pod.name.startswith(mcg_pod)][0]
        pod_obj.delete(force=True)
        logger.info("Wait for noobaa pods to come up")
        assert pod_obj.ocp.wait_for_resource(
            condition="Running",
            selector="app=noobaa",
            resource_count=len(noobaa_pods),
            timeout=1000,
        )
        logger.info("Wait for noobaa health to be OK")
        ceph_cluster_obj = CephCluster()
        ceph_cluster_obj.wait_for_noobaa_health_ok()

        logger.info("Read files directly from AWS")
        self.download_files(
            mcg_obj,
            awscli_pod_session,
            bucket_to_read=ns_store.uls_name,
            download_dir=result_folder,
            s3_creds=s3_creds,
        )

        logger.info("Compare between uploaded files and downloaded files")
        assert self.compare_dirs(
            awscli_pod_session,
            origin=original_folder,
            destination=result_folder,
            amount=3,
        )
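
# compare_dirs() above runs the comparison inside the awscli pod. The sketch
# below is a simplified, local-filesystem illustration of the same idea
# (compare two directories by per-file checksums); the function name and the
# MD5-based approach are assumptions, not the ocs-ci helper itself.
import hashlib
import os


def dirs_have_same_content(origin, destination):
    """Return True when both directories hold the same file names and contents."""
    def checksums(directory):
        sums = {}
        for name in sorted(os.listdir(directory)):
            path = os.path.join(directory, name)
            if os.path.isfile(path):
                with open(path, "rb") as f:
                    sums[name] = hashlib.md5(f.read()).hexdigest()
        return sums

    return checksums(origin) == checksums(destination)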
Example #4
    def test_upgrade_ocp(self):
        """
        Tests OCS stability when upgrading OCP

        """

        ceph_cluster = CephCluster()
        with CephHealthMonitor(ceph_cluster):

            ocp_channel = config.UPGRADE.get('ocp_channel',
                                             ocp.get_ocp_upgrade_channel())
            ocp_upgrade_version = config.UPGRADE.get('ocp_upgrade_version')
            if not ocp_upgrade_version:
                ocp_upgrade_version = get_latest_ocp_version(
                    channel=ocp_channel)
                ocp_arch = config.UPGRADE['ocp_arch']
                target_image = f"{ocp_upgrade_version}-{ocp_arch}"
            elif ocp_upgrade_version.endswith(".nightly"):
                target_image = expose_ocp_version(ocp_upgrade_version)

            logger.info(f"Target image; {target_image}")

            image_path = config.UPGRADE['ocp_upgrade_path']
            cluster_operators = ocp.get_all_cluster_operators()
            logger.info(f" oc version: {ocp.get_current_oc_version()}")
            # Verify Upgrade subscription channel:
            ocp.patch_ocp_upgrade_channel(ocp_channel)
            for sampler in TimeoutSampler(timeout=250,
                                          sleep=15,
                                          func=ocp.verify_ocp_upgrade_channel,
                                          channel_variable=ocp_channel):
                if sampler:
                    logger.info(f"OCP Channel:{ocp_channel}")
                    break

            # Upgrade OCP
            logger.info(f"full upgrade path: {image_path}:{target_image}")
            ocp.upgrade_ocp(image=target_image, image_path=image_path)

            # Wait for upgrade
            for ocp_operator in cluster_operators:
                logger.info(f"Checking upgrade status of {ocp_operator}:")
                # ############ Workaround for issue 2624 #######
                name_changed_between_versions = (
                    'service-catalog-apiserver',
                    'service-catalog-controller-manager')
                if ocp_operator in name_changed_between_versions:
                    logger.info(f"{ocp_operator} upgrade will not be verified")
                    continue
                # ############ End of Workaround ###############
                ver = ocp.get_cluster_operator_version(ocp_operator)
                logger.info(f"current {ocp_operator} version: {ver}")
                for sampler in TimeoutSampler(
                        timeout=2700,
                        sleep=60,
                        func=ocp.confirm_cluster_operator_version,
                        target_version=target_image,
                        cluster_operator=ocp_operator):
                    if sampler:
                        logger.info(f"{ocp_operator} upgrade completed!")
                        break
                    else:
                        logger.info(
                            f"{ocp_operator} upgrade has not completed yet!")

            # post upgrade validation: check cluster operator status
            cluster_operators = ocp.get_all_cluster_operators()
            for ocp_operator in cluster_operators:
                logger.info(f"Checking cluster status of {ocp_operator}")
                for sampler in TimeoutSampler(
                        timeout=2700,
                        sleep=60,
                        func=ocp.verify_cluster_operator_status,
                        cluster_operator=ocp_operator):
                    if sampler:
                        break
                    else:
                        logger.info(f"{ocp_operator} status is not valid")
            # Post upgrade validation: check cluster version status
            logger.info("Checking clusterversion status")
            for sampler in TimeoutSampler(
                    timeout=900,
                    sleep=15,
                    func=ocp.validate_cluster_version_status):
                if sampler:
                    logger.info("Upgrade Completed Successfully!")
                    break
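
# In the version-selection logic above, target_image is only assigned when no
# explicit ocp_upgrade_version is given or when the version ends with ".nightly".
# The hypothetical helper below restates that resolution as a pure function and
# makes the remaining branch explicit; the function name, parameters and the
# final fallback are assumptions, not part of the test above.
def resolve_target_image(ocp_upgrade_version, ocp_channel, ocp_arch,
                         latest_version_lookup, expose_nightly):
    """Return the OCP target image string to upgrade to."""
    if not ocp_upgrade_version:
        # No explicit version: use the latest version available in the channel.
        return f"{latest_version_lookup(ocp_channel)}-{ocp_arch}"
    if ocp_upgrade_version.endswith(".nightly"):
        # Nightly builds have to be exposed/resolved to a pullable image first.
        return expose_nightly(ocp_upgrade_version)
    # Assumed fallback for an explicitly given released version.
    return f"{ocp_upgrade_version}-{ocp_arch}"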
Example #5
    def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
        """
        This is a basic fio perf test

        """

        # Deploy the ripsaw operator
        log.info("Deploying ripsaw operator")
        ripsaw.apply_crd('resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml')
        if interface == 'CephBlockPool':
            sc = constants.CEPHBLOCKPOOL_SC
        else:
            sc = constants.CEPHFILESYSTEM_SC

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        if 'elasticsearch' in fio_cr['spec']:
            backup_es = fio_cr['spec']['elasticsearch']
        else:
            log.warning(
                'Elastic Search information does not exist in the YAML file')
            fio_cr['spec']['elasticsearch'] = {}
            backup_es = {}

        # Use the internally defined elastic-search server in the test - if it exists
        if es:
            fio_cr['spec']['elasticsearch'] = {
                'server': es.get_ip(),
                'port': es.get_port()
            }

        # Setting the data set to 40% of the total storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()
        total_data_set = int(ceph_capacity * 0.4)
        filesize = int(fio_cr['spec']['workload']['args']['filesize'].replace(
            'GiB', ''))
        # To make sure the number of App pods will not be more than 50 in case
        # of a large data set, change the size of the file each pod will work on
        if total_data_set > 500:
            filesize = int(ceph_capacity * 0.008)
            fio_cr['spec']['workload']['args']['filesize'] = f'{filesize}GiB'
            # make sure that the storage size is larger than the file size
            fio_cr['spec']['workload']['args'][
                'storagesize'] = f'{int(filesize * 1.2)}Gi'
        fio_cr['spec']['workload']['args']['servers'] = int(total_data_set /
                                                            filesize)
        log.info(f'Total Data set to work on is : {total_data_set} GiB')
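        # Worked example with hypothetical numbers: for ceph_capacity = 2000 GiB,
        # total_data_set = int(2000 * 0.4) = 800 GiB (> 500), so
        # filesize = int(2000 * 0.008) = 16 GiB, storagesize = int(16 * 1.2) = 19 Gi
        # and servers = int(800 / 16) = 50 - the intended cap on App pods.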

        environment = get_environment_info()
        if not environment['user'] == '':
            fio_cr['spec']['test_user'] = environment['user']
        fio_cr['spec']['clustername'] = environment['clustername']

        log.debug(f'Environment information is : {environment}')

        fio_cr['spec']['workload']['args']['storageclass'] = sc
        if io_pattern == 'sequential':
            fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
            fio_cr['spec']['workload']['args']['iodepth'] = 1
        log.info(f'The FIO CR file is {fio_cr}')
        fio_cr_obj = OCS(**fio_cr)
        fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                      'fio-client',
                                      constants.RIPSAW_NAMESPACE):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Getting the start time of the test
        start_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())

        # Getting the UUID from inside the benchmark pod
        uuid = ripsaw.get_uuid(fio_client_pod)
        # Setting back the original elastic-search information
        fio_cr['spec']['elasticsearch'] = backup_es

        full_results = FIOResultsAnalyse(uuid, fio_cr)

        # Initialize the results doc file.
        for key in environment:
            full_results.add_key(key, environment[key])

        # Setting the global parameters of the test
        full_results.add_key('io_pattern', io_pattern)
        full_results.add_key('dataset', f'{total_data_set}GiB')
        full_results.add_key('file_size',
                             fio_cr['spec']['workload']['args']['filesize'])
        full_results.add_key('servers',
                             fio_cr['spec']['workload']['args']['servers'])
        full_results.add_key('samples',
                             fio_cr['spec']['workload']['args']['samples'])
        full_results.add_key('operations',
                             fio_cr['spec']['workload']['args']['jobs'])
        full_results.add_key('block_sizes',
                             fio_cr['spec']['workload']['args']['bs'])
        full_results.add_key('io_depth',
                             fio_cr['spec']['workload']['args']['iodepth'])
        full_results.add_key('jobs',
                             fio_cr['spec']['workload']['args']['numjobs'])
        full_results.add_key(
            'runtime', {
                'read': fio_cr['spec']['workload']['args']['read_runtime'],
                'write': fio_cr['spec']['workload']['args']['write_runtime']
            })
        full_results.add_key(
            'storageclass', fio_cr['spec']['workload']['args']['storageclass'])
        full_results.add_key('vol_size',
                             fio_cr['spec']['workload']['args']['storagesize'])

        # Wait for the fio pod to initialize and complete
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind='pod')
        pod_obj.wait_for_resource(
            condition='Completed',
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        # Getting the end time of the test
        end_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())
        full_results.add_key('test_time', {
            'start': start_time,
            'end': end_time
        })

        output = run_cmd(f'oc logs {fio_client_pod}')
        log.info(f'The Test log is : {output}')

        if 'Fio failed to execute' not in output:
            log.info("FIO has completed successfully")
        else:
            log.warning("FIO failed to complete")

        # Clean up fio benchmark
        log.info("Deleting FIO benchmark")
        fio_cr_obj.delete()

        log.debug(f'Full results is : {full_results.results}')

        # If an internal ES exists, copy all data from the internal to the main ES
        if es:
            log.info('Copy all data from Internal ES to Main ES')
            es._copy(full_results.es)
        # Adding this sleep between the copy and the analyzing of the results
        # since sometimes the results of the read (just after write) are empty
        time.sleep(30)
        full_results.analyze_results()  # Analyze the results
        # Writing the analyzed test results to the Elastic-Search server
        full_results.es_write()
        full_results.codespeed_push()  # Push results to codespeed
        # Creating full link to the results on the ES server
        log.info(f'The Result can be found at : {full_results.results_link()}')
Example #6
class PASTest(BaseTest):
    """
    Base class for QPAS team - Performance and Scale tests

    This class contains functions which are used by performance and scale tests,
    and can also be used by E2E tests which use the benchmark-operator (ripsaw)
    """
    def setup(self):
        """
        Setting up the environment for each performance and scale test

        Args:
            name (str): The test name that will be used in the performance dashboard
        """
        log.info("Setting up test environment")
        self.es = None  # placeholder for the in-cluster elasticsearch deployment
        self.es_backup = None  # placeholder for the elasticsearch backup
        self.main_es = None  # placeholder for the main elasticsearch object
        self.benchmark_obj = None  # placeholder for the benchmark object
        self.client_pod = None  # placeholder for the client pod object
        self.dev_mode = config.RUN["cli_params"].get("dev_mode")
        self.pod_obj = OCP(kind="pod", namespace=benchmark_operator.BMO_NAME)
        self.initialize_test_crd()

        # Placeholder for the test results file (all sub-tests together)
        self.results_file = ""

        # All tests need a uuid for the ES results; the benchmark-operator base test
        # will override it with the uuid pulled from the benchmark pod
        self.uuid = uuid4().hex

        # Getting the full path for the test logs
        self.full_log_path = os.environ.get("PYTEST_CURRENT_TEST").split(
            " ")[0]
        self.full_log_path = (self.full_log_path.replace("::", "/").replace(
            "[", "-").replace("]", ""))
        self.full_log_path = os.path.join(ocsci_log_path(), self.full_log_path)
        log.info(f"Logs file path name is : {self.full_log_path}")

        # Getting the results path as a list
        self.results_path = self.full_log_path.split("/")
        self.results_path.pop()

        # List of test(s) for checking the results
        self.workloads = []

        # Collecting all Environment configuration Software & Hardware
        # for the performance report.
        self.environment = get_environment_info()
        self.environment["clusterID"] = get_running_cluster_id()

        self.ceph_cluster = CephCluster()
        self.used_capacity = self.get_cephfs_data()

        self.get_osd_info()

        self.get_node_info(node_type="master")
        self.get_node_info(node_type="worker")

    def teardown(self):
        if hasattr(self, "operator"):
            self.operator.cleanup()

        now_data = self.get_cephfs_data()
        # Wait 1 minute for the backend deletion to actually start.
        log.info("Waiting for Ceph to finish cleaning up")
        time.sleep(60)

        # Query the storage usage every 2 minutes; if there is no difference between two
        # samples, the backend cleanup is done.
        still_going_down = True
        while still_going_down:
            new_data = self.get_cephfs_data()
            # no deletion operation is in progress
            if abs(now_data - new_data) < 1:
                still_going_down = False
                # up to 2% inflation of usage is acceptable
                if new_data > (self.used_capacity * 1.02):
                    log.warning(
                        f"usage capacity after the test ({new_data:.2f} GiB) "
                        f"is more then in the begining of it ({self.used_capacity:.2f} GiB)"
                    )
            else:
                log.info(f"Last usage : {now_data}, Current usage {new_data}")
                now_data = new_data
                log.info("Waiting for Ceph to finish cleaning up")
                time.sleep(120)
                still_going_down = True
        log.info("Storage usage was cleandup")

    def initialize_test_crd(self):
        """
        Initializing the test CRD file.
        This includes the Elasticsearch info, the cluster name and the user name which runs the test
        """
        self.crd_data = {
            "spec": {
                "test_user":
                "******",  # place holde only will be change in the test.
                "clustername":
                "test_cluster",  # place holde only will be change in the test.
                "elasticsearch": {
                    "server":
                    config.PERF.get("production_es_server"),
                    "port":
                    config.PERF.get("production_es_port"),
                    "url":
                    f"http://{config.PERF.get('production_es_server')}:{config.PERF.get('production_es_port')}",
                },
            }
        }
        # during development use the dev ES so the data in the Production ES will be clean.
        if self.dev_mode:
            self.crd_data["spec"]["elasticsearch"] = {
                "server":
                config.PERF.get("dev_es_server"),
                "port":
                config.PERF.get("dev_es_port"),
                "url":
                f"http://{config.PERF.get('dev_es_server')}:{config.PERF.get('dev_es_port')}",
            }

    def create_new_pool(self, pool_name):
        """
        Creating a new storage pool for RBD / CephFS to use in a test so it can be
        deleted at the end of the test for fast cleanup

        Args:
            pool_name (str):  the name of the pool to create

        """
        if self.interface == constants.CEPHBLOCKPOOL:
            self.ceph_cluster.create_new_blockpool(pool_name=pool_name)
            self.ceph_cluster.set_pgs(poolname=pool_name, pgs=128)
        elif self.interface == constants.CEPHFILESYSTEM:
            self.ceph_cluster.create_new_filesystem(fs_name=pool_name)
            self.ceph_cluster.toolbox.exec_ceph_cmd(
                f"ceph fs subvolumegroup create {pool_name} csi")
            self.ceph_cluster.set_pgs(poolname=f"{pool_name}-data0", pgs=128)

        self.ceph_cluster.set_target_ratio(
            poolname="ocs-storagecluster-cephblockpool", ratio=0.24)
        self.ceph_cluster.set_target_ratio(
            poolname="ocs-storagecluster-cephfilesystem-data0", ratio=0.24)
        return

    def delete_ceph_pool(self, pool_name):
        """
        Delete Storage pool (RBD / CephFS) that was created for the test for
        fast cleanup.

        Args:
            pool_name (str):  the name of the pool to be deleted

        """
        if self.interface == constants.CEPHBLOCKPOOL:
            self.ceph_cluster.delete_blockpool(pool_name=pool_name)
        elif self.interface == constants.CEPHFILESYSTEM:
            self.ceph_cluster.delete_filesystem(fs_name=pool_name)

        self.ceph_cluster.set_target_ratio(
            poolname="ocs-storagecluster-cephblockpool", ratio=0.49)
        self.ceph_cluster.set_target_ratio(
            poolname="ocs-storagecluster-cephfilesystem-data0", ratio=0.49)
        return

    def get_cephfs_data(self):
        """
        Look through ceph pods and find space usage on all ceph pools

        Returns:
            int: total used capacity in GiB.
        """
        ceph_status = self.ceph_cluster.toolbox.exec_ceph_cmd(
            ceph_cmd="ceph df")
        total_used = 0
        for pool in ceph_status["pools"]:
            total_used += pool["stats"]["bytes_used"]
        return total_used / constants.GB

    def get_osd_info(self):
        """
        Getting the OSDs' information and updating the main environment
        dictionary.

        """
        ct_pod = pod.get_ceph_tools_pod()
        osd_info = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df")
        self.environment["osd_size"] = osd_info.get("nodes")[0].get(
            "crush_weight")
        self.environment["osd_num"] = len(osd_info.get("nodes"))
        self.environment["total_capacity"] = osd_info.get("summary").get(
            "total_kb_avail")
        self.environment["ocs_nodes_num"] = len(node.get_ocs_nodes())

    def get_node_info(self, node_type="master"):
        """
        Getting the node type hardware information and updating the main environment
        dictionary.

        Args:
            node_type (str): the node type to collect data about,
              can be : master / worker - the default is master

        """
        if node_type == "master":
            nodes = node.get_master_nodes()
        elif node_type == "worker":
            nodes = node.get_worker_nodes()
        else:
            log.warning(f"Node type ({node_type}) is invalid")
            return

        oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        self.environment[f"{node_type}_nodes_num"] = len(nodes)
        self.environment[
            f"{node_type}_nodes_cpu_num"] = oc_cmd.exec_oc_debug_cmd(
                node=nodes[0],
                cmd_list=["lscpu | grep '^CPU(s):' | awk '{print $NF}'"],
            ).rstrip()
        self.environment[
            f"{node_type}_nodes_memory"] = oc_cmd.exec_oc_debug_cmd(
                node=nodes[0],
                cmd_list=["free | grep Mem | awk '{print $2}'"]).rstrip()

    def deploy_benchmark_operator(self):
        """
        Deploy the benchmark operator

        """
        self.operator = benchmark_operator.BenchmarkOperator()
        self.operator.deploy()

    def es_info_backup(self, elasticsearch):
        """
        Saving the Original elastic-search IP and PORT - if defined in yaml

        Args:
            elasticsearch (obj): elasticsearch object

        """

        self.crd_data["spec"]["elasticsearch"] = {}

        # for development mode use the Dev ES server
        if self.dev_mode and config.PERF.get("dev_lab_es"):
            log.info("Using the development ES server")
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("dev_es_server"),
                "port": config.PERF.get("dev_es_port"),
                "url":
                f"http://{config.PERF.get('dev_es_server')}:{config.PERF.get('dev_es_port')}",
                "parallel": True,
            }

        # for production mode use the Lab ES server
        if not self.dev_mode and config.PERF.get("production_es"):
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("production_es_server"),
                "port": config.PERF.get("production_es_port"),
                "url":
                f"http://{config.PERF.get('production_es_server')}:{config.PERF.get('production_es_port')}",
                "parallel": True,
            }

        # backup the Main ES info (if exists)
        if not self.crd_data["spec"]["elasticsearch"] == {}:
            self.backup_es = self.crd_data["spec"]["elasticsearch"]
            log.info(
                f"Creating object for the Main ES server on {self.backup_es['url']}"
            )
            self.main_es = Elasticsearch([self.backup_es["url"]],
                                         verify_certs=True)
        else:
            log.warning(
                "Elastic Search information does not exists for this test")

        # Use the internally defined elastic-search server in the test - if it exists
        if elasticsearch:

            if not isinstance(elasticsearch, dict):
                # elasticsearch is an internally deployed server (obj)
                ip = elasticsearch.get_ip()
                port = elasticsearch.get_port()
            else:
                # elasticsearch is an existing server (dict)
                ip = elasticsearch.get("server")
                port = elasticsearch.get("port")

            self.crd_data["spec"]["elasticsearch"] = {
                "server": ip,
                "port": port,
                "url": f"http://{ip}:{port}",
                "parallel": True,
            }
            log.info(
                f"Going to use the ES : {self.crd_data['spec']['elasticsearch']}"
            )
        elif config.PERF.get("internal_es_server"):
            # use an in-cluster elastic-search (not deployed by the test)
            self.crd_data["spec"]["elasticsearch"] = {
                "server": config.PERF.get("internal_es_server"),
                "port": config.PERF.get("internal_es_port"),
                "url":
                f"http://{config.PERF.get('internal_es_server')}:{config.PERF.get('internal_es_port')}",
                "parallel": True,
            }

    def set_storageclass(self, interface):
        """
        Setting the benchmark CRD storageclass

        Args:
            interface (str): The interface which will be used in the test

        """
        if interface == constants.CEPHBLOCKPOOL:
            storageclass = constants.DEFAULT_STORAGECLASS_RBD
        else:
            storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
        log.info(f"Using [{storageclass}] Storageclass")
        self.crd_data["spec"]["workload"]["args"][
            "storageclass"] = storageclass

    def get_env_info(self):
        """
        Getting the environment information and updating the workload RC if
        necessary.

        """
        if not self.environment["user"] == "":
            self.crd_data["spec"]["test_user"] = self.environment["user"]
        else:
            # since the full results object needs this parameter, initialize it from the CR file
            self.environment["user"] = self.crd_data["spec"]["test_user"]
        self.crd_data["spec"]["clustername"] = self.environment["clustername"]

        log.debug(f"Environment information is : {self.environment}")

    def deploy_and_wait_for_wl_to_start(self, timeout=300, sleep=20):
        """
        Deploy the workload and wait until it starts working

        Args:
            timeout (int): time in seconds to wait until the benchmark starts
            sleep (int): Sleep interval seconds

        """
        log.debug(f"The {self.benchmark_name} CR file is {self.crd_data}")
        self.benchmark_obj = OCS(**self.crd_data)
        self.benchmark_obj.create()

        # This time is only for reporting - when the benchmark started.
        self.start_time = self.get_time()

        # Wait for benchmark client pod to be created
        log.info(f"Waiting for {self.client_pod_name} to Start")
        for bm_pod in TimeoutSampler(
                timeout,
                sleep,
                get_pod_name_by_pattern,
                self.client_pod_name,
                benchmark_operator.BMO_NAME,
        ):
            try:
                if bm_pod[0] is not None:
                    self.client_pod = bm_pod[0]
                    break
            except IndexError:
                log.info("Bench pod is not ready yet")
        # Sleeping for 15 sec for the client pod to be fully accessible
        time.sleep(15)
        log.info(f"The benchmark pod {self.client_pod_name} is Running")

    def wait_for_wl_to_finish(self, timeout=18000, sleep=300):
        """
        Waiting until the workload is finished and getting the test log

        Args:
            timeout (int): time in seconds to wait until the benchmark finishes
            sleep (int): Sleep interval seconds

        Raises:
            Exception: too many restarts of the test.
            ResourceWrongStatusException: the test Failed / Errored
            TimeoutExpiredError: the test did not complete on time.

        """
        log.info(f"Waiting for {self.client_pod_name} to complete")

        Finished = 0
        restarts = 0
        total_time = timeout
        while not Finished and total_time > 0:
            results = run_oc_command(
                "get pod --no-headers -o custom-columns=:metadata.name,:status.phase",
                namespace=benchmark_operator.BMO_NAME,
            )
            (fname, status) = ["", ""]
            for name in results:
                # looking for the pod which runs the benchmark (not the IO)
                # this pod contains `client` in its name, and there is only one
                # pod like this; other pods have `server` in their name.
                (fname, status) = name.split()
                if re.search("client", fname):
                    break
                else:
                    (fname, status) = ["", ""]

            if fname == "":  # there is no `client` pod !
                err_msg = f"{self.client_pod} Failed to run !!!"
                log.error(err_msg)
                raise Exception(err_msg)

            if not fname == self.client_pod:
                # The client pod name is different from previous check, it was restarted
                log.info(
                    f"The pod {self.client_pod} was restart. the new client pod is {fname}"
                )
                self.client_pod = fname
                restarts += 1
                # in case of restarting the benchmark, reset the timeout as well
                total_time = timeout

            if restarts > 3:  # we are tolerating only 3 restarts
                err_msg = f"Too much restarts of the benchmark ({restarts})"
                log.error(err_msg)
                raise Exception(err_msg)

            if status == "Succeeded":
                # Getting the end time of the benchmark - for reporting.
                self.end_time = self.get_time()
                self.test_logs = self.pod_obj.exec_oc_cmd(
                    f"logs {self.client_pod}", out_yaml_format=False)
                log.info(f"{self.client_pod} completed successfully")
                Finished = 1
            elif (status != constants.STATUS_RUNNING
                  and status != constants.STATUS_PENDING):
                # if the benchmark pod is not in Running state (and not Completed/Pending),
                # no need to wait for timeout.
                # Note: the pod can be in pending state in case of restart.
                err_msg = f"{self.client_pod} Failed to run - ({status})"
                log.error(err_msg)
                raise exceptions.ResourceWrongStatusException(
                    self.client_pod,
                    describe_out=err_msg,
                    column="Status",
                    expected="Succeeded",
                    got=status,
                )
            else:
                log.info(
                    f"{self.client_pod} is in the {status} state, waiting for the Succeeded state."
                    f" Waiting another {sleep} sec. for the benchmark to complete")
                time.sleep(sleep)
                total_time -= sleep

        if not Finished:
            err_msg = (f"{self.client_pod} did not completed on time, "
                       f"maybe timeout ({timeout}) need to be increase")
            log.error(err_msg)
            raise exceptions.TimeoutExpiredError(self.client_pod,
                                                 custom_message=err_msg)

        # Saving the benchmark internal log into a file at the logs directory
        log_file_name = f"{self.full_log_path}/test-pod.log"
        try:
            with open(log_file_name, "w") as f:
                f.write(self.test_logs)
            log.info(f"The Test log can be found at : {log_file_name}")
        except Exception:
            log.warning(f"Cannot write the log to the file {log_file_name}")
        log.info(f"The {self.benchmark_name} benchmark complete")

    def copy_es_data(self, elasticsearch):
        """
        Copy data from Internal ES (if exists) to the main ES

        Args:
            elasticsearch (obj): elasticsearch object (if it exists)

        """
        log.info(f"In copy_es_data Function - {elasticsearch}")
        if elasticsearch:
            log.info("Copy all data from Internal ES to Main ES")
            log.info("Dumping data from the Internal ES to tar ball file")
            elasticsearch.dumping_all_data(self.full_log_path)
            es_connection = self.backup_es
            es_connection["host"] = es_connection.pop("server")
            es_connection.pop("url")
            if elasticsearch_load(self.main_es, self.full_log_path):
                # Adding this sleep between the copy and the analyzing of the results
                # since sometimes the results of the read (just after write) are empty
                time.sleep(10)
                log.info(
                    f"All raw data for tests results can be found at : {self.full_log_path}"
                )
                return True
            else:
                log.warning("Cannot upload data into the Main ES server")
                return False

    def read_from_es(self, es, index, uuid):
        """
        Reading all results from elasticsearch server

        Args:
            es (dict): dictionary with elasticsearch info  {server, port}
            index (str): the index name to read from the elasticsearch server
            uuid (str): the test UUID to find in the elasticsearch server

        Returns:
            list : list of all results

        """

        con = Elasticsearch([{"host": es["server"], "port": es["port"]}])
        query = {"size": 1000, "query": {"match": {"uuid": uuid}}}

        try:
            results = con.search(index=index, body=query)
            full_data = []
            for res in results["hits"]["hits"]:
                full_data.append(res["_source"])
            return full_data

        except Exception as e:
            log.warning(f"{index} Not found in the Internal ES. ({e})")
            return []

    def es_connect(self):
        """
        Create elasticsearch connection to the server

        Return:
            bool : True if there is a connection to the ES, False if not.

        """

        OK = True  # the return value
        try:
            log.info(
                f"try to connect the ES : {self.es['server']}:{self.es['port']}"
            )
            self.es_con = Elasticsearch([{
                "host": self.es["server"],
                "port": self.es["port"]
            }])
        except Exception:
            log.error(f"Cannot connect to ES server {self.es}")
            return False

        # Testing the connection to the elastic-search
        if not self.es_con.ping():
            log.error(f"Cannot connect to ES server {self.es}")
            OK = False

        return OK

    def get_kibana_indexid(self, server, name):
        """
        Get the kibana Index ID by its name.

        Args:
            server (str): the IP (or name) of the Kibana server
            name (str): the name of the index

        Returns:
            str : the index ID of the given name
                  return None if the index does not exist.

        """

        port = 5601
        http_link = f"http://{server}:{port}/api/saved_objects"
        search_string = f"_find?type=index-pattern&search_fields=title&search='{name}'"
        log.info(f"Connecting to Kibana {server} on port {port}")
        try:
            res = requests.get(f"{http_link}/{search_string}")
            res = json.loads(res.content.decode())
            for ind in res.get("saved_objects"):
                if ind.get("attributes").get("title") in [name, f"{name}*"]:
                    log.info(
                        f"The Kibana indexID for {name} is {ind.get('id')}")
                    return ind.get("id")
        except esexp.ConnectionError:
            log.warning("Cannot connect to Kibana server {}:{}".format(
                server, port))
        log.warning(f"Can not find the Kibana index : {name}")
        return None

    def write_result_to_file(self, res_link):
        """
        Write the results link into a file, to combine all sub-test results
        together in one file, so they can be easily pushed into the performance dashboard

        Args:
            res_link (str): http link to the test results in the ES server

        """
        if not os.path.exists(self.results_path):
            os.makedirs(self.results_path)
        self.results_file = os.path.join(self.results_path, "all_results.txt")

        log.info(f"Try to push results into : {self.results_file}")
        try:
            with open(self.results_file, "a+") as f:
                f.write(f"{res_link}\n")
        except FileNotFoundError:
            log.info("The file does not exist, so creating a new one.")
            with open(self.results_file, "w+") as f:
                f.write(f"{res_link}\n")
        except OSError as err:
            log.error(f"OS error: {err}")

    @staticmethod
    def get_time(time_format=None):
        """
        Getting the current GMT time in a specific format for the ES report,
        or for seeking in the containers log

        Args:
            time_format (str): which time format to return - None / CSI

        Returns:
            str : current date and time in formatted way

        """
        formated = "%Y-%m-%dT%H:%M:%SGMT"
        if time_format and time_format.lower() == "csi":
            formated = "%Y-%m-%dT%H:%M:%SZ"

        return time.strftime(formated, time.gmtime())

    def check_tests_results(self):
        """
        Check that all sub-tests (test multiplication by parameters) finished and
        pushed the data to the ElasticSearch server.
        It also generates the ES link to push into the performance dashboard.
        """

        es_links = []
        try:
            with open(self.results_file, "r") as f:
                data = f.read().split("\n")
            data.pop()  # remove the last empty element
            if len(data) != self.number_of_tests:
                log.error("Not all tests finished")
                raise exceptions.BenchmarkTestFailed()
            else:
                log.info(
                    "All test finished OK, and the results can be found at :")
                for res in data:
                    log.info(res)
                    es_links.append(res)
        except OSError as err:
            log.error(f"OS error: {err}")
            raise err

        self.es_link = ",".join(es_links)

    def push_to_dashboard(self, test_name):
        """
        Pushing the test results into the performance dashboard, if it exists

        Args:
            test_name (str): the test name as defined in the performance dashboard

        Returns:
            None if pushing the results to the dashboard failed

        """

        try:
            db = PerfDash()
        except MissingRequiredConfigKeyError as ex:
            log.error(
                f"Results cannot be pushed to the performance dashboard, no connection [{ex}]"
            )
            return None

        log.info(f"Full version is : {self.environment.get('ocs_build')}")
        version = self.environment.get("ocs_build").split("-")[0]
        try:
            build = self.environment.get("ocs_build").split("-")[1]
            build = build.split(".")[0]
        except Exception:
            build = "GA"

        # Getting the topology from the cluster
        az = node.get_odf_zone_count()
        if az == 0:
            az = 1
        topology = f"{az}-AZ"

        # Check if it is Arbiter cluster
        my_obj = OCP(kind="StorageCluster",
                     namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        arbiter = (my_obj.data.get("items")[0].get("spec").get("arbiter").get(
            "enable", False))

        if arbiter:
            topology = "Strech-Arbiter"

        # Check if run on LSO
        try:
            ns = OCP(kind="namespace",
                     resource_name=defaults.LOCAL_STORAGE_NAMESPACE)
            ns.get()
            platform = f"{self.environment.get('platform')}-LSO"
        except Exception:
            platform = self.environment.get("platform")

        # Check if encrypted cluster
        encrypt = (
            my_obj.data.get("items")[0].get("spec").get("encryption").get(
                "enable", False))
        kms = (my_obj.data.get("items")[0].get("spec").get("encryption").get(
            "kms").get("enable", False))
        if kms:
            platform = f"{platform}-KMS"
        elif encrypt:
            platform = f"{platform}-Enc"

        # Check the base storageclass on AWS
        if self.environment.get("platform").upper() == "AWS":
            osd_pod_list = pod.get_osd_pods()
            osd_pod = osd_pod_list[0].pod_data["metadata"]["name"]
            osd_pod_obj = OCP(
                kind="POD",
                resource_name=osd_pod,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            log.info(f"The First OSD pod nams is {osd_pod}")

            osd_pvc_name = osd_pod_obj.get(
            )["spec"]["initContainers"][0]["volumeDevices"][0]["name"]
            log.info(f"The First OSD name is : {osd_pvc_name}")
            osd_pvc_obj = OCP(
                kind="PersistentVolumeClaim",
                resource_name=osd_pvc_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )

            odf_back_storage = osd_pvc_obj.get()["spec"]["storageClassName"]
            log.info(
                f"The ODF deployment use {odf_back_storage} as back storage")
            if odf_back_storage != "gp2":
                platform = f"{platform}-{odf_back_storage}"

        if self.dev_mode:
            port = "8181"
        else:
            port = "8080"

        try:
            log.info("Trying to push :"
                     f"version={version},"
                     f"build={build},"
                     f"platform={platform},"
                     f"topology={topology},"
                     f"test={test_name},"
                     f"eslink={self.es_link}, logfile=None")

            db.add_results(
                version=version,
                build=build,
                platform=platform,
                topology=topology,
                test=test_name,
                eslink=self.es_link,
                logfile=None,
            )
            resultslink = (f"http://{db.creds['host']}:{port}/index.php?"
                           f"version1={db.get_version_id(version)}"
                           f"&build1={db.get_build_id(version, build)}"
                           f"&platform1={db.get_platform_id(platform)}"
                           f"&az_topology1={db.get_topology_id(topology)}"
                           f"&test_name%5B%5D={db.get_test_id(test_name)}"
                           "&submit=Choose+options")
            log.info(f"Full results report can be found at : {resultslink}")
        except Exception as ex:
            log.error(
                f"Cannot push results to the performance dashboard! [{ex}]")

        db.cleanup()
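
For clarity, a worked example of the ocs_build parsing done at the top of this method (the build string itself is hypothetical):

ocs_build = "4.8.0-456.ci"                        # hypothetical full build string
version = ocs_build.split("-")[0]                 # -> "4.8.0"
try:
    build = ocs_build.split("-")[1].split(".")[0]   # -> "456"
except IndexError:
    build = "GA"                                  # no build suffix means a released (GA) build
print(version, build)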

    def add_test_to_results_check(self, test, test_count, test_name):
        """
        Adding test information to the list of tests whose results we want to
        check and push to the dashboard.

        Args:
            test (str): the name of the test function that we want to check
            test_count (int): number of test(s) that need to run - according to parametrize
            test_name (str): the test name in the Performance dashboard

        """
        self.workloads.append({
            "name": test,
            "tests": test_count,
            "test_name": test_name
        })
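
A minimal, self-contained sketch of how the workloads list is built up (the test and dashboard names here are hypothetical, and the real method lives on the performance base class):

workloads = []

def add_test_to_results_check(test, test_count, test_name):
    workloads.append({"name": test, "tests": test_count, "test_name": test_name})

add_test_to_results_check(
    test="test_pvc_multiple_clone_performance",
    test_count=2,                     # e.g. one run per interface (CephFS / RBD)
    test_name="PVC Multiple Clones",
)
print(workloads)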

    def check_results_and_push_to_dashboard(self):
        """
        Checking that all test(s) finished OK, and pushing
        the results into the performance dashboard

        """

        for wl in self.workloads:
            self.number_of_tests = wl["tests"]

            self.results_file = os.path.join("/", *self.results_path,
                                             wl["name"], "all_results.txt")
            log.info(
                f"Check results for [{wl['name']}] in : {self.results_file}")
            self.check_tests_results()
            self.push_to_dashboard(test_name=wl["test_name"])

    def create_test_project(self):
        """
        Creating new project (namespace) for performance test
        """
        self.namespace = "pas-test-namespace"
        log.info(f"Creating new namespace ({self.namespace}) for the test")
        try:
            self.proj = helpers.create_project(project_name=self.namespace)
        except CommandFailed as ex:
            if "(AlreadyExists)" in str(ex):
                log.warning("The namespace already exists !")
            log.error("Cannot create new project")
            raise CommandFailed(f"{self.namespace} was not created")

    def delete_test_project(self):
        """
        Deleting the performance test project (namespace)
        """
        log.info(f"Deleting the test namespace : {self.namespace}")
        switch_to_default_rook_cluster_project()
        try:
            self.proj.delete(resource_name=self.namespace)
            self.proj.wait_for_delete(resource_name=self.namespace,
                                      timeout=60,
                                      sleep=10)
        except CommandFailed:
            log.error(f"Cannot delete project {self.namespace}")
            raise CommandFailed(f"{self.namespace} was not created")

    def set_results_path_and_file(self, func_name):
        """
        Setting the results_path and results_file parameters for a specific test

        Args:
            func_name (str): the name of the test function
        """

        self.results_path = os.path.join("/", *self.results_path, func_name)
        self.results_file = os.path.join(self.results_path, "all_results.txt")
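
The path-building idiom above can be illustrated in isolation (the directory names below are made up):

import os

# results_path is kept as a list of path components and re-joined per test
results_path = ["tmp", "perf-logs"]            # made-up base components
results_path = os.path.join("/", *results_path, "test_pvc_clone")
results_file = os.path.join(results_path, "all_results.txt")
print(results_file)                            # /tmp/perf-logs/test_pvc_clone/all_results.txt
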
Example No. 7
    def test_upgrade_ocp(self):
        """
        Tests OCS stability when upgrading OCP

        """

        ceph_cluster = CephCluster()
        with CephHealthMonitor(ceph_cluster):

            # ocp_channel is needed later for the channel verification, even
            # when an explicit upgrade version is given
            ocp_channel = config.UPGRADE['ocp_channel']
            ocp_upgrade_version = config.UPGRADE.get('ocp_upgrade_version')
            if not ocp_upgrade_version:
                ocp_upgrade_version = get_latest_ocp_version(
                    channel=ocp_channel)
                ocp_arch = config.UPGRADE['ocp_arch']
                target_image = f"{ocp_upgrade_version}-{ocp_arch}"
            elif ocp_upgrade_version.endswith(".nightly"):
                target_image = expose_ocp_version(ocp_upgrade_version)

            logger.info(f"Target image; {target_image}")

            image_path = config.UPGRADE['ocp_upgrade_path']
            self.cluster_operators = ocp.get_all_cluster_operators()
            logger.info(f" oc version: {ocp.get_current_oc_version()}")
            # Verify Upgrade subscription channel:
            ocp.patch_ocp_upgrade_channel(ocp_channel)
            for sampler in TimeoutSampler(timeout=250,
                                          sleep=15,
                                          func=ocp.verify_ocp_upgrade_channel,
                                          channel_variable=ocp_channel):
                if sampler:
                    logger.info(f"OCP Channel:{ocp_channel}")
                    break

            # Upgrade OCP
            logger.info(f"full upgrade path: {image_path}:{target_image}")
            ocp.upgrade_ocp(image=target_image, image_path=image_path)

            # Wait for upgrade
            for ocp_operator in self.cluster_operators:
                logger.info(f"Checking upgrade status of {ocp_operator}:")
                ver = ocp.get_cluster_operator_version(ocp_operator)
                logger.info(f"current {ocp_operator} version: {ver}")
                for sampler in TimeoutSampler(
                        timeout=2700,
                        sleep=60,
                        func=ocp.confirm_cluster_operator_version,
                        target_version=target_image,
                        cluster_operator=ocp_operator):
                    logger.info(
                        f"ClusterOperator upgrade "
                        f"{'completed!' if sampler else 'not completed yet!'}"
                    )
                    if sampler:
                        break

            # post upgrade validation: check cluster operator status
            for ocp_operator in self.cluster_operators:
                logger.info(f"Checking cluster status of {ocp_operator}")
                for sampler in TimeoutSampler(
                        timeout=2700,
                        sleep=60,
                        func=ocp.verify_cluster_operator_status,
                        cluster_operator=ocp_operator):
                    logger.info(
                        f"ClusterOperator status is "
                        f"{'valid' if sampler else 'not valid yet'}")
                    if sampler:
                        break
            # Post upgrade validation: check cluster version status
            logger.info("Checking clusterversion status")
            for sampler in TimeoutSampler(
                    timeout=900,
                    sleep=15,
                    func=ocp.validate_cluster_version_status):
                if sampler:
                    logger.info("Upgrade Completed Successfully!")
                    break
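
The polling loops above all follow the same pattern: a sampler yields the result of a check function until a timeout expires, and the caller breaks out on the first truthy sample. A simplified, self-contained stand-in (not the framework's own TimeoutSampler, which also raises on timeout) looks like this:

import time


def sample_until(timeout, sleep, func, **kwargs):
    """Yield func(**kwargs) repeatedly until `timeout` seconds have passed."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        yield func(**kwargs)
        time.sleep(sleep)


# Hypothetical usage mirroring the channel-verification loop above
for sampler in sample_until(timeout=10, sleep=2, func=lambda: True):
    if sampler:
        print("condition met, stop polling")
        break
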
Example No. 8
class Sanity:
    """
    Class for cluster health and functional validations
    """
    def __init__(self):
        """
        Initializer for Sanity class - Init CephCluster() in order to
        set the cluster status before starting the tests
        """
        self.pvc_objs = list()
        self.pod_objs = list()
        self.obj_data = ""
        self.ceph_cluster = CephCluster()

    def health_check(self, cluster_check=True, tries=20):
        """
        Perform Ceph and cluster health checks
        """
        wait_for_cluster_connectivity(tries=400)
        logger.info("Checking cluster and Ceph health")
        node.wait_for_nodes_status(timeout=300)

        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'],
                          tries=tries)
        if cluster_check:
            self.ceph_cluster.cluster_health_check(timeout=60)

    def create_resources(self, pvc_factory, pod_factory, run_io=True):
        """
        Sanity validation - Create resources (FS and RBD) and run IO

        Args:
            pvc_factory (function): A call to pvc_factory function
            pod_factory (function): A call to pod_factory function
            run_io (bool): True for run IO, False otherwise

        """
        logger.info(
            "Creating resources and running IO as a sanity functional validation"
        )

        for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]:
            pvc_obj = pvc_factory(interface)
            self.pvc_objs.append(pvc_obj)
            self.pod_objs.append(pod_factory(pvc=pvc_obj, interface=interface))
        if run_io:
            for pod in self.pod_objs:
                pod.run_io('fs', '1G', runtime=30)
            for pod in self.pod_objs:
                get_fio_rw_iops(pod)

    def delete_resources(self):
        """
        Sanity validation - Delete resources (FS and RBD)

        """
        logger.info("Deleting resources as a sanity functional validation")

        for pod_obj in self.pod_objs:
            pod_obj.delete()
        for pod_obj in self.pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        for pvc_obj in self.pvc_objs:
            pvc_obj.delete()
        for pvc_obj in self.pvc_objs:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)

    @ignore_leftovers
    def create_pvc_delete(self, multi_pvc_factory, project=None):
        """
        Creates and deletes all types of PVCs

        """
        # Create rbd pvcs
        pvc_objs_rbd = create_pvcs(multi_pvc_factory=multi_pvc_factory,
                                   interface='CephBlockPool',
                                   project=project,
                                   status="",
                                   storageclass=None)

        # Create cephfs pvcs
        pvc_objs_cephfs = create_pvcs(multi_pvc_factory=multi_pvc_factory,
                                      interface='CephFileSystem',
                                      project=project,
                                      status="",
                                      storageclass=None)

        all_pvc_to_delete = pvc_objs_rbd + pvc_objs_cephfs

        # Check pvc status
        for pvc_obj in all_pvc_to_delete:
            helpers.wait_for_resource_state(resource=pvc_obj,
                                            state=constants.STATUS_BOUND,
                                            timeout=300)

        # Start deleting PVC
        delete_pvcs(all_pvc_to_delete)

        # Check PVCs are deleted
        for pvc_obj in all_pvc_to_delete:
            pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name)

        logger.info("All PVCs are deleted as expected")

    def obc_put_obj_create_delete(self, mcg_obj, bucket_factory):
        """
        Creates bucket then writes, reads and deletes objects

        """
        bucket_name = bucket_factory(amount=1, interface='OC')[0].name
        self.obj_data = "A string data"

        for i in range(0, 30):
            key = 'Object-key-' + f"{i}"
            logger.info(f"Write, read and delete object with key: {key}")
            assert s3_put_object(mcg_obj, bucket_name, key,
                                 self.obj_data), f"Failed: Put object, {key}"
            assert s3_get_object(mcg_obj, bucket_name,
                                 key), f"Failed: Get object, {key}"
            assert s3_delete_object(mcg_obj, bucket_name,
                                    key), f"Failed: Delete object, {key}"
Example No. 9
def unset_noout():
    """
    Unset the Ceph 'noout' flag, with 10 retries and a delay of 10 seconds.
    """
    ceph = CephCluster()
    ceph.unset_noout()
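
The 10-retries / 10-second-delay behaviour mentioned in the docstring is presumably applied by a decorator that this listing drops; a generic sketch of such a wrapper (not the framework's own helper) could look like:

import functools
import time


def retry(exceptions, tries=10, delay=10):
    """Retry the wrapped callable on the given exception type(s)."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == tries:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator
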
class TestSmallFileWorkloadScale(E2ETest):
    """
    Deploy benchmark operator and run different scale tests.
    Call common small files workload routine to run SmallFile workload
    """

    def setup(self):
        """
        Initialize the test environment

        """
        # Deploy internal ES server - no need to keep results,
        # so don't use production ES
        self.es = ElasticSearch()

        # Initialize the Small Files workload, based on benchmark-operator
        self.small_files = SmallFiles(self.es)

        self.ceph_cluster = CephCluster()

        # Get the total storage capacity
        self.ceph_capacity = self.ceph_cluster.get_ceph_capacity()
        log.info(f"Total storage capacity is {self.ceph_capacity:,.2f} GiB")

        # Collect the pools usage before the test starts
        self.orig_data = self.get_cephfs_data()

    def teardown(self):
        """
        Teardown the test environment

        """
        self.small_files.cleanup()
        self.es.cleanup()

    def get_cephfs_data(self):
        """
        Look through the Ceph pools and find the space usage of all CephFS pools

        Returns:
            Dictionary of byte usage, indexed by pool name.
        """
        ceph_status = self.ceph_cluster.toolbox.exec_ceph_cmd(ceph_cmd="ceph df")
        ret_value = {}
        for pool in ceph_status["pools"]:
            # Collect all CephFS pools; the metadata pool is filtered out later,
            # where only the written data is of interest
            if "cephfilesystem" in pool["name"]:
                ret_value[pool["name"]] = pool["stats"]["bytes_used"]
        return ret_value
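
For reference, get_cephfs_data() assumes a 'ceph df' JSON payload shaped roughly like the illustration below (pool names and numbers are made up, not captured from a real cluster):

ceph_status = {
    "pools": [
        {"name": "ocs-storagecluster-cephfilesystem-data0",
         "stats": {"bytes_used": 4 * 1024 ** 3}},
        {"name": "ocs-storagecluster-cephfilesystem-metadata",
         "stats": {"bytes_used": 64 * 1024 ** 2}},
        {"name": "ocs-storagecluster-cephblockpool",
         "stats": {"bytes_used": 10 * 1024 ** 3}},
    ]
}
usage = {
    pool["name"]: pool["stats"]["bytes_used"]
    for pool in ceph_status["pools"]
    if "cephfilesystem" in pool["name"]
}
print(usage)   # the RBD pool is filtered out, both CephFS pools remain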

    def display_ceph_usage(self, msg, data):
        """
        Display the pool usage in a pretty way

        Args:
            msg (str): the message string to display with the values
            data (dict): dictionary of pools -> capacity (in bytes)

        """
        log.info(f"The pools usage {msg} is :")
        for entry in data:
            log.info(f"{entry} now uses {data[entry]:,} bytes")

    @pytest.mark.parametrize(
        argnames=["file_size", "files", "threads", "interface"],
        argvalues=[
            # 500K Files, ~4GB
            pytest.param(*[8, 125000, 4, constants.CEPHFILESYSTEM]),
            # 5M Files, ~152GB
            pytest.param(*[32, 1250000, 4, constants.CEPHFILESYSTEM]),
        ],
    )
    def test_scale_smallfile_workload(self, file_size, files, threads, interface):
        # updating the benchmark parameters
        self.small_files.setup_storageclass(interface)
        self.small_files.setup_test_params(file_size, files, threads, 1)

        # Verify we have enough storage capacity to run the test.
        self.small_files.setup_vol_size(file_size, files, threads, self.ceph_capacity)

        # Run the benchmark to create files on the volume
        self.small_files.setup_operations("create")
        self.small_files.run()

        # Collect pools usage after creation is done.
        self.run_data = self.get_cephfs_data()

        # Delete the benchmark data
        self.small_files.delete()

        # Getting the usage capacity immediately after deletion
        self.now_data = self.get_cephfs_data()

        # Wait 3 minutes for the backend deletion to actually start.
        time.sleep(180)

        # Query the storage usage every 2 minutes; if there is no difference
        # between two samples, the backend cleanup is done.
        still_going_down = True
        while still_going_down:
            log.info("Waiting for Ceph to finish cleaning up")
            time.sleep(120)
            self.new_data = self.get_cephfs_data()
            still_going_down = False
            for entry in self.new_data:
                if self.new_data[entry] < self.now_data[entry]:
                    still_going_down = True
                    self.now_data[entry] = self.new_data[entry]

        self.display_ceph_usage("Before ths test", self.orig_data)
        self.display_ceph_usage("After data creation", self.run_data)

        # Make sure that the test actually wrote at least 1 GiB of data
        # to the volume.
        for entry in self.run_data:
            if re.search("metadata", entry):
                # Since we are interested in the data written and not the metadata,
                # skip the metadata pool
                continue
            written = self.run_data[entry] - self.orig_data[entry]
            check = written > constants.GB
            errmsg = (
                f"{written:,.2f} bytes was written to {entry} -"
                "This is not enough for the test"
            )
            assert check, errmsg

        self.display_ceph_usage("After data deletion", self.now_data)

        for entry in self.now_data:
            # A leak is indicated if over 20% more storage is used and the
            # growth is more than 3 GiB.
            try:
                ratio = self.now_data[entry] / self.orig_data[entry]
            except ZeroDivisionError:
                ratio = self.now_data[entry]

            added_data = (self.now_data[entry] - self.orig_data[entry]) / constants.GB
            # in some cases (especially for metadata), it might be that after the
            # test there is less data in the pool than before the test.
            if added_data < 0:
                added_data = 0
                ratio = 1

            log.info(
                "The ratio between capacity before and after the test "
                f"on {entry} is : {ratio:.2f} ; {added_data:,.2f} GiB"
            )

            check = (ratio < 1.20) or (added_data < 3)
            errmsg = f"{entry} is over 20% (or 3 GiB) larger [{ratio} ; {added_data}]-- possible leak"
            assert check, errmsg
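
A worked illustration of the leak check above with made-up numbers (assuming constants.GB is 2**30):

GB = 2 ** 30                                  # assumed value of constants.GB
orig_bytes = 10 * GB                          # pool usage before the test
now_bytes = 11 * GB                           # pool usage after deletion settled

ratio = now_bytes / orig_bytes                # 1.10 -> grew by 10%
added_data = (now_bytes - orig_bytes) / GB    # 1.0 GiB

# Passes: growth is under 20% (and also under 3 GiB), so no leak is flagged.
assert (ratio < 1.20) or (added_data < 3)
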
Example No. 11
    def test_upgrade_ocp(self, reduce_and_resume_cluster_load):
        """
        Tests OCS stability when upgrading OCP

        """

        cluster_ver = ocp.run_cmd("oc get clusterversions/version -o yaml")
        logger.debug(f"Cluster versions before upgrade:\n{cluster_ver}")
        ceph_cluster = CephCluster()
        with CephHealthMonitor(ceph_cluster):

            ocp_channel = config.UPGRADE.get("ocp_channel",
                                             ocp.get_ocp_upgrade_channel())
            ocp_upgrade_version = config.UPGRADE.get("ocp_upgrade_version")
            if not ocp_upgrade_version:
                ocp_upgrade_version = get_latest_ocp_version(
                    channel=ocp_channel)
                ocp_arch = config.UPGRADE["ocp_arch"]
                target_image = f"{ocp_upgrade_version}-{ocp_arch}"
            elif ocp_upgrade_version.endswith(".nightly"):
                target_image = expose_ocp_version(ocp_upgrade_version)

            logger.info(f"Target image; {target_image}")

            image_path = config.UPGRADE["ocp_upgrade_path"]
            cluster_operators = ocp.get_all_cluster_operators()
            logger.info(f" oc version: {ocp.get_current_oc_version()}")
            # Verify Upgrade subscription channel:
            ocp.patch_ocp_upgrade_channel(ocp_channel)
            for sampler in TimeoutSampler(
                    timeout=250,
                    sleep=15,
                    func=ocp.verify_ocp_upgrade_channel,
                    channel_variable=ocp_channel,
            ):
                if sampler:
                    logger.info(f"OCP Channel:{ocp_channel}")
                    break

            # Upgrade OCP
            logger.info(f"full upgrade path: {image_path}:{target_image}")
            ocp.upgrade_ocp(image=target_image, image_path=image_path)

            # Wait for upgrade
            for ocp_operator in cluster_operators:
                logger.info(f"Checking upgrade status of {ocp_operator}:")
                # ############ Workaround for issue 2624 #######
                name_changed_between_versions = (
                    "service-catalog-apiserver",
                    "service-catalog-controller-manager",
                )
                if ocp_operator in name_changed_between_versions:
                    logger.info(f"{ocp_operator} upgrade will not be verified")
                    continue
                # ############ End of Workaround ###############
                ver = ocp.get_cluster_operator_version(ocp_operator)
                logger.info(f"current {ocp_operator} version: {ver}")
                for sampler in TimeoutSampler(
                        timeout=2700,
                        sleep=60,
                        func=ocp.confirm_cluster_operator_version,
                        target_version=target_image,
                        cluster_operator=ocp_operator,
                ):
                    if sampler:
                        logger.info(f"{ocp_operator} upgrade completed!")
                        break
                    else:
                        logger.info(
                            f"{ocp_operator} upgrade has not completed yet!")

            # post upgrade validation: check cluster operator status
            cluster_operators = ocp.get_all_cluster_operators()
            for ocp_operator in cluster_operators:
                logger.info(f"Checking cluster status of {ocp_operator}")
                for sampler in TimeoutSampler(
                        timeout=2700,
                        sleep=60,
                        func=ocp.verify_cluster_operator_status,
                        cluster_operator=ocp_operator,
                ):
                    if sampler:
                        break
                    else:
                        logger.info(f"{ocp_operator} status is not valid")
            # Post upgrade validation: check cluster version status
            logger.info("Checking clusterversion status")
            for sampler in TimeoutSampler(
                    timeout=900,
                    sleep=15,
                    func=ocp.validate_cluster_version_status):
                if sampler:
                    logger.info("Upgrade Completed Successfully!")
                    break

        cluster_ver = ocp.run_cmd("oc get clusterversions/version -o yaml")
        logger.debug(f"Cluster versions post upgrade:\n{cluster_ver}")

        # load new config file
        self.load_ocp_version_config_file(ocp_upgrade_version)

        new_ceph_cluster = CephCluster()
        new_ceph_cluster.wait_for_rebalance(timeout=1800)
        ceph_health_check(tries=90, delay=30)
Example No. 12
    def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
        """
        This is a basic fio perf test
        """
        # Deployment ripsaw
        log.info("Deploying ripsaw operator")
        ripsaw.apply_crd('resources/crds/' 'ripsaw_v1alpha1_ripsaw_crd.yaml')
        sc = 'ocs-storagecluster-ceph-rbd' if interface == 'CephBlockPool' else 'ocs-storagecluster-cephfs'

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        es_server = ""
        es_port = ""
        if 'elasticsearch' in fio_cr['spec']:
            if 'server' in fio_cr['spec']['elasticsearch']:
                es_server = fio_cr['spec']['elasticsearch']['server']
            if 'port' in fio_cr['spec']['elasticsearch']:
                es_port = fio_cr['spec']['elasticsearch']['port']
        else:
            fio_cr['spec']['elasticsearch'] = {}

        # Use the internally defined elastic-search server in the test
        fio_cr['spec']['elasticsearch'] = {
            'server': es.get_ip(),
            'port': es.get_port()
        }

        # Setting the data set to 40% of the total storage capacity but
        # not more than 600GiB
        ceph_cluster = CephCluster()
        total_data_set = int(ceph_cluster.get_ceph_capacity() * 0.4)
        filesize = int(fio_cr['spec']['workload']['args']['filesize'].replace(
            'GiB', ''))
        # To make sure the number of App pods will not be more than 50, in case
        # of a large data set, changing the size of the file each pod will work on
        if total_data_set > 500:
            filesize = int(ceph_cluster.get_ceph_capacity() * 0.008)
            fio_cr['spec']['workload']['args']['filesize'] = f'{filesize}GiB'
            # make sure that the storage size is larger than the file size
            fio_cr['spec']['workload']['args'][
                'storagesize'] = f'{int(filesize * 1.2)}Gi'
        fio_cr['spec']['workload']['args']['servers'] = int(total_data_set /
                                                            filesize)
        log.info(f'Total Data set to work on is : {total_data_set} GiB')

        fio_cr['spec']['clustername'] = config.ENV_DATA[
            'platform'] + get_build() + get_ocs_version()
        fio_cr['spec']['test_user'] = get_ocs_version(
        ) + interface + io_pattern
        fio_cr['spec']['workload']['args']['storageclass'] = sc
        if io_pattern == 'sequential':
            fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
        log.info(f'fio_cr: {fio_cr}')
        fio_cr_obj = OCS(**fio_cr)
        fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                      'fio-client',
                                      constants.RIPSAW_NAMESPACE):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Wait for the fio pod to initialize and complete
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind='pod')
        pod_obj.wait_for_resource(
            condition='Completed',
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        output = run_cmd(f'oc logs {fio_client_pod}')

        if 'Fio failed to execute' not in output:
            log.info("FIO has completed successfully")
        else:
            log.warning("FIO failed to complete")

        # Clean up fio benchmark
        log.info("Deleting FIO benchmark")
        fio_cr_obj.delete()

        # Setting back the original elastic-search information
        fio_cr['spec']['elasticsearch'] = {
            'server': es_server,
            'port': es_port
        }
        analyze_regression(io_pattern,
                           sc,
                           es_username=fio_cr['spec']['test_user'])
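
A worked example of the data-set sizing logic above, with a hypothetical cluster capacity:

ceph_capacity = 2000                          # GiB, hypothetical
total_data_set = int(ceph_capacity * 0.4)     # 800 GiB
filesize = 2                                  # GiB, e.g. value from the CR template

if total_data_set > 500:
    # large cluster: grow the per-pod file so the pod count stays bounded
    filesize = int(ceph_capacity * 0.008)     # 16 GiB
    storagesize = f"{int(filesize * 1.2)}Gi"  # PVC kept ~20% larger than the file

servers = int(total_data_set / filesize)      # 50 FIO client pods
print(total_data_set, filesize, servers)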
Example No. 13
    def test_add_capacity_with_resource_delete(
        self,
        workload_storageutilization_rbd,
        resource_name,
        resource_id,
        is_kill_resource_repeatedly,
    ):
        """
        The function gets the resource name and id, adds capacity to the cluster,
        and then deletes the resource while the storage capacity is being increased.

        Args:
            resource_name (str): the name of the resource to delete
            resource_id (int): the id of the resource to delete
            is_kill_resource_repeatedly (bool): If True then kill the resource repeatedly. Else, if False
                delete the resource only once.

        """
        used_percentage = get_percent_used_capacity()
        logging.info(
            f"storageutilization is completed. used capacity = {used_percentage}"
        )

        osd_pods_before = pod_helpers.get_osd_pods()
        number_of_osd_pods_before = len(osd_pods_before)
        if number_of_osd_pods_before >= constants.MAX_OSDS:
            pytest.skip("We have maximum of OSDs in the cluster")

        d = Disruptions()
        d.set_resource(resource_name)

        self.new_pods_in_status_running = False

        osd_size = storage_cluster.get_osd_size()
        logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
        storagedeviceset_count = storage_cluster.add_capacity(osd_size)
        logging.info("Adding one new set of OSDs was issued without problems")

        # Wait for the new OSDs to come up. After the first new OSD reaches status
        # Init - delete the resource. After deleting the resource we expect that all
        # the new OSDs will be in status Running, and the deleted resource will also
        # be back in status Running.
        pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
        logging.info(
            f"Delete a {resource_name} pod while storage capacity is getting increased"
        )
        if is_kill_resource_repeatedly:
            with ThreadPoolExecutor() as executor:
                executor.submit(self.kill_resource_repeatedly, resource_name,
                                resource_id)
                self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
        else:
            d.delete_resource(resource_id)
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

        self.new_pods_in_status_running = True
        logging.info(
            "Finished verifying add capacity when one of the pods gets deleted"
        )
        logging.info("Waiting for ceph health check to finished...")
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"],
                          tries=90)
        ceph_cluster_obj = CephCluster()
        assert ceph_cluster_obj.wait_for_rebalance(
            timeout=1800), "Data re-balance failed to complete"
Example No. 14
class Sanity:
    """
    Class for cluster health and functional validations
    """

    def __init__(self):
        """
        Initializer for Sanity class - Init CephCluster() in order to
        set the cluster status before starting the tests
        """
        self.pvc_objs = list()
        self.pod_objs = list()
        self.ceph_cluster = CephCluster()

    def health_check(self, cluster_check=True, tries=20):
        """
        Perform Ceph and cluster health checks
        """
        wait_for_cluster_connectivity(tries=400)
        logger.info("Checking cluster and Ceph health")
        node.wait_for_nodes_status(timeout=300)

        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=tries)
        if cluster_check:
            self.ceph_cluster.cluster_health_check(timeout=60)

    def create_resources(self, pvc_factory, pod_factory, run_io=True):
        """
        Sanity validation - Create resources (FS and RBD) and run IO

        Args:
            pvc_factory (function): A call to pvc_factory function
            pod_factory (function): A call to pod_factory function
            run_io (bool): True for run IO, False otherwise

        """
        logger.info("Creating resources and running IO as a sanity functional validation")

        for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]:
            pvc_obj = pvc_factory(interface)
            self.pvc_objs.append(pvc_obj)
            self.pod_objs.append(pod_factory(pvc=pvc_obj, interface=interface))
        if run_io:
            for pod in self.pod_objs:
                pod.run_io('fs', '1G')
            for pod in self.pod_objs:
                get_fio_rw_iops(pod)

    def delete_resources(self):
        """
        Sanity validation - Delete resources (FS and RBD)

        """
        logger.info("Deleting resources as a sanity functional validation")

        for pod_obj in self.pod_objs:
            pod_obj.delete()
        for pod_obj in self.pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        for pvc_obj in self.pvc_objs:
            pvc_obj.delete()
        for pvc_obj in self.pvc_objs:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)
Example No. 15
    def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
        """
        This is a basic fio perf test

        """

        # Deployment ripsaw
        log.info("Deploying ripsaw operator")
        ripsaw.apply_crd("resources/crds/" "ripsaw_v1alpha1_ripsaw_crd.yaml")
        if interface == "CephBlockPool":
            sc = constants.CEPHBLOCKPOOL_SC
        else:
            sc = constants.CEPHFILESYSTEM_SC

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        if "elasticsearch" in fio_cr["spec"]:
            backup_es = fio_cr["spec"]["elasticsearch"]
        else:
            log.warning(
                "Elastic Search information does not exist in the YAML file")
            fio_cr["spec"]["elasticsearch"] = {}

        # Use the internally defined elastic-search server in the test - if it exists
        if es:
            fio_cr["spec"]["elasticsearch"] = {
                "server": es.get_ip(),
                "port": es.get_port(),
            }

        # Setting the data set to 40% of the total storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()
        total_data_set = int(ceph_capacity * 0.4)
        filesize = int(fio_cr["spec"]["workload"]["args"]["filesize"].replace(
            "GiB", ""))
        # To make sure the number of App pods will not be more than 50, in case
        # of a large data set, changing the size of the file each pod will work on
        if total_data_set > 500:
            filesize = int(ceph_capacity * 0.008)
            fio_cr["spec"]["workload"]["args"]["filesize"] = f"{filesize}GiB"
            # make sure that the storage size is larger than the file size
            fio_cr["spec"]["workload"]["args"][
                "storagesize"] = f"{int(filesize * 1.2)}Gi"
        fio_cr["spec"]["workload"]["args"]["servers"] = int(total_data_set /
                                                            filesize)
        log.info(f"Total Data set to work on is : {total_data_set} GiB")

        environment = get_environment_info()
        if not environment["user"] == "":
            fio_cr["spec"]["test_user"] = environment["user"]
        fio_cr["spec"]["clustername"] = environment["clustername"]

        log.debug(f"Environment information is : {environment}")

        fio_cr["spec"]["workload"]["args"]["storageclass"] = sc
        if io_pattern == "sequential":
            fio_cr["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
            fio_cr["spec"]["workload"]["args"]["iodepth"] = 1
        log.info(f"The FIO CR file is {fio_cr}")
        fio_cr_obj = OCS(**fio_cr)
        fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                      "fio-client",
                                      constants.RIPSAW_NAMESPACE):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Getting the start time of the test
        start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        # Getting the UUID from inside the benchmark pod
        uuid = ripsaw.get_uuid(fio_client_pod)
        # Setting back the original elastic-search information
        fio_cr["spec"]["elasticsearch"] = backup_es

        full_results = FIOResultsAnalyse(uuid, fio_cr)

        # Initialize the results doc file.
        for key in environment:
            full_results.add_key(key, environment[key])

        # Setting the global parameters of the test
        full_results.add_key("io_pattern", io_pattern)
        full_results.add_key("dataset", f"{total_data_set}GiB")
        full_results.add_key("file_size",
                             fio_cr["spec"]["workload"]["args"]["filesize"])
        full_results.add_key("servers",
                             fio_cr["spec"]["workload"]["args"]["servers"])
        full_results.add_key("samples",
                             fio_cr["spec"]["workload"]["args"]["samples"])
        full_results.add_key("operations",
                             fio_cr["spec"]["workload"]["args"]["jobs"])
        full_results.add_key("block_sizes",
                             fio_cr["spec"]["workload"]["args"]["bs"])
        full_results.add_key("io_depth",
                             fio_cr["spec"]["workload"]["args"]["iodepth"])
        full_results.add_key("jobs",
                             fio_cr["spec"]["workload"]["args"]["numjobs"])
        full_results.add_key(
            "runtime",
            {
                "read": fio_cr["spec"]["workload"]["args"]["read_runtime"],
                "write": fio_cr["spec"]["workload"]["args"]["write_runtime"],
            },
        )
        full_results.add_key(
            "storageclass", fio_cr["spec"]["workload"]["args"]["storageclass"])
        full_results.add_key("vol_size",
                             fio_cr["spec"]["workload"]["args"]["storagesize"])

        # Wait for the fio pod to initialize and complete
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind="pod")
        pod_obj.wait_for_resource(
            condition="Completed",
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        # Getting the end time of the test
        end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
        full_results.add_key("test_time", {
            "start": start_time,
            "end": end_time
        })

        output = run_cmd(f"oc logs {fio_client_pod}")
        log.info(f"The Test log is : {output}")

        if "Fio failed to execute" not in output:
            log.info("FIO has completed successfully")
        else:
            log.warning("FIO failed to complete")

        # Clean up fio benchmark
        log.info("Deleting FIO benchmark")
        fio_cr_obj.delete()

        log.debug(f"Full results is : {full_results.results}")

        # If an internal ES exists, copy all data from the internal to the main ES
        if es:
            log.info("Copy all data from Internal ES to Main ES")
            es._copy(full_results.es)
        # Adding this sleep between the copy and the analysis of the results,
        # since sometimes the results of the read (just after write) are empty
        time.sleep(30)
        full_results.analyze_results()  # Analyze the results
        # Writing the analyzed test results to the Elastic-Search server
        full_results.es_write()
        full_results.codespeed_push()  # Push results to codespeed
        # Creating full link to the results on the ES server
        log.info(f"The Result can be found at ; {full_results.results_link()}")
Example No. 16
    def setup(
        self, request, scenario, num_of_nodes, num_of_fail_nodes,
        disrupt_provisioner, project_factory, multi_pvc_factory, dc_pod_factory
    ):
        """
        Identify the nodes and start DeploymentConfig based app pods using
        PVC with ReadWriteOnce (RWO) access mode on selected nodes

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (eg., 'colocated', 'dedicated')
            num_of_nodes (int): number of nodes required for running test
            num_of_fail_nodes (int): number of nodes to make unresponsive during test
            disrupt_provisioner (bool): True to disrupt the leader provisioner
                pods if not running on selected nodes, else False
            project_factory: A fixture to create new project
            multi_pvc_factory: A fixture to create a set of new PVCs
            dc_pod_factory: A fixture to create deploymentconfig pods

        Returns:
            tuple: containing the params used in test cases
        """
        ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(
            scenario, num_of_nodes
        )
        test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes
        logger.info(f"Using nodes {test_nodes} for running test")

        def finalizer():
            helpers.remove_label_from_worker_node(
                node_list=test_nodes, label_key="nodetype"
            )

        request.addfinalizer(finalizer)

        if len(ocs_nodes) > 4 and float(config.ENV_DATA['ocs_version']) >= 4.3:
            pod_obj = ocp.OCP(
                kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
            )
            assert pod_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL, resource_count=5, timeout=900
            )

        ceph_cluster = CephCluster()
        project = project_factory()

        # Select nodes for running app pods and inducing network failure later
        app_pod_nodes = self.select_nodes_for_app_pods(
            scenario, ceph_cluster, ocs_nodes, non_ocs_nodes,
            num_of_fail_nodes
        )

        # Create multiple RBD and CephFS backed PVCs with RWO accessmode
        num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes
        rbd_pvcs = multi_pvc_factory(
            interface=constants.CEPHBLOCKPOOL, project=project, size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs
        )
        cephfs_pvcs = multi_pvc_factory(
            interface=constants.CEPHFILESYSTEM, project=project, size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs
        )

        # Create deploymentconfig based pods
        dc_pods = []
        # Start app-pods on selected node(s)
        for node_name in app_pod_nodes:
            logger.info(f"Starting app pods on the node {node_name}")
            helpers.label_worker_node(
                node_list=[node_name], label_key="nodetype",
                label_value="app-pod"
            )

            for num in range(self.num_of_app_pods_per_node):
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHBLOCKPOOL, pvc=rbd_pvcs.pop(0),
                        node_selector={'nodetype': 'app-pod'}
                    )
                )
                assert pod.verify_node_name(dc_pods[-1], node_name), (
                    f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
                )
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHFILESYSTEM, pvc=cephfs_pvcs.pop(0),
                        node_selector={'nodetype': 'app-pod'}
                    )
                )
                assert pod.verify_node_name(dc_pods[-1], node_name), (
                    f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
                )
            helpers.remove_label_from_worker_node(
                node_list=[node_name], label_key="nodetype"
            )

        # Label other test nodes to be able to run app pods later
        helpers.label_worker_node(
            node_list=test_nodes, label_key="nodetype", label_value="app-pod"
        )

        # Get ceph mon,osd pods running on selected node if colocated scenario
        # and extra OCS nodes are present
        ceph_pods = []
        if scenario == "colocated" and len(test_nodes) > len(ceph_cluster.osds):
            pods_to_check = ceph_cluster.osds
            # Skip mon pods if mon_count is 5 as there may not be enough nodes
            # for all mons to run after multiple node failures
            if ceph_cluster.mon_count == 3:
                pods_to_check.extend(ceph_cluster.mons)
            for pod_obj in pods_to_check:
                if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]:
                    ceph_pods.append(pod_obj)
            logger.info(
                f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
            )

        disruptor = []
        if disrupt_provisioner:
            disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

        return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
Example No. 17
def run_ocs_upgrade(operation=None, *operation_args, **operation_kwargs):
    """
    Run upgrade procedure of OCS cluster

    Args:
        operation: (function): Function to run
        operation_args: (iterable): Function's arguments
        operation_kwargs: (map): Function's keyword arguments

    """

    ceph_cluster = CephCluster()
    upgrade_ocs = OCSUpgrade(
        namespace=config.ENV_DATA["cluster_namespace"],
        version_before_upgrade=config.ENV_DATA.get("ocs_version"),
        ocs_registry_image=config.UPGRADE.get("upgrade_ocs_registry_image"),
        upgrade_in_current_source=config.UPGRADE.get(
            "upgrade_in_current_source", False),
    )
    upgrade_version = upgrade_ocs.get_upgrade_version()
    assert (
        upgrade_ocs.get_parsed_versions()[1] >=
        upgrade_ocs.get_parsed_versions()[0]), (
            f"Version you would like to upgrade to: {upgrade_version} "
            f"is not higher or equal to the version you currently running: "
            f"{upgrade_ocs.version_before_upgrade}")
    csv_name_pre_upgrade = upgrade_ocs.get_csv_name_pre_upgrade()
    pre_upgrade_images = upgrade_ocs.get_pre_upgrade_image(
        csv_name_pre_upgrade)
    upgrade_ocs.load_version_config_file(upgrade_version)
    if config.DEPLOYMENT.get("disconnected"):
        upgrade_ocs.ocs_registry_image = prepare_disconnected_ocs_deployment(
            upgrade=True)
        log.info(
            f"Disconnected upgrade - new image: {upgrade_ocs.ocs_registry_image}"
        )
    with CephHealthMonitor(ceph_cluster):
        channel = upgrade_ocs.set_upgrade_channel()
        upgrade_ocs.set_upgrade_images()
        upgrade_ocs.update_subscription(channel)
        if operation:
            log.info(f"Calling test function: {operation}")
            _ = operation(*operation_args, **operation_kwargs)
            # Workaround for issue #2531
            time.sleep(30)
            # End of workaround

        for sample in TimeoutSampler(
                timeout=725,
                sleep=5,
                func=upgrade_ocs.check_if_upgrade_completed,
                channel=channel,
                csv_name_pre_upgrade=csv_name_pre_upgrade,
        ):
            try:
                if sample:
                    log.info("Upgrade success!")
                    break
            except TimeoutException:
                raise TimeoutException("No new CSV found after upgrade!")
        old_image = upgrade_ocs.get_images_post_upgrade(
            channel, pre_upgrade_images, upgrade_version)
    verify_image_versions(
        old_image,
        upgrade_ocs.get_parsed_versions()[1],
        upgrade_ocs.version_before_upgrade,
    )
    ocs_install_verification(
        timeout=600,
        skip_osd_distribution_check=True,
        ocs_registry_image=upgrade_ocs.ocs_registry_image,
        post_upgrade_verification=True,
        version_before_upgrade=upgrade_ocs.version_before_upgrade,
    )
    def test_pvc_multiple_clone_performance(
        self,
        interface_iterate,
        teardown_factory,
        storageclass_factory,
        pvc_factory,
        pod_factory,
    ):
        """
        1. Creating PVC
           PVC size is calculated in the test and depends on the storage capacity, but not less then 1 GiB
           it will use ~75% capacity of the Storage, Min storage capacity 1 TiB
        2. Fill the PVC with 70% of data
        3. Take a clone of the PVC and measure time and speed of creation by reading start creation and end creation
            times from relevant logs
        4. Repeat the previous step number of times (maximal num_of_clones is 512)
        5. Print all measured statistics for all the clones.

        Raises:
            StorageNotSufficientException: in case of not enough capacity on the cluster

        """
        num_of_clones = 512

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = int(ceph_cluster.get_ceph_capacity())

        # Use 70% of the storage capacity in the test
        capacity_to_use = int(ceph_capacity * 0.7)

        # since we do not want to use more then 65%, we add 35% to the needed
        # capacity, and minimum PVC size is 1 GiB
        need_capacity = int((num_of_clones + 2) * 1.35)
        # Test will run only on system with enough capacity
        if capacity_to_use < need_capacity:
            err_msg = (f"The system have only {ceph_capacity} GiB, "
                       f"we want to use only {capacity_to_use} GiB, "
                       f"and we need {need_capacity} GiB to run the test")
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        pvc_size = int(capacity_to_use / (num_of_clones + 2))

        self.interface = interface_iterate
        self.sc_obj = storageclass_factory(self.interface)

        self.pvc_obj = pvc_factory(interface=self.interface,
                                   size=pvc_size,
                                   status=constants.STATUS_BOUND)

        self.pod_obj = pod_factory(interface=self.interface,
                                   pvc=self.pvc_obj,
                                   status=constants.STATUS_RUNNING)

        # Calculating the file size as 70% of the PVC size
        filesize = self.pvc_obj.size * 0.70
        # Change the file size to MB for the FIO function
        file_size = f"{int(filesize * constants.GB2MB)}M"
        file_name = self.pod_obj.name

        log.info(f"Total capacity size is : {ceph_capacity} GiB, "
                 f"Going to use {need_capacity} GiB, "
                 f"With {num_of_clones} clones to {pvc_size} GiB PVC. "
                 f"File size to be written is : {file_size} "
                 f"with the name of {file_name}")
        self.params = {}
        self.params["clonenum"] = f"{num_of_clones}"
        self.params["filesize"] = file_size
        self.params["ERRMSG"] = "Error in command"

        clone_yaml = self.build_params()
        performance_lib.write_fio_on_pod(self.pod_obj, file_size)

        # Running the test
        results = []
        for test_num in range(1, int(self.params["clonenum"]) + 1):
            log.info(f"Starting test number {test_num}")
            ct = self.create_clone(test_num, clone_yaml)
            speed = self.params["datasize"] / ct
            results.append({"Clone Num": test_num, "time": ct, "speed": speed})
            log.info(
                f"Results for clone number {test_num} are : "
                f"Creation time is {ct} secs, Creation speed {speed} MB/sec")

        for r in results:
            log.info(
                f"Clone number {r['Clone Num']} creation time is {r['time']} secs."
            )
            log.info(
                f"Clone number {r['Clone Num']} creation speed is {r['speed']} MB/sec."
            )
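
A worked illustration of the per-clone speed calculation above (the numbers are made up, and datasize is assumed to be in MB):

datasize_mb = 7168          # assumed: data written to the parent PVC, in MB
creation_time = 35.8        # measured clone creation time, in seconds

speed = datasize_mb / creation_time
print(f"Creation time is {creation_time} secs, creation speed {speed:.1f} MB/sec")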
Example No. 19
def test_upgrade():
    ceph_cluster = CephCluster()
    with CephHealthMonitor(ceph_cluster):
        namespace = config.ENV_DATA['cluster_namespace']
        version_before_upgrade = config.ENV_DATA.get("ocs_version")
        upgrade_version = config.UPGRADE.get("upgrade_ocs_version",
                                             version_before_upgrade)
        ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image')
        if ocs_registry_image:
            upgrade_version = get_ocs_version_from_image(ocs_registry_image)
        parsed_version_before_upgrade = parse_version(version_before_upgrade)
        parsed_upgrade_version = parse_version(upgrade_version)
        assert parsed_upgrade_version >= parsed_version_before_upgrade, (
            f"Version you would like to upgrade to: {upgrade_version} "
            f"is not higher or equal to the version you currently running: "
            f"{version_before_upgrade}")
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        channel = config.DEPLOYMENT.get('ocs_csv_channel')
        csv_name_pre_upgrade = package_manifest.get_current_csv(channel)
        log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
        csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade,
                              namespace=namespace)
        pre_upgrade_images = get_images(csv_pre_upgrade.get())
        version_change = parsed_upgrade_version > parsed_version_before_upgrade
        if version_change:
            version_config_file = os.path.join(constants.CONF_DIR,
                                               'ocs_version',
                                               f'ocs-{upgrade_version}.yaml')
            load_config_file(version_config_file)
        ocs_catalog = CatalogSource(
            resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME,
            namespace=constants.MARKETPLACE_NAMESPACE,
        )
        upgrade_in_current_source = config.UPGRADE.get(
            'upgrade_in_current_source', False)
        if not upgrade_in_current_source:
            if not ocs_catalog.is_exist() and not upgrade_in_current_source:
                log.info("OCS catalog source doesn't exist. Creating new one.")
                create_catalog_source(ocs_registry_image, ignore_upgrade=True)
            image_url = ocs_catalog.get_image_url()
            image_tag = ocs_catalog.get_image_name()
            log.info(f"Current image is: {image_url}, tag: {image_tag}")
            if ocs_registry_image:
                image_url, new_image_tag = ocs_registry_image.split(':')
            elif (config.UPGRADE.get('upgrade_to_latest', True)
                  or version_change):
                new_image_tag = get_latest_ds_olm_tag()
            else:
                new_image_tag = get_next_version_available_for_upgrade(
                    image_tag)
            cs_data = deepcopy(ocs_catalog.data)
            image_for_upgrade = ':'.join([image_url, new_image_tag])
            log.info(f"Image: {image_for_upgrade} will be used for upgrade.")
            cs_data['spec']['image'] = image_for_upgrade

            with NamedTemporaryFile() as cs_yaml:
                dump_data_to_temp_yaml(cs_data, cs_yaml.name)
                ocs_catalog.apply(cs_yaml.name)
        # Wait for the new package manifest for upgrade.
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        package_manifest.wait_for_resource()
        channel = config.DEPLOYMENT.get('ocs_csv_channel')
        if not channel:
            channel = package_manifest.get_default_channel()

        # update subscription
        subscription = OCP(
            resource_name=constants.OCS_SUBSCRIPTION,
            kind='subscription',
            namespace=config.ENV_DATA['cluster_namespace'],
        )
        current_ocs_source = subscription.data['spec']['source']
        log.info(f"Current OCS subscription source: {current_ocs_source}")
        ocs_source = current_ocs_source if upgrade_in_current_source else (
            constants.OPERATOR_CATALOG_SOURCE_NAME)
        patch_subscription_cmd = (
            f'oc patch subscription {constants.OCS_SUBSCRIPTION} '
            f'-n {namespace} --type merge -p \'{{"spec":{{"channel": '
            f'"{channel}", "source": "{ocs_source}"}}}}\'')
        run_cmd(patch_subscription_cmd)

        subscription_plan_approval = config.DEPLOYMENT.get(
            'subscription_plan_approval')
        if subscription_plan_approval == 'Manual':
            wait_for_install_plan_and_approve(namespace)
        attempts = 145
        for attempt in range(1, attempts + 1):
            log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.")
            csv_name_post_upgrade = package_manifest.get_current_csv(channel)
            if csv_name_post_upgrade == csv_name_pre_upgrade:
                log.info(f"CSV is still: {csv_name_post_upgrade}")
                sleep(5)
            else:
                log.info(f"CSV now upgraded to: {csv_name_post_upgrade}")
                break
            if attempts == attempt:
                raise TimeoutException("No new CSV found after upgrade!")
        csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade,
                               namespace=namespace)
        log.info(
            f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state"
        )
        if version_before_upgrade == '4.2' and upgrade_version == '4.3':
            log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3")
            setup_ceph_toolbox(force_setup=True)
        csv_post_upgrade.wait_for_phase("Succeeded", timeout=600)
        post_upgrade_images = get_images(csv_post_upgrade.get())
        old_images, _, _ = get_upgrade_image_info(pre_upgrade_images,
                                                  post_upgrade_images)
        verify_image_versions(old_images, parsed_upgrade_version)
        ocs_install_verification(
            timeout=600,
            skip_osd_distribution_check=True,
            ocs_registry_image=ocs_registry_image,
            post_upgrade_verification=True,
        )
    def test_pvc_snapshot_performance(self, pvc_size):
        """
        1. Run I/O on a pod file
        2. Calculate md5sum of the file
        3. Take a snapshot of the PVC
        4. Measure the total snapshot creation time and the CSI snapshot creation time
        5. Restore from the snapshot and measure the time
        6. Attach a new pod to it
        7. Verify that the file is present on the new pod also
        8. Verify that the md5sum of the file on the new pod matches
           with the md5sum of the file on the original pod

        This scenario runs 3 times, reports the average results of the 3 runs
        and sends them to the ES
        Args:
            pvc_size: the size of the PVC to be tested - parametrize

        """

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()

        log.info(f"Total capacity size is : {ceph_capacity}")
        log.info(f"PVC Size is : {pvc_size}")
        log.info(f"Needed capacity is {int(int(pvc_size) * 5)}")
        if int(ceph_capacity) < int(pvc_size) * 5:
            log.error(
                f"PVC size is {pvc_size}GiB and it is too large for this system"
                f" which have only {ceph_capacity}GiB")
            return
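        # Presumably the 5x factor above is a safety margin: the written data,
        # the snapshots taken from it and the restored PVCs all consume
        # capacity. E.g. a pvc_size of 100 GiB is only tested on clusters
        # with at least 500 GiB of raw capacity.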
        # Calculating the file size as 25% of the PVC size
        # in the end the PVC will be 75% full
        filesize = self.pvc_obj.size * 0.25
        # Change the file size to MB and from int to str
        file_size = f"{int(filesize * 1024)}M"

        all_results = []

        self.results_path = get_full_test_logs_path(cname=self)
        log.info(f"Logs file path name is : {self.full_log_path}")

        # Produce ES report
        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        self.full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid,
                self.crd_data,
                self.full_log_path,
                "pvc_snapshot_perf",
            ))
        self.full_results.add_key("pvc_size", pvc_size + " GiB")
        self.full_results.add_key("interface", self.sc)
        self.full_results.all_results["creation_time"] = []
        self.full_results.all_results["csi_creation_time"] = []
        self.full_results.all_results["creation_speed"] = []
        self.full_results.all_results["restore_time"] = []
        self.full_results.all_results["restore_speed"] = []
        self.full_results.all_results["restore_csi_time"] = []
        for test_num in range(self.tests_numbers):
            test_results = {
                "test_num": test_num + 1,
                "dataset": (test_num + 1) * filesize * 1024,  # size in MiB
                "create": {
                    "time": None,
                    "csi_time": None,
                    "speed": None
                },
                "restore": {
                    "time": None,
                    "speed": None
                },
            }
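            # Note: "dataset" is cumulative - each iteration writes another
            # file of `filesize` GiB to the same PVC, so with a hypothetical
            # filesize of 25.0 GiB the third iteration (test_num == 2)
            # measures a dataset of 3 * 25 * 1024 = 76800 MiB.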
            log.info(f"Starting test phase number {test_num}")
            # Step 1. Run I/O on a pod file.
            file_name = f"{self.pod_object.name}-{test_num}"
            log.info(f"Starting IO on the POD {self.pod_object.name}")
            # Going to run only write IO to fill the PVC for the snapshot
            self.pod_object.fillup_fs(size=file_size, fio_filename=file_name)

            # Wait for fio to finish
            fio_result = self.pod_object.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"IO error on pod {self.pod_object.name}. FIO result: {fio_result}"
            log.info("IO on the PVC Finished")

            # Verify presence of the file
            file_path = pod.get_file_path(self.pod_object, file_name)
            log.info(f"Actual file path on the pod {file_path}")
            assert pod.check_file_existence(
                self.pod_object, file_path), f"File {file_name} doesn't exist"
            log.info(f"File {file_name} exists in {self.pod_object.name}")

            # Step 2. Calculate md5sum of the file.
            orig_md5_sum = pod.cal_md5sum(self.pod_object, file_name)

            # Step 3. Take a snapshot of the PVC and measure the time of creation.
            snap_name = self.pvc_obj.name.replace("pvc-test",
                                                  f"snapshot-test{test_num}")
            log.info(f"Taking snapshot of the PVC {snap_name}")

            start_time = datetime.datetime.utcnow().strftime(
                "%Y-%m-%dT%H:%M:%SZ")

            test_results["create"]["time"] = self.measure_create_snapshot_time(
                pvc_name=self.pvc_obj.name,
                snap_name=snap_name,
                namespace=self.pod_object.namespace,
                interface=self.interface,
                start_time=start_time,
            )

            test_results["create"][
                "csi_time"] = performance_lib.measure_csi_snapshot_creation_time(
                    interface=self.interface,
                    snapshot_id=self.snap_uid,
                    start_time=start_time,
                )

            test_results["create"]["speed"] = int(
                test_results["dataset"] / test_results["create"]["time"])
            log.info(
                f' Test {test_num} dataset is {test_results["dataset"]} MiB')
            log.info(
                f"Snapshot name {snap_name} and id {self.snap_uid} creation time is"
                f' : {test_results["create"]["time"]} sec.')
            log.info(
                f"Snapshot name {snap_name} and id {self.snap_uid} csi creation time is"
                f' : {test_results["create"]["csi_time"]} sec.')
            log.info(
                f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec'
            )

            # Step 4. Restore the PVC from the snapshot and measure the time
            # Same Storage class of the original PVC
            sc_name = self.pvc_obj.backed_sc

            # Size should be same as of the original PVC
            pvc_size = str(self.pvc_obj.size) + "Gi"

            # Create pvc out of the snapshot
            # Both the snapshot and the restored PVC should be in the same namespace

            log.info("Restoring from the Snapshot")
            restore_pvc_name = self.pvc_obj.name.replace(
                "pvc-test", f"restore-pvc{test_num}")
            restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
            if self.interface == constants.CEPHFILESYSTEM:
                restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

            csi_start_time = self.get_time("csi")
            log.info("Restoring the PVC from Snapshot")
            restore_pvc_obj = pvc.create_restore_pvc(
                sc_name=sc_name,
                snap_name=self.snap_obj.name,
                namespace=self.snap_obj.namespace,
                size=pvc_size,
                pvc_name=restore_pvc_name,
                restore_pvc_yaml=restore_pvc_yaml,
            )
            helpers.wait_for_resource_state(
                restore_pvc_obj,
                constants.STATUS_BOUND,
                timeout=3600  # setting this to 60 Min.
                # since it can take a long time to restore, and we want it to finish.
            )
            restore_pvc_obj.reload()
            log.info("PVC was restored from the snapshot")
            test_results["restore"][
                "time"] = helpers.measure_pvc_creation_time(
                    self.interface, restore_pvc_obj.name)

            test_results["restore"]["speed"] = int(
                test_results["dataset"] / test_results["restore"]["time"])
            log.info(
                f'Snapshot restore time is : {test_results["restore"]["time"]}'
            )
            log.info(
                f'restore speed is : {test_results["restore"]["speed"]} MB/sec'
            )

            test_results["restore"][
                "csi_time"] = performance_lib.csi_pvc_time_measure(
                    self.interface, restore_pvc_obj, "create", csi_start_time)
            log.info(
                f'Snapshot csi restore time is : {test_results["restore"]["csi_time"]}'
            )

            # Step 5. Attach a new pod to the restored PVC
            restore_pod_object = helpers.create_pod(
                interface_type=self.interface,
                pvc_name=restore_pvc_obj.name,
                namespace=self.snap_obj.namespace,
            )

            # Confirm that the pod is running
            helpers.wait_for_resource_state(resource=restore_pod_object,
                                            state=constants.STATUS_RUNNING)
            restore_pod_object.reload()

            # Step 6. Verify that the file is present on the new pod also.
            log.info(f"Checking the existence of {file_name} "
                     f"on restore pod {restore_pod_object.name}")
            assert pod.check_file_existence(
                restore_pod_object,
                file_path), f"File {file_name} doesn't exist"
            log.info(f"File {file_name} exists in {restore_pod_object.name}")

            # Step 7. Verify that the md5sum matches
            log.info(
                f"Verifying that md5sum of {file_name} "
                f"on pod {self.pod_object.name} matches with md5sum "
                f"of the same file on restore pod {restore_pod_object.name}")
            assert pod.verify_data_integrity(
                restore_pod_object, file_name,
                orig_md5_sum), "Data integrity check failed"
            log.info("Data integrity check passed, md5sum are same")

            restore_pod_object.delete()
            restore_pvc_obj.delete()

            all_results.append(test_results)

        # clean the environment
        self.pod_object.delete()
        self.pvc_obj.delete()
        self.delete_test_project()

        # logging the test summary, all info in one place for easy log reading
        c_speed, c_runtime, c_csi_runtime, r_speed, r_runtime, r_csi_runtime = (
            0 for i in range(6))

        log.info("Test summary :")
        for tst in all_results:
            c_speed += tst["create"]["speed"]
            c_runtime += tst["create"]["time"]
            c_csi_runtime += tst["create"]["csi_time"]
            r_speed += tst["restore"]["speed"]
            r_runtime += tst["restore"]["time"]
            r_csi_runtime += tst["restore"]["csi_time"]

            self.full_results.all_results["creation_time"].append(
                tst["create"]["time"])
            self.full_results.all_results["csi_creation_time"].append(
                tst["create"]["csi_time"])
            self.full_results.all_results["creation_speed"].append(
                tst["create"]["speed"])
            self.full_results.all_results["restore_time"].append(
                tst["restore"]["time"])
            self.full_results.all_results["restore_speed"].append(
                tst["restore"]["speed"])
            self.full_results.all_results["restore_csi_time"].append(
                tst["restore"]["csi_time"])
            self.full_results.all_results["dataset_inMiB"] = tst["dataset"]
            log.info(
                f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. "
                f"Take snapshot time is {tst['create']['time']} "
                f"at {tst['create']['speed']} MiB/Sec "
                f"Restore from snapshot time is {tst['restore']['time']} "
                f"at {tst['restore']['speed']} MiB/Sec ")

        avg_snap_c_time = c_runtime / self.tests_numbers
        avg_snap_csi_c_time = c_csi_runtime / self.tests_numbers
        avg_snap_c_speed = c_speed / self.tests_numbers
        avg_snap_r_time = r_runtime / self.tests_numbers
        avg_snap_r_speed = r_speed / self.tests_numbers
        avg_snap_r_csi_time = r_csi_runtime / self.tests_numbers
        log.info(f" Average snapshot creation time is {avg_snap_c_time} sec.")
        log.info(
            f" Average csi snapshot creation time is {avg_snap_csi_c_time} sec."
        )
        log.info(
            f" Average snapshot creation speed is {avg_snap_c_speed} MiB/sec")
        log.info(f" Average snapshot restore time is {avg_snap_r_time} sec.")
        log.info(
            f" Average snapshot restore speed is {avg_snap_r_speed} MiB/sec")
        log.info(
            f" Average snapshot restore csi time is {avg_snap_r_csi_time} sec."
        )

        self.full_results.add_key("avg_snap_creation_time_insecs",
                                  avg_snap_c_time)
        self.full_results.add_key("avg_snap_csi_creation_time_insecs",
                                  avg_snap_csi_c_time)
        self.full_results.add_key("avg_snap_creation_speed", avg_snap_c_speed)
        self.full_results.add_key("avg_snap_restore_time_insecs",
                                  avg_snap_r_time)
        self.full_results.add_key("avg_snap_restore_speed", avg_snap_r_speed)
        self.full_results.add_key("avg_snap_restore_csi_time_insecs",
                                  avg_snap_r_csi_time)

        # Write the test results into the ES server
        log.info("writing results to elastic search server")
        if self.full_results.es_write():
            res_link = self.full_results.results_link()

            # write the ES link to the test results in the test log.
            log.info(f"The result can be found at : {res_link}")

            self.write_result_to_file(res_link)
Example #21
    def test_pvc_multiple_snapshot_performance(
        self,
        interface_iterate,
        teardown_factory,
        storageclass_factory,
        pvc_factory,
        pod_factory,
    ):
        """
        1. Creating PVC
           size depends on the storage capacity, but not less than 1 GiB
           it will use ~75% capacity of the Storage, Min storage capacity 1 TiB
        2. Fill the PVC with 80% of data
        3. Take a snapshot of the PVC and measure the time of creation.
        4. Re-write the data on the PVC
        5. Take a snapshot of the PVC and measure the time of creation.
        6. Repeat steps 4-5 for the number of snapshots to take (100 for CephFS, 512 for RBD);
           this is run by an external script to keep memory consumption low
        7. Print all measured information.

        Raises:
            StorageNotSufficientException: in case of not enough capacity

        """
        # Number of snapshots for CephFS is 100 and for RBD is 512
        num_of_snaps = 100
        if interface_iterate == constants.CEPHBLOCKPOOL:
            num_of_snaps = 512

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = int(ceph_cluster.get_ceph_capacity())

        # Use 70% of the storage capacity in the test
        capacity_to_use = int(ceph_capacity * 0.7)

        # since we do not want to use more than 65%, we add 35% to the needed
        # capacity, and minimum PVC size is 1 GiB
        need_capacity = int((num_of_snaps + 2) * 1.35)
        # Test will run only on system with enough capacity
        if capacity_to_use < need_capacity:
            err_msg = (f"The system have only {ceph_capacity} GiB, "
                       f"we want to use only {capacity_to_use} GiB, "
                       f"and we need {need_capacity} GiB to run the test")
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        pvc_size = int(capacity_to_use / (num_of_snaps + 2))
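        # Worked example (hypothetical 1 TiB cluster, RBD interface, i.e.
        # num_of_snaps = 512):
        #   capacity_to_use = int(1024 * 0.7)       = 716 GiB
        #   need_capacity   = int((512 + 2) * 1.35) = 693 GiB  -> test runs
        #   pvc_size        = int(716 / (512 + 2))  = 1 GiB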

        self.interface = interface_iterate
        self.sc_obj = storageclass_factory(self.interface)

        self.pvc_obj = pvc_factory(interface=self.interface,
                                   size=pvc_size,
                                   status=constants.STATUS_BOUND)

        self.pod_obj = pod_factory(interface=self.interface,
                                   pvc=self.pvc_obj,
                                   status=constants.STATUS_RUNNING)

        # Calculating the file size as 80% of the PVC size
        filesize = self.pvc_obj.size * 0.80
        # Change the file size to MB for the FIO function
        file_size = f"{int(filesize * constants.GB2MB)}M"
        file_name = self.pod_obj.name

        log.info(f"Total capacity size is : {ceph_capacity} GiB, "
                 f"Going to use {need_capacity} GiB, "
                 f"With {num_of_snaps} Snapshots to {pvc_size} GiB PVC. "
                 f"File size to be written is : {file_size} "
                 f"with the name of {file_name}")

        os.environ["SNAPNUM"] = f"{num_of_snaps}"
        os.environ["LOGPATH"] = f"{ocsci_log_path()}"
        os.environ["FILESIZE"] = file_size
        os.environ["NSPACE"] = self.pvc_obj.namespace
        os.environ["PODNAME"] = self.pod_obj.name
        os.environ["PVCNAME"] = self.pvc_obj.name
        os.environ["INTERFACE"] = self.interface

        main_script = "tests/e2e/performance/test_multi_snapshots.py"
        result = subprocess.run([main_script], stdout=subprocess.PIPE)
        log.info(f"Results from main script : {result.stdout.decode('utf-8')}")

        if "All results are" not in result.stdout.decode("utf-8"):
            log.error("Test did not completed")
            raise Exception("Test did not completed")
Example #22
    def validate_cluster(self, resources, instances):
        """
        Perform cluster validation - nodes readiness, Ceph cluster health
        check and functional resources tests
        """
        instances_names = list(instances.values())
        assert ocp.wait_for_nodes_ready(instances_names), (
            "Not all nodes reached status Ready"
        )

        ceph_cluster = CephCluster()
        assert ceph_health_check(
            namespace=config.ENV_DATA['cluster_namespace']
        )
        ceph_cluster.cluster_health_check(timeout=60)

        # Create resources and run IO for both FS and RBD
        # Unpack resources
        projects, secrets, pools, storageclasses, pvcs, pods = resources[:6]

        # Project
        projects.append(helpers.create_project())

        # Secrets
        secrets.append(helpers.create_secret(constants.CEPHBLOCKPOOL))
        secrets.append(helpers.create_secret(constants.CEPHFILESYSTEM))

        # Pools
        pools.append(helpers.create_ceph_block_pool())
        pools.append(helpers.get_cephfs_data_pool_name())

        # Storageclasses
        storageclasses.append(
            helpers.create_storage_class(
                interface_type=constants.CEPHBLOCKPOOL,
                interface_name=pools[0].name,
                secret_name=secrets[0].name
            )
        )
        storageclasses.append(
            helpers.create_storage_class(
                interface_type=constants.CEPHFILESYSTEM,
                interface_name=pools[1],
                secret_name=secrets[1].name
            )
        )

        # PVCs
        pvcs.append(helpers.create_pvc(
            sc_name=storageclasses[0].name, namespace=projects[0].namespace)
        )
        pvcs.append(helpers.create_pvc(
            sc_name=storageclasses[1].name, namespace=projects[0].namespace)
        )

        # Pods
        pods.append(
            helpers.create_pod(
                interface_type=constants.CEPHBLOCKPOOL, pvc_name=pvcs[0].name,
                namespace=projects[0].namespace
            )
        )
        pods.append(
            helpers.create_pod(
                interface_type=constants.CEPHFILESYSTEM, pvc_name=pvcs[1].name,
                namespace=projects[0].namespace
            )
        )

        # Run IO
        for pod in pods:
            pod.run_io('fs', '1G')
        for pod in pods:
            fio_result = pod.get_fio_results()
            logger.info(f"IOPs after FIO for pod {pod.name}:")
            logger.info(
                f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
            )
            logger.info(
                f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
            )
Example #23
    def test_pvc_snapshot_performance(self, teardown_factory, pvc_size):
        """
        1. Run I/O on a pod file.
        2. Calculate md5sum of the file.
        3. Take a snapshot of the PVC and measure the time of creation.
        4. Restore From the snapshot and measure the time
        5. Attach a new pod to it.
        6. Verify that the file is present on the new pod also.
        7. Verify that the md5sum of the file on the new pod matches
           with the md5sum of the file on the original pod.

        This scenario runs 3 times and reports all results
        Args:
            teardown_factory: A fixture to destroy objects
            pvc_size: the size of the PVC to be tested - parametrize

        """

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()

        log.info(f"Total capacity size is : {ceph_capacity}")
        log.info(f"PVC Size is : {pvc_size}")
        log.info(f"Needed capacity is {int(int(pvc_size) * 5)}")
        if int(ceph_capacity) < int(pvc_size) * 5:
            log.error(
                f"PVC size is {pvc_size}GiB and it is too large for this system"
                f" which have only {ceph_capacity}GiB")
            return
        # Calculating the file size as 25% of the PVC size
        # in the end the PVC will be 75% full
        filesize = self.pvc_obj.size * 0.25
        # Change the file size to MB and from int to str
        file_size = f"{int(filesize * 1024)}M"

        all_results = []

        for test_num in range(self.tests_numbers):
            test_results = {
                "test_num": test_num + 1,
                "dataset": (test_num + 1) * filesize * 1024,  # size in MiB
                "create": {
                    "time": None,
                    "speed": None
                },
                "restore": {
                    "time": None,
                    "speed": None
                },
            }
            log.info(f"Starting test phase number {test_num}")
            # Step 1. Run I/O on a pod file.
            file_name = f"{self.pod_obj.name}-{test_num}"
            log.info(f"Starting IO on the POD {self.pod_obj.name}")
            # Going to run only write IO to fill the PVC for the snapshot
            self.pod_obj.fillup_fs(size=file_size, fio_filename=file_name)

            # Wait for fio to finish
            fio_result = self.pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"IO error on pod {self.pod_obj.name}. FIO result: {fio_result}"
            log.info("IO on the PVC Finished")

            # Verify presence of the file
            file_path = pod.get_file_path(self.pod_obj, file_name)
            log.info(f"Actual file path on the pod {file_path}")
            assert pod.check_file_existence(
                self.pod_obj, file_path), f"File {file_name} doesn't exist"
            log.info(f"File {file_name} exists in {self.pod_obj.name}")

            # Step 2. Calculate md5sum of the file.
            orig_md5_sum = pod.cal_md5sum(self.pod_obj, file_name)

            # Step 3. Take a snapshot of the PVC and measure the time of creation.
            snap_name = self.pvc_obj.name.replace("pvc-test",
                                                  f"snapshot-test{test_num}")
            log.info(f"Taking snapshot of the PVC {snap_name}")

            test_results["create"]["time"] = self.measure_create_snapshot_time(
                pvc_name=self.pvc_obj.name,
                snap_name=snap_name,
                interface=self.interface,
            )
            test_results["create"]["speed"] = int(
                test_results["dataset"] / test_results["create"]["time"])
            log.info(
                f' Test {test_num} dataset is {test_results["dataset"]} MiB')
            log.info(
                f'Snapshot creation time is : {test_results["create"]["time"]} sec.'
            )
            log.info(
                f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec'
            )

            # Step 4. Restore the PVC from the snapshot and measure the time
            # Same Storage class of the original PVC
            sc_name = self.pvc_obj.backed_sc

            # Size should be same as of the original PVC
            pvc_size = str(self.pvc_obj.size) + "Gi"

            # Create pvc out of the snapshot
            # Both the snapshot and the restored PVC should be in the same namespace

            log.info("Restoring from the Snapshot")
            restore_pvc_name = self.pvc_obj.name.replace(
                "pvc-test", f"restore-pvc{test_num}")
            restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
            if self.interface == constants.CEPHFILESYSTEM:
                restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

            log.info("Resorting the PVC from Snapshot")
            restore_pvc_obj = pvc.create_restore_pvc(
                sc_name=sc_name,
                snap_name=self.snap_obj.name,
                namespace=self.snap_obj.namespace,
                size=pvc_size,
                pvc_name=restore_pvc_name,
                restore_pvc_yaml=restore_pvc_yaml,
            )
            helpers.wait_for_resource_state(
                restore_pvc_obj,
                constants.STATUS_BOUND,
                timeout=3600  # setting this to 60 Min.
                # since it can take a long time to restore, and we want it to finish.
            )
            teardown_factory(restore_pvc_obj)
            restore_pvc_obj.reload()
            log.info("PVC was restored from the snapshot")
            test_results["restore"][
                "time"] = helpers.measure_pvc_creation_time(
                    self.interface, restore_pvc_obj.name)
            test_results["restore"]["speed"] = int(
                test_results["dataset"] / test_results["restore"]["time"])
            log.info(
                f'Snapshot restore time is : {test_results["restore"]["time"]}'
            )
            log.info(
                f'restore speed is : {test_results["restore"]["speed"]} MB/sec')

            # Step 5. Attach a new pod to the restored PVC
            restore_pod_obj = helpers.create_pod(
                interface_type=self.interface,
                pvc_name=restore_pvc_obj.name,
                namespace=self.snap_obj.namespace,
                pod_dict_path=constants.NGINX_POD_YAML,
            )

            # Confirm that the pod is running
            helpers.wait_for_resource_state(resource=restore_pod_obj,
                                            state=constants.STATUS_RUNNING)
            teardown_factory(restore_pod_obj)
            restore_pod_obj.reload()

            # Step 6. Verify that the file is present on the new pod also.
            log.info(f"Checking the existence of {file_name} "
                     f"on restore pod {restore_pod_obj.name}")
            assert pod.check_file_existence(
                restore_pod_obj, file_path), f"File {file_name} doesn't exist"
            log.info(f"File {file_name} exists in {restore_pod_obj.name}")

            # Step 7. Verify that the md5sum matches
            log.info(f"Verifying that md5sum of {file_name} "
                     f"on pod {self.pod_obj.name} matches with md5sum "
                     f"of the same file on restore pod {restore_pod_obj.name}")
            assert pod.verify_data_integrity(
                restore_pod_obj, file_name,
                orig_md5_sum), "Data integrity check failed"
            log.info("Data integrity check passed, md5sum are same")

            all_results.append(test_results)

        # logging the test summary, all info in one place for easy log reading
        c_speed, c_runtime, r_speed, r_runtime = (0 for i in range(4))
        log.info("Test summary :")
        for tst in all_results:
            c_speed += tst["create"]["speed"]
            c_runtime += tst["create"]["time"]
            r_speed += tst["restore"]["speed"]
            r_runtime += tst["restore"]["time"]
            log.info(
                f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. "
                f"Take snapshot time is {tst['create']['time']} "
                f"at {tst['create']['speed']} MiB/Sec "
                f"Restore from snapshot time is {tst['restore']['time']} "
                f"at {tst['restore']['speed']} MiB/Sec ")
        log.info(
            f" Average snapshot creation time is {c_runtime / self.tests_numbers} sec."
        )
        log.info(
            f" Average snapshot creation speed is {c_speed / self.tests_numbers} MiB/sec"
        )
        log.info(
            f" Average snapshot restore time is {r_runtime / self.tests_numbers} sec."
        )
        log.info(
            f" Average snapshot restore speed is {r_speed / self.tests_numbers} MiB/sec"
        )
Example #24
def run_ocs_upgrade(operation=None, *operation_args, **operation_kwargs):
    """
    Run upgrade procedure of OCS cluster

    Args:
        operation (function): Function to run
        operation_args (iterable): Function's arguments
        operation_kwargs (map): Function's keyword arguments

    """

    ceph_cluster = CephCluster()
    original_ocs_version = config.ENV_DATA.get("ocs_version")
    upgrade_in_current_source = config.UPGRADE.get("upgrade_in_current_source",
                                                   False)
    upgrade_ocs = OCSUpgrade(
        namespace=config.ENV_DATA["cluster_namespace"],
        version_before_upgrade=original_ocs_version,
        ocs_registry_image=config.UPGRADE.get("upgrade_ocs_registry_image"),
        upgrade_in_current_source=upgrade_in_current_source,
    )
    upgrade_version = upgrade_ocs.get_upgrade_version()
    assert (
        upgrade_ocs.get_parsed_versions()[1] >=
        upgrade_ocs.get_parsed_versions()[0]), (
            f"Version you would like to upgrade to: {upgrade_version} "
            f"is not higher or equal to the version you currently running: "
            f"{upgrade_ocs.version_before_upgrade}")
    # create external cluster object
    if config.DEPLOYMENT["external_mode"]:
        host = config.EXTERNAL_MODE["external_cluster_node_roles"]["node1"][
            "ip_address"]
        user = config.EXTERNAL_MODE["login"]["username"]
        password = config.EXTERNAL_MODE["login"]["password"]
        external_cluster = ExternalCluster(host, user, password)

    # For external cluster, create the secrets if the upgrade version is 4.8
    if (config.DEPLOYMENT["external_mode"] and original_ocs_version == "4.7"
            and upgrade_version == "4.8"):
        external_cluster.create_object_store_user()
        access_key = config.EXTERNAL_MODE.get("access_key_rgw-admin-ops-user",
                                              "")
        secret_key = config.EXTERNAL_MODE.get("secret_key_rgw-admin-ops-user",
                                              "")
        if not (access_key and secret_key):
            raise ExternalClusterRGWAdminOpsUserException(
                "Access and secret key for rgw-admin-ops-user not found")
        cmd = (
            f'oc create secret generic --type="kubernetes.io/rook"'
            f' "rgw-admin-ops-user" --from-literal=accessKey={access_key} --from-literal=secretKey={secret_key}'
        )
        exec_cmd(cmd)

    csv_name_pre_upgrade = upgrade_ocs.get_csv_name_pre_upgrade()
    pre_upgrade_images = upgrade_ocs.get_pre_upgrade_image(
        csv_name_pre_upgrade)
    upgrade_ocs.load_version_config_file(upgrade_version)
    if config.DEPLOYMENT.get("disconnected") and not config.DEPLOYMENT.get(
            "disconnected_env_skip_image_mirroring"):
        upgrade_ocs.ocs_registry_image = prepare_disconnected_ocs_deployment(
            upgrade=True)
        log.info(
            f"Disconnected upgrade - new image: {upgrade_ocs.ocs_registry_image}"
        )

    with CephHealthMonitor(ceph_cluster):
        channel = upgrade_ocs.set_upgrade_channel()
        upgrade_ocs.set_upgrade_images()
        ui_upgrade_supported = False
        if config.UPGRADE.get("ui_upgrade"):
            if (version.get_semantic_ocp_version_from_config()
                    == version.VERSION_4_9 and original_ocs_version == "4.8"
                    and upgrade_version == "4.9"):
                ui_upgrade_supported = True
            else:
                log.warning(
                    "UI upgrade combination is not supported. It will fallback to CLI upgrade"
                )
            if ui_upgrade_supported:
                ocs_odf_upgrade_ui()
        else:
            if (config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM
                ) and not (upgrade_in_current_source):
                create_ocs_secret(config.ENV_DATA["cluster_namespace"])
            if upgrade_version != "4.9":
                # In the case of upgrade to ODF 4.9, the ODF operator should upgrade
                # OCS automatically.
                upgrade_ocs.update_subscription(channel)
            if original_ocs_version == "4.8" and upgrade_version == "4.9":
                deployment = Deployment()
                deployment.subscribe_ocs()
            else:
                # In the case upgrade is not from 4.8 to 4.9 and we have manual approval strategy
                # we need to wait and approve install plan, otherwise it's approved in the
                # subscribe_ocs method.
                subscription_plan_approval = config.DEPLOYMENT.get(
                    "subscription_plan_approval")
                if subscription_plan_approval == "Manual":
                    wait_for_install_plan_and_approve(
                        config.ENV_DATA["cluster_namespace"])
            if (config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM
                ) and not (upgrade_in_current_source):
                for attempt in range(2):
                    # We need to do it twice, because some of the SA are updated
                    # after the first load of OCS pod after upgrade. So we need to
                    # link updated SA again.
                    log.info(f"Sleep 1 minute before attempt: {attempt + 1}/2 "
                             "of linking secret/SAs")
                    time.sleep(60)
                    link_all_sa_and_secret_and_delete_pods(
                        constants.OCS_SECRET,
                        config.ENV_DATA["cluster_namespace"])
        if operation:
            log.info(f"Calling test function: {operation}")
            _ = operation(*operation_args, **operation_kwargs)
            # Workaround for issue #2531
            time.sleep(30)
            # End of workaround

        for sample in TimeoutSampler(
                timeout=725,
                sleep=5,
                func=upgrade_ocs.check_if_upgrade_completed,
                channel=channel,
                csv_name_pre_upgrade=csv_name_pre_upgrade,
        ):
            try:
                if sample:
                    log.info("Upgrade success!")
                    break
            except TimeoutException:
                raise TimeoutException("No new CSV found after upgrade!")
        old_image = upgrade_ocs.get_images_post_upgrade(
            channel, pre_upgrade_images, upgrade_version)
    verify_image_versions(
        old_image,
        upgrade_ocs.get_parsed_versions()[1],
        upgrade_ocs.version_before_upgrade,
    )

    # update external secrets
    if config.DEPLOYMENT["external_mode"]:
        upgrade_version = version.get_semantic_version(upgrade_version, True)
        if upgrade_version >= version.VERSION_4_10:
            external_cluster.update_permission_caps()
        else:
            external_cluster.update_permission_caps(EXTERNAL_CLUSTER_USER)
        external_cluster.get_external_cluster_details()

        # update the external cluster details in secrets
        log.info("updating external cluster secret")
        external_cluster_details = NamedTemporaryFile(
            mode="w+",
            prefix="external-cluster-details-",
            delete=False,
        )
        with open(external_cluster_details.name, "w") as fd:
            decoded_external_cluster_details = decode(
                config.EXTERNAL_MODE["external_cluster_details"])
            fd.write(decoded_external_cluster_details)
        cmd = (
            f"oc set data secret/rook-ceph-external-cluster-details -n {constants.OPENSHIFT_STORAGE_NAMESPACE} "
            f"--from-file=external_cluster_details={external_cluster_details.name}"
        )
        exec_cmd(cmd)

    ocs_install_verification(
        timeout=600,
        skip_osd_distribution_check=True,
        ocs_registry_image=upgrade_ocs.ocs_registry_image,
        post_upgrade_verification=True,
        version_before_upgrade=upgrade_ocs.version_before_upgrade,
    )
Example #25
class TestFullClusterHealth(PASTest):
    """
    Test cluster health when the storage is ~85% full
    """

    @pytest.fixture(autouse=True)
    def setup(self, request, nodes):
        """
        Setting up test parameters
        """

        def teardown():
            logger.info("cleanup the environment")
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(teardown)

        logger.info("Starting the test setup")
        self.percent_to_fill = 85.0
        self.ceph_cluster = CephCluster()
        self.nodes = None

        self.benchmark_name = "FIO"
        self.client_pod_name = "fio-client"

        self.sanity_helpers = sanity_helpers.Sanity()

        super(TestFullClusterHealth, self).setup()
        # deploy the benchmark-operator
        self.deploy_benchmark_operator()

    def run(self):
        """

        Run the test, and wait until it finished
        """

        self.deploy_and_wait_for_wl_to_start(timeout=900)
        self.wait_for_wl_to_finish(sleep=300)

        try:
            if "Fio failed to execute" not in self.test_logs:
                logger.info("FIO has completed successfully")
        except IOError:
            logger.warning("FIO failed to complete")

    def calculate_crd_data(self):
        """
        Getting the storage capacity and calculate pod count and pvc size

        """

        ceph_used_capacity_percent = get_percent_used_capacity()
        logger.info(f"Ceph used capacity percent is {ceph_used_capacity_percent}%")

        ceph_capacity = self.ceph_cluster.get_ceph_capacity()
        logger.info(f"Total storage capacity is {ceph_capacity} GiB")

        self.percent_to_fill = self.percent_to_fill - ceph_used_capacity_percent
        logger.info(f"Percentage to fill is {self.percent_to_fill}%")

        self.total_data_set = int(ceph_capacity * (int(self.percent_to_fill) / 100))
        self.filesize = int(
            self.crd_data["spec"]["workload"]["args"]["filesize"].replace("GiB", "")
        )

        # Make sure that filesize>=10 and servers<=60
        self.servers = 60
        self.filesize = int(self.total_data_set / self.servers)
        if self.filesize < 10:
            self.filesize = 10
            self.servers = int(self.total_data_set / self.filesize)
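        # Worked example (hypothetical numbers): with total_data_set = 300 GiB
        # the first pass gives filesize = int(300 / 60) = 5 GiB, which is below
        # the 10 GiB minimum, so filesize is bumped to 10 GiB and the server
        # count is reduced to int(300 / 10) = 30.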

        self.crd_data["spec"]["workload"]["args"]["filesize"] = f"{self.filesize}GiB"
        self.crd_data["spec"]["workload"]["args"][
            "storagesize"
        ] = f"{int(self.total_data_set)}Gi"
        self.crd_data["spec"]["workload"]["args"]["servers"] = self.servers
        self.crd_data["spec"]["workload"]["args"]["bs"] = "1024KiB"
        self.crd_data["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
        self.crd_data["spec"]["workload"]["args"]["iodepth"] = 1

    def delete_pods(self):
        """
        Try to delete pods:
            - Rook operator
            - OSD
            - MGR
            - MON
        """
        pod_list = []
        rook_operator_pod = pod.get_ocs_operator_pod(
            ocs_label=constants.OPERATOR_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        pod_list.append(rook_operator_pod)

        osd_pods = pod.get_osd_pods()
        pod_list.extend(osd_pods)

        mgr_pods = pod.get_mgr_pods()
        pod_list.extend(mgr_pods)

        mon_pods = pod.get_mon_pods()
        pod_list.extend(mon_pods)

        logger.info(f"Deleting pods: {[p.name for p in pod_list]}")
        pod.delete_pods(pod_objs=pod_list)

    def ceph_not_health_error(self):
        """
        Check if Ceph is NOT in "HEALTH_ERR" state
        Warning state is ok since the cluster is low in storage space

        Returns:
            bool: True if Ceph state is NOT "HEALTH_ERR"
        """
        ceph_status = self.ceph_cluster.get_ceph_health()
        logger.info(f"Ceph status is: {ceph_status}")
        return ceph_status != "HEALTH_ERR"

    def mgr_pod_node_restart(self):
        """
        Restart node that runs mgr pod
        """
        mgr_pod_obj = pod.get_mgr_pods()
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        self.nodes.restart_nodes([mgr_node_obj])

        wait_for_nodes_status()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(
            condition="Running", selector="app=rook-ceph-mgr", timeout=600
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

    def restart_ocs_operator_node(self):
        """
        Restart node that runs OCS operator pod
        """

        pod_obj = pod.get_ocs_operator_pod()
        node_obj = pod.get_pod_node(pod_obj)

        self.nodes.restart_nodes([node_obj])

        wait_for_nodes_status()

        pod.wait_for_pods_to_be_running(
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, pod_names=[pod_obj.name]
        )

    def is_cluster_healthy(self):
        """
        Wrapper function for cluster health check

        Returns:
            bool: True if ALL checks passed, False otherwise
        """
        return self.ceph_not_health_error() and pod.wait_for_pods_to_be_running()

    @system_test
    @polarion_id("OCS-2749")
    def test_full_cluster_health(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Verify that the cluster health is ok when the storage is ~85% full

        Steps:
          1. Deploy benchmark operator and run fio workload
          2. Check Ceph health before/after each operation:
            2.1 Osd node reboot
            2.2 Mgr node reboot
            2.3 OCS operator node reboot
            2.4 Delete Rook, OSD, MGR & MON pods
            2.5 Creation and deletion of resources

        """
        self.nodes = nodes

        self.full_log_path = get_full_test_logs_path(cname=self)
        logger.info(f"Logs file path name is : {self.full_log_path}")

        logger.info("Create resource file for fio workload")
        self.crd_data = templating.load_yaml(constants.FIO_CR_YAML)
        self.calculate_crd_data()

        self.set_storageclass(interface=constants.CEPHBLOCKPOOL)

        self.run()

        logger.info("Checking health before disruptive operations")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        osd_node_reboot()
        logger.info("Checking health after OSD node reboot")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.mgr_pod_node_restart()
        logger.info("Checking health after worker node shutdown")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.restart_ocs_operator_node()
        logger.info("Checking health after OCS operator node restart")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.delete_pods()
        logger.info("Checking health after Rook, OSD, MGR & MON pods deletion")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        # Create resources
        logger.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        logger.info("Resources Created")

        # Delete resources
        logger.info("Deleting resources")
        self.sanity_helpers.delete_resources()
        logger.info("Resources Deleted")

        logger.info(
            "Checking health after resources creation and deletion using sanity helpers"
        )
        assert self.is_cluster_healthy(), "Cluster is not healthy"
Example #26
def test_upgrade():
    ceph_cluster = CephCluster()
    ceph_cluster.enable_health_monitor()
    namespace = config.ENV_DATA['cluster_namespace']
    ocs_catalog = CatalogSource(
        resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME,
        namespace=constants.MARKETPLACE_NAMESPACE,
    )
    version_before_upgrade = config.ENV_DATA.get("ocs_version")
    upgrade_version = config.UPGRADE.get(
        "upgrade_ocs_version", version_before_upgrade
    )
    parsed_version_before_upgrade = parse_version(version_before_upgrade)
    parsed_upgrade_version = parse_version(upgrade_version)
    assert parsed_upgrade_version >= parsed_version_before_upgrade, (
        f"Version you would like to upgrade to: {upgrade_version} "
        f"is not higher or equal to the version you currently running: "
        f"{version_before_upgrade}"
    )
    version_change = parsed_upgrade_version > parsed_version_before_upgrade
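    # parse_version (typically from pkg_resources / packaging) gives semantic
    # ordering, e.g. parse_version("4.10") > parse_version("4.9"), which a
    # plain string comparison would get wrong.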
    if version_change:
        version_config_file = os.path.join(
            constants.CONF_DIR, 'ocs_version', f'ocs-{upgrade_version}.yaml'
        )
        assert os.path.exists(version_config_file), (
            f"OCS version config file {version_config_file} doesn't exist!"
        )
        with open(
            os.path.abspath(os.path.expanduser(version_config_file))
        ) as file_stream:
            custom_config_data = yaml.safe_load(file_stream)
            config.update(custom_config_data)
    image_url = ocs_catalog.get_image_url()
    image_tag = ocs_catalog.get_image_name()
    log.info(f"Current image is: {image_url}, tag: {image_tag}")
    ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image')
    if ocs_registry_image:
        image_url, new_image_tag = ocs_registry_image.split(':')
    elif config.UPGRADE.get('upgrade_to_latest', True) or version_change:
        new_image_tag = get_latest_ds_olm_tag()
    else:
        new_image_tag = get_next_version_available_for_upgrade(image_tag)
    cs_data = deepcopy(ocs_catalog.data)
    image_for_upgrade = ':'.join([image_url, new_image_tag])
    log.info(f"Image: {image_for_upgrade} will be used for upgrade.")
    cs_data['spec']['image'] = image_for_upgrade
    operator_selector = get_selector_for_ocs_operator()
    package_manifest = PackageManifest(
        resource_name=OCS_OPERATOR_NAME, selector=operator_selector,
    )
    csv_name_pre_upgrade = package_manifest.get_current_csv()
    log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
    csv_pre_upgrade = CSV(
        resource_name=csv_name_pre_upgrade,
        namespace=namespace
    )
    pre_upgrade_images = get_images(csv_pre_upgrade.get())

    with NamedTemporaryFile() as cs_yaml:
        dump_data_to_temp_yaml(cs_data, cs_yaml.name)
        ocs_catalog.apply(cs_yaml.name)
    # Wait for the package manifest to be ready
    package_manifest.wait_for_resource()
    subscription_plan_approval = config.DEPLOYMENT.get(
        'subscription_plan_approval'
    )
    if subscription_plan_approval == 'Manual':
        wait_for_install_plan_and_approve(namespace)
    attempts = 145
    for attempt in range(1, attempts + 1):
        if attempts == attempt:
            raise TimeoutException("No new CSV found after upgrade!")
        log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.")
        package_manifest.reload_data()
        csv_name_post_upgrade = package_manifest.get_current_csv()
        if csv_name_post_upgrade == csv_name_pre_upgrade:
            log.info(f"CSV is still: {csv_name_post_upgrade}")
            sleep(5)
        else:
            log.info(f"CSV now upgraded to: {csv_name_post_upgrade}")
            break
    csv_post_upgrade = CSV(
        resource_name=csv_name_post_upgrade,
        namespace=namespace
    )
    log.info(
        f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state"
    )
    if version_before_upgrade == '4.2' and upgrade_version == '4.3':
        log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3")
        setup_ceph_toolbox(force_setup=True)
    csv_post_upgrade.wait_for_phase("Succeeded", timeout=600)
    post_upgrade_images = get_images(csv_post_upgrade.get())
    old_images, _, _ = get_upgrade_image_info(
        pre_upgrade_images, post_upgrade_images
    )
    verify_image_versions(old_images, parsed_upgrade_version)
    ocs_install_verification(timeout=600, skip_osd_distribution_check=True)
    ceph_cluster.disable_health_monitor()
    if ceph_cluster.health_error_status:
        raise CephHealthException(
            f"During upgrade hit Ceph HEALTH_ERROR: "
            f"{ceph_cluster.health_error_status}"
        )
    def setup(
        self,
        request,
        scenario,
        num_of_nodes,
        num_of_fail_nodes,
        disrupt_provisioner,
        project_factory,
        multi_pvc_factory,
        dc_pod_factory,
    ):
        """
        Identify the nodes and start DeploymentConfig based app pods using
        PVC with ReadWriteOnce (RWO) access mode on selected nodes

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (eg., 'colocated', 'dedicated')
            num_of_nodes (int): number of nodes required for running test
            num_of_fail_nodes (int): number of nodes to make unresponsive during test
            disrupt_provisioner (bool): True to disrupt the leader provisioner
                pods if not running on selected nodes, else False
            project_factory: A fixture to create new project
            multi_pvc_factory: A fixture create a set of new PVCs
            dc_pod_factory: A fixture to create deploymentconfig pods

        Returns:
            tuple: containing the params used in test cases

        """
        ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(
            scenario, num_of_nodes)
        test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes
        logger.info(f"Using nodes {test_nodes} for running test")

        def finalizer():
            helpers.remove_label_from_worker_node(node_list=test_nodes,
                                                  label_key="nodetype")

        request.addfinalizer(finalizer)

        ceph_cluster = CephCluster()
        project = project_factory()

        # Wait for mon pods to reach expected count
        # Bug 1778273 - [RFE]: Configure 5 MONs for OCS cluster with 5 or more nodes
        # This wait is required for some of the previous OCS versions (< 4.5)
        current_mon_count = int(
            ceph_cluster.CEPHCLUSTER.get_resource(resource_name="",
                                                  column="MONCOUNT"))
        assert ceph_cluster.POD.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=current_mon_count,
            timeout=900,
        )
        ceph_cluster.mons = []
        ceph_cluster.scan_cluster()

        # Select nodes for running app pods and inducing network failure later
        app_pod_nodes = self.select_nodes_for_app_pods(scenario, ceph_cluster,
                                                       ocs_nodes,
                                                       non_ocs_nodes,
                                                       num_of_fail_nodes)

        # Create multiple RBD and CephFS backed PVCs with RWO accessmode
        num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes
        rbd_pvcs = multi_pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            project=project,
            size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO],
            num_of_pvc=num_of_pvcs,
        )
        cephfs_pvcs = multi_pvc_factory(
            interface=constants.CEPHFILESYSTEM,
            project=project,
            size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO],
            num_of_pvc=num_of_pvcs,
        )

        # Create deploymentconfig based pods
        dc_pods = []
        # Start app-pods on selected node(s)
        for node_name in app_pod_nodes:
            logger.info(f"Starting app pods on the node {node_name}")
            helpers.label_worker_node(node_list=[node_name],
                                      label_key="nodetype",
                                      label_value="app-pod")

            for num in range(self.num_of_app_pods_per_node):
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHBLOCKPOOL,
                        pvc=rbd_pvcs.pop(0),
                        node_selector={"nodetype": "app-pod"},
                    ))
                assert pod.verify_node_name(
                    dc_pods[-1], node_name
                ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHFILESYSTEM,
                        pvc=cephfs_pvcs.pop(0),
                        node_selector={"nodetype": "app-pod"},
                    ))
                assert pod.verify_node_name(
                    dc_pods[-1], node_name
                ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
            helpers.remove_label_from_worker_node(node_list=[node_name],
                                                  label_key="nodetype")

        # Label other test nodes to be able to run app pods later
        helpers.label_worker_node(node_list=test_nodes,
                                  label_key="nodetype",
                                  label_value="app-pod")

        # Get ceph mon,osd pods running on selected node if colocated scenario
        # and extra OCS nodes are present
        # Recovery steps for MON and OSDS not required from OCS 4.4 onwards
        # Refer to BZ 1830015 and BZ 1835908
        ceph_pods = []
        if float(config.ENV_DATA["ocs_version"]) < 4.4 and (
                scenario == "colocated" and len(test_nodes) > 3):
            pods_to_check = ceph_cluster.osds
            # Skip mon pods if mon_count is 5 as there may not be enough nodes
            # for all mons to run after multiple node failures
            if ceph_cluster.mon_count == 3:
                pods_to_check.extend(ceph_cluster.mons)
            for pod_obj in pods_to_check:
                if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]:
                    ceph_pods.append(pod_obj)
            logger.info(
                f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
            )

        disruptor = []
        if disrupt_provisioner:
            disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

        return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
    def test_scale_osds_reboot_nodes(self, interface, project_factory,
                                     multi_pvc_factory, dc_pod_factory):
        """
        Check storage utilization; if it is less than 10%, run IO until it is.
        Scale OSDs from 3 to 6, check for rebalance and reboot worker nodes
        """
        current_osd_count = count_cluster_osd()
        proj_obj = project_factory()
        if current_osd_count == 3:
            while not validate_osd_utilization(osd_used=10):
                # Create pvc
                pvc_objs = multi_pvc_factory(project=proj_obj,
                                             interface=interface,
                                             size=self.pvc_size,
                                             num_of_pvc=self.num_of_pvcs)

                dc_pod_objs = list()
                for pvc_obj in pvc_objs:
                    dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))

                wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs,
                                                            timeout=1200)

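                # Run IO from each app pod; the surrounding while loop repeats
                # this until the OSD utilization threshold is reached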
                for pod_obj in dc_pod_objs:
                    pod_obj.run_io(storage_type='fs',
                                   size='3G',
                                   runtime='60',
                                   fio_filename=f'{pod_obj.name}_io')

        # Add capacity
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
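        # Wait until all OSD pods (3 per storage device set) reach the Running
        # state after the capacity add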
        pod = OCP(kind=constants.POD,
                  namespace=config.ENV_DATA['cluster_namespace'])
        pod.wait_for_resource(timeout=300,
                              condition=constants.STATUS_RUNNING,
                              selector='app=rook-ceph-osd',
                              resource_count=count * 3)
        assert ceph_health_check(
            delay=120, tries=50), "New OSDs failed to reach running state"

        cluster = CephCluster()

        # Get rebalance status
        rebalance_status = cluster.get_rebalance_status()
        logger.info(rebalance_status)
        if rebalance_status:
            time_taken = cluster.time_taken_to_complete_rebalance()
            logger.info(f"The time taken to complete rebalance {time_taken}")

        # Rolling reboot on worker nodes
        worker_nodes = get_typed_nodes(node_type='worker')

        factory = platform_nodes.PlatformNodesFactory()
        nodes = factory.get_nodes_platform()

        for node in worker_nodes:
            nodes.restart_nodes(nodes=[node])
            wait_for_nodes_status()

        assert ceph_health_check(
            delay=180), "Failed, Ceph health bad after nodes reboot"
    def test_vdbench_workload(self, template, with_ocs, load, label_nodes,
                              ripsaw, servers, threads, blocksize, fileio,
                              samples, width, depth, files, file_size, runtime,
                              pause):
        """
        Run VDBench Workload

        Args :
            template (str) : Name of the yaml file that will be used as a template
            with_ocs (bool) : Indicates whether the test will run on the same
                              nodes as the OCS pods
            load (int) : Load to run on the storage, as a percentage of the capacity.
            label_nodes (fixture) : Fixture that labels the worker node(s)
                                    that will be used for the app pod(s)
            ripsaw (fixture) : Fixture to deploy the ripsaw benchmarking operator
            servers (int) : Number of servers (pods) that will run the IO
            threads (int) : Number of threads that will run on each server
            blocksize (list of str) : List of block sizes - each must include the 'K' suffix
            fileio (str) : How to select files for the IO : random / sequential
            samples (int) : Number of time(s) to run each test
            width (int) : Width of directory tree to create
            depth (int) : Depth of directory tree to create
            files (int) : Number of files to create in each directory
            file_size (int) : File size (in MB) to create
            runtime (int) : Time (in Sec.) for each test iteration
            pause (int) : Time (in Min.) to pause between each test iteration.
        """
        log.info(f'going to use {template} as template')
        log.info("Apply Operator CRD")

        crd = 'resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml'
        ripsaw.apply_crd(crd)

        log.info('Running vdbench benchmark')
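        # Use the given template if provided, otherwise fall back to the
        # default vdbench benchmark yaml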
        if template:
            template = os.path.join(constants.TEMPLATE_VDBENCH_DIR, template)
        else:
            template = constants.VDBENCH_BENCHMARK_YAML
        sf_data = templating.load_yaml(template)

        target_results = template + 'Results'
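        # Each parameter value set below is appended to target_results so the
        # results archive name identifies the run configuration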

        log.info('Calculating Storage size....')
        ceph_cluster = CephCluster()
        total_capacity = ceph_cluster.get_ceph_capacity()
        assert total_capacity > constants.VDBENCH_MIN_CAPACITY, (
            "Storage capacity is too low for performance testing")
        log.info(f'The Total usable capacity is {total_capacity}')

        if load:
            width = constants.VDBENCH_WIDTH
            depth = constants.VDBENCH_DEPTH
            file_size = constants.VDBENCH_FILE_SIZE
            capacity_per_pod = constants.VDBENCH_CAP_PER_POD
            total_dirs = width**depth
            log.info(f'The total dirs in the tree {total_dirs}')
            log.info(f'Going to run with {load} % of the capacity load.')
            tested_capacity = round(total_capacity * 1024 * load / 100)
            log.info(f'Tested capacity is {tested_capacity} MB')
            servers = round(tested_capacity / capacity_per_pod)
            """
            Spread the application pods evenly across all worker (or application)
            nodes, with at least 2 app pods per node.
            """
            nodes = len(
                node.get_typed_nodes(node_type=constants.WORKER_MACHINE))
            if not with_ocs:
                nodes = len(
                    machine.get_labeled_nodes(
                        f'node-role.kubernetes.io/app={constants.APP_NODE_LABEL}'
                    ))
            log.info(f'Going to use {nodes} nodes for the test !')
            servers = round(servers / nodes) * nodes
            if servers < (nodes * 2):
                servers = nodes * 2

            files = round(tested_capacity / servers / total_dirs)
            total_files = round(files * servers * total_dirs)
            log.info(f'number of pods is {servers}')
            log.info(f'Going to create {total_files} files !')
            log.info(f'number of files in dir is {files}')
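            # Worked example with hypothetical numbers: total_capacity=1000 GiB,
            # load=50 and capacity_per_pod=80000 MB give
            # tested_capacity = round(1000 * 1024 * 50 / 100) = 512000 MB and
            # servers = round(512000 / 80000) = 6; on 3 nodes this stays 6,
            # which also satisfies the minimum of 2 pods per node.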
        """
            Setting up the parameters for this test
        """
        if servers:
            sf_data['spec']['workload']['args']['servers'] = servers
            target_results = target_results + '-' + str(servers)
        if threads:
            sf_data['spec']['workload']['args']['threads'] = threads
            target_results = target_results + '-' + str(threads)
        if fileio:
            sf_data['spec']['workload']['args']['fileio'] = fileio
            target_results = target_results + '-' + str(fileio)
        if samples:
            sf_data['spec']['workload']['args']['samples'] = samples
            target_results = target_results + '-' + str(samples)
        if width:
            sf_data['spec']['workload']['args']['width'] = width
            target_results = target_results + '-' + str(width)
        if depth:
            sf_data['spec']['workload']['args']['depth'] = depth
            target_results = target_results + '-' + str(depth)
        if files:
            sf_data['spec']['workload']['args']['files'] = files
            target_results = target_results + '-' + str(files)
        if file_size:
            sf_data['spec']['workload']['args']['file_size'] = file_size
            target_results = target_results + '-' + str(file_size)
        if runtime:
            sf_data['spec']['workload']['args']['runtime'] = runtime
            target_results = target_results + '-' + str(runtime)
        if pause:
            sf_data['spec']['workload']['args']['pause'] = pause
            target_results = target_results + '-' + str(pause)
        if len(blocksize) > 0:
            sf_data['spec']['workload']['args']['bs'] = blocksize
            target_results = target_results + '-' + '_'.join(blocksize)
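        # When running on the same nodes as OCS, the pin_server argument is
        # dropped from the workload arguments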
        if with_ocs:
            if sf_data['spec']['workload']['args']['pin_server']:
                del sf_data['spec']['workload']['args']['pin_server']
        """
            Calculate the size of the volume to be tested. It should be larger
            than the total size of all the files (a 1.3 factor is applied
            below) and at least 100Gi.
            Since file_size is in MB and the volume size needs to be in Gi,
            a unit conversion is required.
        """
        vol_size = int((files * total_dirs) * file_size * 1.3)
        log.info('number of files to create : {}'.format(
            int(files * (width**depth))))
        log.info(f'The size of all files is : {vol_size}MB')
        vol_size = int(vol_size / 1024)
        if vol_size < 100:
            vol_size = 100
        sf_data['spec']['workload']['args']['storagesize'] = f'{vol_size}Gi'

        log.debug(f'output of configuration file is {sf_data}')

        timeout = 86400  # 3600 (1 hour) * 24 = one day

        sf_obj = OCS(**sf_data)
        sf_obj.create()
        # wait for benchmark pods to get created - takes a while
        for bench_pod in TimeoutSampler(300, 10, get_pod_name_by_pattern,
                                        'vdbench-client', 'my-ripsaw'):
            try:
                if bench_pod[0] is not None:
                    vdbench_client_pod = bench_pod[0]
                    break
            except IndexError:
                log.info('Benchmark client pod not ready yet')

        bench_pod = OCP(kind='pod', namespace='my-ripsaw')
        log.info('Waiting for VDBench benchmark to Run')
        assert bench_pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                           resource_name=vdbench_client_pod,
                                           sleep=30,
                                           timeout=600)
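        # Poll the client pod logs until the benchmark reports completion or
        # the timeout expires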
        start_time = time.time()
        while True:
            logs = bench_pod.exec_oc_cmd(f'logs {vdbench_client_pod}',
                                         out_yaml_format=False)
            if 'Test Run Finished' in logs:
                log.info('VdBench Benchmark Completed Successfully')
                break

            if timeout < (time.time() - start_time):
                raise TimeoutError(
                    'Timed out waiting for benchmark to complete')
            time.sleep(30)

        # Get the results file from the benchmark pod and store it with the
        # test logs.
        # TODO: find the place of the actual test log and not in the parent
        #       logs path
        target_results = '{}/{}.tgz'.format(ocsci_log_path(), target_results)
        pod_results = constants.VDBENCH_RESULTS_FILE
        retrive_files_from_pod(vdbench_client_pod, target_results, pod_results)