Example No. 1
    def test_scale_mcg_obc_creation(self, tmp_path, timeout=60):
        """
        MCG OBC creation using Noobaa storage class
        """

        log.info(f"Start creating  {self.scale_obc_count} "
                 f"OBC in a batch of {self.num_obc_batch}")
        for i in range(int(self.scale_obc_count / self.num_obc_batch)):
            obc_dict_list = (
                scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                    no_of_obc=self.num_obc_batch,
                    sc_name=self.sc_name,
                    namespace=self.namespace,
                ))
            # Create job profile
            job_file = ObjectConfFile(
                name="job_profile",
                obj_dict_list=obc_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )
            # Create kube_job
            job_file.create(namespace=self.namespace)
            time.sleep(timeout * 5)

            # Check all the OBCs reached Bound state
            obc_bound_list = (
                scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                    kube_job_obj=job_file,
                    namespace=self.namespace,
                    no_of_obc=self.num_obc_batch,
                ))
            log.info(f"Number of PVCs in Bound state {len(obc_bound_list)}")
        # Delete obc on cluster
        scale_noobaa_lib.cleanup(self.namespace)
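
The pattern above — build a list of resource manifests as Python dicts, wrap them in an ObjectConfFile, and apply them as one kube job — repeats throughout these examples. Below is a minimal hand-written sketch of the same pattern without the scale_noobaa_lib helper; the ObjectBucketClaim fields are based on the standard OBC CRD and the storage class name is an assumption, not taken from the helper itself.

# Hypothetical sketch: build OBC manifests by hand and apply them via ObjectConfFile.
def build_obc_dicts(count, sc_name, namespace):
    obc_dicts = []
    for i in range(count):
        obc_dicts.append({
            "apiVersion": "objectbucket.io/v1alpha1",
            "kind": "ObjectBucketClaim",
            "metadata": {"name": f"obc-{i}", "namespace": namespace},
            "spec": {"generateBucketName": f"obc-{i}", "storageClassName": sc_name},
        })
    return obc_dicts

# usage (assuming `tmp_path` and `namespace` come from the usual fixtures):
# obc_dicts = build_obc_dicts(10, "openshift-storage.noobaa.io", namespace)
# job_file = ObjectConfFile(name="obc_job", obj_dict_list=obc_dicts,
#                           project=namespace, tmp_path=tmp_path)
# job_file.create(namespace=namespace)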
Example No. 2
    def test_scale_mcg_rgw_obc_creation(self, tmp_path, timeout=60):
        """
        OBC creation for both MCG and RGW storage class
        This test case only runs on vSphere cluster deployment
        """

        log.info(
            f"Start creating  {self.scale_obc_count} OBC in a batch of {self.num_obc_batch}"
        )
        for i in range(int(self.scale_obc_count / self.num_obc_batch)):
            obc_dict_list1 = (
                scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                    no_of_obc=int(self.num_obc_batch / 2),
                    sc_name=self.sc_name,
                    namespace=self.namespace,
                ))
            obc_dict_list2 = (
                scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                    no_of_obc=int(self.num_obc_batch / 2),
                    sc_name=self.sc_rgw_name,
                    namespace=self.namespace,
                ))
            # Create job profile
            job_file1 = ObjectConfFile(
                name="job_profile1",
                obj_dict_list=obc_dict_list1,
                project=self.namespace,
                tmp_path=tmp_path,
            )
            job_file2 = ObjectConfFile(
                name="job_profile2",
                obj_dict_list=obc_dict_list2,
                project=self.namespace,
                tmp_path=tmp_path,
            )
            # Create kube_job
            job_file1.create(namespace=self.namespace)
            time.sleep(timeout * 3)
            job_file2.create(namespace=self.namespace)
            time.sleep(timeout * 3)

            # Check all the OBCs reached Bound state
            obc_mcg_bound_list = (
                scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                    kube_job_obj=job_file1,
                    namespace=self.namespace,
                    no_of_obc=int(self.num_obc_batch / 2),
                ))
            obc_rgw_bound_list = (
                scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                    kube_job_obj=job_file2,
                    namespace=self.namespace,
                    no_of_obc=int(self.num_obc_batch / 2),
                ))
            log.info(
                f"Number of OBCs in Bound state MCG: {len(obc_mcg_bound_list)},"
                f" RGW: {len(obc_rgw_bound_list)}")

        # Delete obc on cluster
        scale_noobaa_lib.cleanup(self.namespace)
Example No. 3
def create_fio_pod(project,
                   interface,
                   pvc_factory,
                   storageclass,
                   access_mode,
                   fio_job_dict,
                   fio_configmap_dict,
                   tmp_path,
                   volume_mode=None,
                   pvc_size=10):
    """
    Create pods for upgrade testing.

    Args:
        project (obj): Project in which to create resources
        interface (str): CephBlockPool or CephFileSystem
        pvc_factory (function): Function for creating PVCs
        storageclass (obj): Storageclass to use
        access_mode (str): ReadWriteOnce, ReadOnlyMany or ReadWriteMany.
            This decides the access mode to be used for the PVC
        fio_job_dict (dict): fio job dictionary to use
        fio_configmap_dict (dict): fio configmap dictionary to use
        tmp_path (obj): reference to tmp_path fixture object
        volume_mode (str): Volume mode for rbd RWO PVC
        pvc_size (int): Size of PVC in GiB

    Returns:
        Pod: Pod object of the fio pod using the created PVC

    """
    log.info(f"Creating pod via {interface} using {access_mode}"
             f" access mode, {volume_mode} volume mode and {storageclass.name}"
             f" storageclass")
    pvc = pvc_factory(project=project,
                      storageclass=storageclass,
                      access_mode=access_mode,
                      volume_mode=volume_mode,
                      size=pvc_size,
                      status=None)
    helpers.wait_for_resource_state(pvc, constants.STATUS_BOUND, timeout=600)

    job_volume = fio_job_dict['spec']['template']['spec']['volumes'][0]
    job_volume['persistentVolumeClaim']['claimName'] = pvc.name
    fio_objs = [fio_configmap_dict, fio_job_dict]
    job_file = ObjectConfFile("fio_continuous", fio_objs, project, tmp_path)

    # deploy the Job to the cluster and start it
    job_file.create()

    ocp_pod_obj = ocp.OCP(kind=constants.POD, namespace=project.namespace)
    pods = ocp_pod_obj.get()['items']
    for pod in pods:
        pod_volume = pod['spec']['volumes'][0]
        if pod_volume['persistentVolumeClaim']['claimName'] == pvc.name:
            pod_data = pod
            break

    return Pod(**pod_data)
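
Note that the pod lookup loop above leaves pod_data unbound if no pod in the namespace references the freshly created PVC. A small hypothetical helper (find_pod_using_pvc is not part of ocs-ci) sketches a safer version of the same lookup:

def find_pod_using_pvc(pod_items, pvc_name):
    """Return the first pod dict whose first volume references pvc_name."""
    for pod in pod_items:
        claim = pod["spec"]["volumes"][0].get("persistentVolumeClaim", {})
        if claim.get("claimName") == pvc_name:
            return pod
    raise RuntimeError(f"no pod found using PVC {pvc_name}")

# usage inside create_fio_pod (sketch):
# pod_data = find_pod_using_pvc(ocp_pod_obj.get()["items"], pvc.name)
# return Pod(**pod_data)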
Example No. 4
def test_start_fio_job(
    tmp_path,
    fio_pvc_dict,
    fio_job_dict,
    fio_configmap_dict,
):
    """
    Start a fio job performing IO load, check that it's running, and keep
    it running even after the test finishes.
    """
    # creating the project directly to set its name and prevent its deletion
    project = ocp.OCP(kind="Project", namespace=TEST_NS)
    project.new_project(TEST_NS)

    # size of the volume for fio
    pvc_size = 10  # GiB

    # the test uses a cephfs based volume; this could be parametrized, or we
    # could try to start more jobs
    storage_class_name = "ocs-storagecluster-cephfs"

    # fio config file: random mixed read and write IO will be running for one
    # day (we expect that the other test will stop it), only 1/2 of the volume
    # is used, we don't need to utilize the PV 100%
    fio_size = int(pvc_size / 2)  # GiB
    fio_conf = textwrap.dedent(f"""
        [readwrite]
        readwrite=randrw
        buffered=1
        blocksize=4k
        ioengine=libaio
        directory=/mnt/target
        size={fio_size}G
        time_based
        runtime=24h
        """)

    # put the dicts together into yaml file of the Job
    fio_configmap_dict["data"]["workload.fio"] = fio_conf
    fio_pvc_dict["spec"]["storageClassName"] = storage_class_name
    fio_pvc_dict["spec"]["resources"]["requests"]["storage"] = f"{pvc_size}Gi"
    fio_objs = [fio_pvc_dict, fio_configmap_dict, fio_job_dict]
    job_file = ObjectConfFile("fio_continuous", fio_objs, project, tmp_path)

    # deploy the Job to the cluster and start it
    job_file.create()

    # wait for a pod for the job to be deployed and running
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    try:
        ocp_pod.wait_for_resource(resource_count=1,
                                  condition=constants.STATUS_RUNNING,
                                  timeout=300,
                                  sleep=30)
    except TimeoutExpiredError:
        logger.error("pod for fio job wasn't deployed properly")
        raise
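
The dictionary edits above (filling data["workload.fio"], spec.storageClassName and the storage request) only make sense against a particular shape of the fixture dicts. The sketch below shows plausible minimal contents for them; the exact manifests produced by the fio_pvc_dict and fio_configmap_dict fixtures are assumptions here, not copied from ocs-ci.

# Plausible (assumed) shapes of the fixture dicts used above.
fio_configmap_dict = {
    "apiVersion": "v1",
    "kind": "ConfigMap",
    "metadata": {"name": "fio-config"},
    "data": {},  # the test fills in data["workload.fio"]
}
fio_pvc_dict = {
    "apiVersion": "v1",
    "kind": "PersistentVolumeClaim",
    "metadata": {"name": "fio-target"},
    "spec": {
        "accessModes": ["ReadWriteMany"],
        "storageClassName": None,  # the test sets the cephfs storage class
        "resources": {"requests": {"storage": None}},  # the test sets "10Gi"
    },
}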
Example No. 5
    def test_scale_obc_creation_noobaa_pod_respin(self, tmp_path, pod_name,
                                                  sc_name, mcg_job_factory):
        """
        OBC creation using RGW storage class
        This test case only runs on vSphere cluster deployment
        """

        # Create OBCs with FIO running using mcg_job_factory()
        # (keep references to the created jobs instead of exec-generated names)
        fio_jobs = [mcg_job_factory() for _ in range(self.scale_obc_count_io)]

        log.info(f"Start creating  {self.scale_obc_count} "
                 f"OBC in a batch of {self.num_obc_batch}")
        for i in range(int(self.scale_obc_count / self.num_obc_batch)):
            obc_dict_list = (
                scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                    no_of_obc=self.num_obc_batch,
                    sc_name=sc_name,
                    namespace=self.namespace,
                ))
            # Create job profile
            job_file = ObjectConfFile(
                name="job_profile",
                obj_dict_list=obc_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )
            # Create kube_job
            job_file.create(namespace=self.namespace)

            # Check all the OBCs reached Bound state
            obc_bound_list = (
                scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                    kube_job_obj=job_file,
                    namespace=self.namespace,
                    no_of_obc=self.num_obc_batch,
                ))
            log.info(f"Number of OBCs in Bound state: {len(obc_bound_list)}")

        # Restart the node which the noobaa pod is running on
        # and validate noobaa pods are respun and in Running state
        scale_noobaa_lib.noobaa_running_node_restart(pod_name=pod_name)

        # Verify all OBCs are in Bound state after node restart
        log.info("Verify all OBCs are in Bound state after node restart.....")
        obc_status_list = scale_noobaa_lib.check_all_obcs_status(
            namespace=self.namespace)
        log.info(f"Number of OBCs in Bound state after node reset: "
                 f"{len(obc_status_list[0])}")
        assert (len(obc_status_list[0]) == self.scale_obc_count
                ), "Not all OBCs in Bound state"
Example No. 6
def create_workload_job(job_name,
                        bucket,
                        project,
                        mcg_obj,
                        resource_path,
                        custom_options=None):
    """
    Creates kubernetes job that should utilize MCG bucket.

    Args:
        job_name (str): Name of the job
        bucket (obj): MCG bucket with S3 interface
        project (obj): OCP object representing OCP project which will be
            used for the job
        mcg_obj (obj): instance of MCG class
        resource_path (str): path to the directory where resources should be
            created
        custom_options (dict): Dictionary of lists containing tuples with
            additional configuration for fio in format:
            {'section': [('option', 'value'),...],...}
            e.g.
            {'global':[('name','bucketname')],'create':[('time_based','1'),('runtime','48h')]}
            These values can be added to the config or overwrite already
            existing values

    Returns:
        obj: Job object

    """
    fio_job_dict = get_job_dict(job_name)
    fio_configmap_dict = get_configmap_dict(fio_job_dict, mcg_obj, bucket,
                                            custom_options)
    fio_objs = [fio_configmap_dict, fio_job_dict]

    log.info(f"Creating MCG workload job {job_name}")
    job_file = ObjectConfFile("fio_continuous", fio_objs, project,
                              resource_path)

    # deploy the Job to the cluster and start it
    job_file.create()
    log.info(f"Job {job_name} created")

    # get job object
    ocp_job_obj = ocp.OCP(kind=constants.JOB, namespace=project.namespace)
    job = OCS(**ocp_job_obj.get(resource_name=job_name))

    return job
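
The custom_options format documented in the docstring maps fio config sections to lists of (option, value) pairs. A short hypothetical call putting that together (bucket, project, mcg_obj and tmp_path are assumed to come from the usual ocs-ci fixtures):

custom_options = {
    "global": [("name", "my-bucket")],
    "create": [("time_based", "1"), ("runtime", "48h")],
}
job = create_workload_job(
    job_name="mcg-upgrade-workload",
    bucket=bucket,
    project=project,
    mcg_obj=mcg_obj,
    resource_path=str(tmp_path),
    custom_options=custom_options,
)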
Example No. 7
def test_scale_obc_pre_upgrade(tmp_path, timeout=60):
    """
    Create scaled MCG OBC using Noobaa storage class before upgrade
    Save scaled obc data in a file for post upgrade validation
    """
    obc_scaled_list = []
    log.info(f"Start creating  {scale_obc_count} " f"OBC in a batch of {num_obc_batch}")
    for i in range(int(scale_obc_count / num_obc_batch)):
        obc_dict_list = scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
            no_of_obc=num_obc_batch,
            sc_name=sc_name,
            namespace=namespace,
        )
        # Create job profile
        job_file = ObjectConfFile(
            name="job_profile",
            obj_dict_list=obc_dict_list,
            project=namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job
        job_file.create(namespace=namespace)
        time.sleep(timeout * 5)

        # Check all the OBCs reached Bound state
        obc_bound_list = scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
            kube_job_obj=job_file,
            namespace=namespace,
            no_of_obc=num_obc_batch,
        )
        obc_scaled_list.extend(obc_bound_list)

    log.info(
        f"Number of OBCs in scaled list: {len(obc_scaled_list)}",
    )

    # Write namespace and OBC data to the OBC_SCALE_DATA_FILE, which
    # will be used during post_upgrade validation tests
    with open(obc_scaled_data_file, "a+") as w_obj:
        w_obj.write(str("# Scale Data File\n"))
        w_obj.write(str(f"NAMESPACE: {namespace}\n"))
        w_obj.write(str(f"OBC_SCALE_LIST: {obc_scaled_list}\n"))
Example No. 8
def mcg_workload_job(fio_job_dict_mcg, fio_configmap_dict_mcg, fio_conf_mcg,
                     fio_project_mcg, tmp_path, request):
    """
    Creates kubernetes job that should utilize MCG during upgrade.

    Returns:
        object: Job object

    """
    fio_configmap_dict_mcg["data"]["workload.fio"] = fio_conf_mcg
    fio_objs = [fio_configmap_dict_mcg, fio_job_dict_mcg]

    job_name = fio_job_dict_mcg['metadata']['name']

    log.info(f"Creating job {job_name}")
    job_file = ObjectConfFile("fio_continuous", fio_objs, fio_project_mcg,
                              tmp_path)

    # deploy the Job to the cluster and start it
    job_file.create()
    log.info(f"Job {job_name} created")

    # get job object
    ocp_job_obj = ocp.OCP(kind=constants.JOB,
                          namespace=fio_project_mcg.namespace)
    job = OCS(**ocp_job_obj.get(resource_name=job_name))

    def teardown():
        """
        Delete mcg job
        """
        job.delete()
        job.ocp.wait_for_delete(job.name)

    request.addfinalizer(teardown)

    return job
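
mcg_workload_job takes request and registers a finalizer, so it is meant to be used as a pytest fixture (the @pytest.fixture decorator is not shown in the snippet). A minimal sketch of a test consuming it, assuming the OCS wrapper it returns exposes a name attribute:

def test_mcg_workload_created(mcg_workload_job):
    # the fixture has already deployed the fio job; just sanity-check the object
    assert mcg_workload_job.name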
Example No. 9
def test_log_reader_writer_parallel(project, tmp_path):
    """
    Write and read logfile stored on cephfs volume, from all worker nodes of a
    cluster via k8s Deployment, while fetching content of the stored data via
    oc rsync to check the data locally.

    Reproduces BZ 1989301. A test failure means a new high priority blocker bug.
    """
    pvc_dict = get_pvc_dict()
    # we need to mount the volume on every worker node, so RWX/cephfs
    pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
    pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
    if (
        config.ENV_DATA["platform"].lower() not in constants.MANAGED_SERVICE_PLATFORMS
    ) and storagecluster_independent_check():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
    else:
        sc_name = constants.CEPHFILESYSTEM_SC
    pvc_dict["spec"]["storageClassName"] = sc_name
    # there is no need for a lot of storage capacity for this test
    pvc_dict["spec"]["resources"]["requests"]["storage"] = "1Gi"

    # get deployment dict for the reproducer logwriter workload
    with open(constants.LOGWRITER_CEPHFS_REPRODUCER, "r") as deployment_file:
        deploy_dict = yaml.safe_load(deployment_file.read())
    # if we are running in disconnected environment, we need to mirror the
    # container image first, and then use the mirror instead of the original
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(deploy_dict["spec"]["template"])
    # we need to match deployment replicas with number of worker nodes
    deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # and link the deployment with the pvc
    try:
        link_spec_volume(
            deploy_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_REPRODUCER no longer matches code of this test"
        raise Exception(error_msg) from ex

    # prepare k8s yaml file for deployment
    workload_file = ObjectConfFile(
        "log_reader_writer_parallel", [pvc_dict, deploy_dict], project, tmp_path
    )
    # deploy the workload, starting the log reader/writer pods
    logger.info(
        "starting log reader/writer workload via Deployment, one pod per worker"
    )
    workload_file.create()

    logger.info("waiting for all pods of the workload Deployment to run")
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    try:
        ocp_pod.wait_for_resource(
            resource_count=deploy_dict["spec"]["replicas"],
            condition=constants.STATUS_RUNNING,
            error_condition=constants.STATUS_ERROR,
            timeout=300,
            sleep=30,
        )
    except Exception as ex:
        # this is not a problem with feature under test, but with infra,
        # cluster configuration or unrelated bug which must have happened
        # before this test case
        error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
        logger.exception(error_msg)
        logger.debug(workload_file.describe())
        raise exceptions.UnexpectedBehaviour(error_msg) from ex

    # while the workload is running, we will try to fetch and validate data
    # from the cephfs volume of the workload 120 times (this number of retries
    # is a bit larger than usual number required to reproduce bug from
    # BZ 1989301, but we need to be sure here)
    number_of_fetches = 120
    # if a given fetch fails, we will ignore the failure unless the number of
    # failures is too high (this has no direct impact on the feature under test;
    # we should be able to detect the bug even with 10% of rsync failures,
    # since data corruption doesn't simply go away ...)
    number_of_failures = 0
    allowed_failures = 12
    is_local_data_ok = True
    local_dir = tmp_path / "logwriter"
    local_dir.mkdir()
    workload_pods = ocp_pod.get()
    workload_pod_name = workload_pods["items"][0]["metadata"]["name"]
    logger.info(
        "while the workload is running, we will fetch and check data from the cephfs volume %d times",
        number_of_fetches,
    )
    for _ in range(number_of_fetches):
        # fetch data from cephfs volume into the local dir
        oc_cmd = [
            "oc",
            "rsync",
            "--loglevel=4",
            "-n",
            project.namespace,
            f"pod/{workload_pod_name}:/mnt/target",
            local_dir,
        ]
        try:
            run_cmd(cmd=oc_cmd, timeout=300)
        except Exception as ex:
            number_of_failures += 1
            # in case this fails, we are going to fetch extra evidence; that
            # said, such a failure is most likely related to OCP or infrastructure
            error_msg = "oc rsync failed: something is wrong with the cluster"
            logger.exception(error_msg)
            logger.debug(workload_file.describe())
            oc_rpm_debug = [
                "oc",
                "rsh",
                "-n",
                project.namespace,
                f"pod/{workload_pod_name}",
                "bash",
                "-c",
                ";".join(
                    [
                        "rpm -qa",
                        "rpm -qaV",
                        "type -a tar",
                        "tar --version",
                        "type -a rsync",
                        "rsync --version",
                    ]
                ),
            ]
            try:
                run_cmd(cmd=oc_rpm_debug, timeout=600)
            except Exception:
                # if fetch of additional evidence fails, log and ignore the
                # exception (so that we can retry if needed)
                logger.exception("failed to fetch additional evidence")
            # in case the rsync run failed because of a container restart,
            # we assume the pod name hasn't changed, and just wait for the
            # container to be running again - unless the number of rsync
            # failures is too high
            if number_of_failures > allowed_failures:
                logger.error("number of ignored rsync failures is too high")
            else:
                ocp_pod.wait_for_resource(
                    resource_count=deploy_dict["spec"]["replicas"],
                    condition=constants.STATUS_RUNNING,
                    error_condition=constants.STATUS_ERROR,
                    timeout=300,
                    sleep=30,
                )
                continue
            logger.debug(
                "before this failure, we ignored %d previous failures",
                number_of_failures,
            )
            raise exceptions.UnexpectedBehaviour(error_msg) from ex
        # look for null bytes in the just fetched local files in target dir,
        # and if these binary bytes are found, the test failed (the bug
        # was reproduced)
        target_dir = os.path.join(local_dir, "target")
        for file_name in os.listdir(target_dir):
            with open(os.path.join(target_dir, file_name), "r") as fo:
                data = fo.read()
                if "\0" in data:
                    is_local_data_ok = False
                    logger.error(
                        "file %s is corrupted: null byte found in a text file",
                        file_name,
                    )
        # is_local_data_ok = False
        assert is_local_data_ok, "data corruption detected"
        time.sleep(2)

    logger.debug("number of ignored rsync failures: %d", number_of_failures)

    # if no obvious problem was detected, run the logreader job to validate
    # checksums in the log files (so that we are 100% sure that nothing went
    # wrong with the IO or the data)
    with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
        job_dict = yaml.safe_load(job_file.read())
    # mirroring for disconnected environment, if necessary
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(job_dict["spec"]["template"])
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        job_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # we need to match number of jobs with the number used in the workload
    job_dict["spec"]["completions"] = deploy_dict["spec"]["replicas"]
    job_dict["spec"]["parallelism"] = deploy_dict["spec"]["replicas"]
    # and refer to the correct pvc name
    try:
        link_spec_volume(
            job_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_READER no longer matches code of this test"
        raise Exception(error_msg) from ex
    # prepare k8s yaml file for the job
    job_file = ObjectConfFile("log_reader", [job_dict], project, tmp_path)
    # deploy the job, starting the log reader pods
    logger.info(
        "starting log reader data validation job to fully check the log data",
    )
    job_file.create()
    # wait for the logreader job to complete (this should be rather quick)
    try:
        job.wait_for_job_completion(
            job_name=job_dict["metadata"]["name"],
            namespace=project.namespace,
            timeout=300,
            sleep_time=30,
        )
    except exceptions.TimeoutExpiredError:
        error_msg = (
            "verification failed to complete in time: data loss or broken cluster?"
        )
        logger.exception(error_msg)
    # and then check that the job completed with success
    logger.info("checking the result of data validation job")
    logger.debug(job_file.describe())
    ocp_job = ocp.OCP(
        kind="Job",
        namespace=project.namespace,
        resource_name=job_dict["metadata"]["name"],
    )
    job_status = ocp_job.get()["status"]
    logger.info("last status of data verification job: %s", job_status)
    if (
        "failed" in job_status
        or job_status["succeeded"] != deploy_dict["spec"]["replicas"]
    ):
        error_msg = "possible data corruption: data verification job failed!"
        logger.error(error_msg)
        job.log_output_of_job_pods(
            job_name=job_dict["metadata"]["name"], namespace=project.namespace
        )
        raise Exception(error_msg)
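
link_spec_volume is wrapped in try/except above because it has to find a specifically named volume inside the pod template and point it at the PVC. A rough hypothetical equivalent of such a helper (not the actual ocs-ci implementation) looks like this:

def link_volume_to_pvc(pod_template_spec, volume_name, pvc_name):
    """Point the named volume of a pod template spec at the given PVC."""
    for volume in pod_template_spec["volumes"]:
        if volume["name"] == volume_name:
            volume["persistentVolumeClaim"] = {"claimName": pvc_name}
            return
    raise KeyError(f"volume {volume_name} not found in pod template spec")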
Example No. 10
    def test_bulk_clone_performance(self, namespace, tmp_path, pod_factory):
        """
        Creates number of PVCs in a bulk using kube job
        Write 60% of PVC capacity to each one of the created PVCs
        Creates 1 clone per each PVC altogether in a bulk
        Measuring time for bulk of clones creation

        """
        pvc_count = 50
        log.info(f"Start creating {self.interface} {pvc_count} PVC")
        if self.interface == constants.CEPHBLOCKPOOL:
            sc_name = constants.DEFAULT_STORAGECLASS_RBD
            clone_yaml = constants.CSI_RBD_PVC_CLONE_YAML
        elif self.interface == constants.CEPHFILESYSTEM:
            sc_name = constants.DEFAULT_STORAGECLASS_CEPHFS
            clone_yaml = constants.CSI_CEPHFS_PVC_CLONE_YAML

        pvc_dict_list = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
            no_of_pvc=pvc_count,
            access_mode=constants.ACCESS_MODE_RWO,
            sc_name=sc_name,
            pvc_size="5Gi",
        )

        job_pvc_file = ObjectConfFile(
            name="job_profile_pvc",
            obj_dict_list=pvc_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )

        # Create kube_job
        job_pvc_file.create(namespace=self.namespace)

        # Check all the PVC reached Bound state
        pvc_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
            kube_job_obj=job_pvc_file,
            namespace=self.namespace,
            no_of_pvc=pvc_count,
        )

        logging.info(f"Number of PVCs in Bound state {len(pvc_bound_list)}")

        total_files_size = self.run_fio_on_pvcs(pvc_dict_list, pod_factory)

        clone_dict_list = scale_lib.construct_pvc_clone_yaml_bulk_for_kube_job(
            pvc_dict_list, clone_yaml, sc_name)

        logging.info("Created clone dict list")

        job_clone_file = ObjectConfFile(
            name="job_profile_clone",
            obj_dict_list=clone_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )

        # Create kube_job that creates clones
        job_clone_file.create(namespace=self.namespace)

        logging.info("Going to check bound status for clones")
        # Check all the clones reached Bound state
        clone_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
            kube_job_obj=job_clone_file,
            namespace=self.namespace,
            no_of_pvc=pvc_count,
            timeout=200,
        )

        logging.info(
            f"Number of clones in Bound state {len(clone_bound_list)}")

        clone_objs = []
        all_pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
        for clone_yaml in clone_dict_list:
            name = clone_yaml["metadata"]["name"]
            size = clone_yaml["spec"]["resources"]["requests"]["storage"]
            logging.info(f"Clone {name} of size {size} created")
            for pvc_obj in all_pvc_objs:
                if pvc_obj.name == name:
                    clone_objs.append(pvc_obj)

        assert len(clone_bound_list) == len(
            clone_objs
        ), "Not all clones reached BOUND state, cannot measure time"
        start_time = helpers.get_provision_time(self.interface,
                                                clone_objs,
                                                status="start")
        end_time = helpers.get_provision_time(self.interface,
                                              clone_objs,
                                              status="end")
        total_time = (end_time - start_time).total_seconds()
        speed = round(total_files_size / total_time, 2)
        logging.info(
            f"Total creation time = {total_time} secs, data size = {total_files_size} MB, speed = {speed} MB/sec "
            f"for {self.interface} clone in bulk of {pvc_count} clones.")
Example No. 11
class TestBulkPodAttachPerformance(PASTest):
    """
    Test to measure performance of attaching pods to pvc in a bulk
    """

    pvc_size = "1Gi"

    def setup(self):
        """
        Setting up test parameters
        """
        log.info("Starting the test setup")
        super(TestBulkPodAttachPerformance, self).setup()
        self.benchmark_name = "bulk_pod_attach_time"

        self.create_test_project()
        # Pulling the pod image to the worker nodes, so that the image pull time
        # is not counted in the total attach time
        helpers.pull_images(constants.PERF_IMAGE)

        # Initializing some parameters
        self.pvc_objs = list()
        self.pods_obj = locals()

    def teardown(self):
        """
        Cleanup the test environment
        """
        log.info("Starting the test environment celanup")

        # Deleting All POD(s)
        log.info("Try to delete all created PODs")
        try:
            self.pods_obj.delete(namespace=self.namespace)
        except Exception as ex:
            log.warn(f"Failed to delete POD(s) [{ex}]")
        log.info("Wait for all PODs to be deleted")
        performance_lib.wait_for_resource_bulk_status("pod", 0, self.namespace,
                                                      constants.STATUS_BOUND,
                                                      len(self.pvc_objs) * 2,
                                                      10)
        log.info("All POD(s) was deleted")

        # Deleting PVC(s) for deletion time measurement
        log.info("Try to delete all created PVCs")
        for pvc_obj in self.pvc_objs:
            pvc_obj.delete()
        log.info("Wait for all PVC(s) to be deleted")
        performance_lib.wait_for_resource_bulk_status("pvc", 0, self.namespace,
                                                      constants.STATUS_BOUND,
                                                      len(self.pvc_objs) * 2,
                                                      10)
        log.info("All PVC(s) was deleted")
        log.info("Wait for all PVC(s) backed PV(s) to be deleted")
        # Timeout for each PV to be deleted is 20 sec.
        performance_lib.wait_for_resource_bulk_status("pv", 0, self.namespace,
                                                      self.namespace,
                                                      len(self.pvc_objs) * 20,
                                                      10)
        log.info("All backed PV(s) was deleted")

        # Delete the test project (namespace)
        self.delete_test_project()

        super(TestBulkPodAttachPerformance, self).teardown()

    @pytest.mark.parametrize(
        argnames=["interface_type", "bulk_size"],
        argvalues=[
            pytest.param(*[constants.CEPHBLOCKPOOL, 120], ),
            pytest.param(*[constants.CEPHBLOCKPOOL, 240], ),
            pytest.param(*[constants.CEPHFILESYSTEM, 120], ),
            pytest.param(*[constants.CEPHFILESYSTEM, 240], ),
        ],
    )
    @polarion_id("OCS-1620")
    def test_bulk_pod_attach_performance(self, interface_type, bulk_size):
        """
        Measures pod attachment time for a bulk of bulk_size pods

        Args:
            interface_type (str): The interface type to be tested - CephBlockPool / CephFileSystem.
            bulk_size (int): Size of the bulk to be tested
        Returns:

        """
        self.interface = interface_type

        if self.dev_mode:
            bulk_size = 3

        # Initialize some variables
        timeout = bulk_size * 5
        pvc_names_list = list()
        pod_data_list = list()

        # Getting the test start time
        test_start_time = self.get_time()
        csi_start_time = self.get_time("csi")

        log.info(f"Start creating bulk of new {bulk_size} PVCs")
        self.pvc_objs, _ = helpers.create_multiple_pvcs(
            sc_name=Interfaces_info[self.interface]["sc"],
            namespace=self.namespace,
            number_of_pvc=bulk_size,
            size=self.pvc_size,
            burst=True,
            do_reload=False,
        )
        log.info("Wait for all of the PVCs to be in Bound state")
        performance_lib.wait_for_resource_bulk_status("pvc", bulk_size,
                                                      self.namespace,
                                                      constants.STATUS_BOUND,
                                                      timeout, 10)
        # in case of creation failure, the wait_for_resource_bulk_status function
        # will raise an exception, so at this point the creation succeeded
        log.info("All PVCs were created and are in Bound state.")

        # Reload all PVC(s) information
        for pvc_obj in self.pvc_objs:
            pvc_obj.reload()
            pvc_names_list.append(pvc_obj.name)
        log.debug(f"The PVCs names are : {pvc_names_list}")

        # Create kube_job for pod creation
        pod_data_list.extend(
            scale_lib.attach_multiple_pvc_to_pod_dict(
                pvc_list=pvc_names_list,
                namespace=self.namespace,
                pvcs_per_pod=1,
            ))
        self.pods_obj = ObjectConfFile(
            name="pod_kube_obj",
            obj_dict_list=pod_data_list,
            project=self.namespace,
            tmp_path=pathlib.Path(ocsci_log_path()),
        )
        log.debug(f"PODs data list is : {json.dumps(pod_data_list, indent=3)}")

        log.info(f"{self.interface} : Before pod attach")
        bulk_start_time = time.time()
        self.pods_obj.create(namespace=self.namespace)
        # Check all the PODs reached Running state
        log.info("Checking that pods are running")
        performance_lib.wait_for_resource_bulk_status("pod", bulk_size,
                                                      self.namespace,
                                                      constants.STATUS_RUNNING,
                                                      timeout, 2)
        log.info("All POD(s) are in Running State.")
        bulk_end_time = time.time()
        bulk_total_time = bulk_end_time - bulk_start_time
        log.info(
            f"Bulk attach time of {bulk_size} pods is {bulk_total_time} seconds"
        )

        csi_bulk_total_time = performance_lib.pod_bulk_attach_csi_time(
            self.interface, self.pvc_objs, csi_start_time, self.namespace)

        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(self.uuid, self.crd_data, self.full_log_path,
                           "pod_bulk_attachtime"))

        full_results.add_key("storageclass",
                             Interfaces_info[self.interface]["name"])
        full_results.add_key("pod_bulk_attach_time", bulk_total_time)
        full_results.add_key("pod_csi_bulk_attach_time", csi_bulk_total_time)
        full_results.add_key("pvc_size", self.pvc_size)
        full_results.add_key("bulk_size", bulk_size)

        # Getting the test end time
        test_end_time = self.get_time()

        # Add the test time to the ES report
        full_results.add_key("test_time", {
            "start": test_start_time,
            "end": test_end_time
        })

        # Write the test results into the ES server
        self.results_path = helpers.get_full_test_logs_path(cname=self)
        if full_results.es_write():
            res_link = full_results.results_link()
            # write the ES link to the test results in the test log.
            log.info(f"The result can be found at : {res_link}")

            # Create text file with results of all subtests (4 - according to the parameters)
            self.write_result_to_file(res_link)

    def test_bulk_pod_attach_results(self):
        """
        This is not a test - it only checks that the previous test ran and finished
        as expected, and reports the full results (links in the ES) of the previous tests (4)
        """

        self.add_test_to_results_check(
            test="test_bulk_pod_attach_performance",
            test_count=4,
            test_name="Bulk Pod Attach Time",
        )
        self.check_results_and_push_to_dashboard()

    def init_full_results(self, full_results):
        """
        Initialize the full results object which will send to the ES server

        Args:
            full_results (obj): an empty ResultsAnalyse object

        Returns:
            ResultsAnalyse (obj): the input object filled with data

        """
        for key in self.environment:
            full_results.add_key(key, self.environment[key])
        return full_results
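
The test above leans heavily on performance_lib.wait_for_resource_bulk_status to block until a given number of resources reach a status. A rough hypothetical polling loop illustrating the idea (this is not the ocs-ci implementation, and reading status.phase is an assumption that holds for pods and PVCs):

import time

def wait_for_bulk_status(get_resources, expected_count, status, timeout, sleep):
    """Poll get_resources() until expected_count items report the given phase."""
    deadline = time.time() + timeout
    while True:
        matching = [r for r in get_resources()
                    if r.get("status", {}).get("phase") == status]
        if len(matching) >= expected_count:
            return True
        if time.time() > deadline:
            raise TimeoutError(
                f"only {len(matching)} of {expected_count} resources reached {status}")
        time.sleep(sleep)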
Example No. 12
    def fetch_and_validate_data(self):
        """
        While the workload is running, try to validate the data
        from the cephfs volume of the workload.

        Raises:
            NotFoundError: When the given volume is not found in given spec
            Exception: When the data verification job failed

        """
        # if no obvious problem was detected, run the logreader job to validate
        # checksums in the log files (so that we are 100% sure that nothing went
        # wrong with the IO or the data)
        with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
            job_dict = yaml.safe_load(job_file.read())
        # if we are running in disconnected environment, we need to mirror the
        # container image first, and then use the mirror instead of the original
        if config.DEPLOYMENT.get("disconnected"):
            update_container_with_mirrored_image(
                self.deploy_dict["spec"]["template"])
        # drop topology spread constraints related to zones
        topology.drop_topology_constraint(job_dict["spec"]["template"]["spec"],
                                          topology.ZONE_LABEL)
        # we need to match number of jobs with the number used in the workload
        job_dict["spec"]["completions"] = self.deploy_dict["spec"]["replicas"]
        job_dict["spec"]["parallelism"] = self.deploy_dict["spec"]["replicas"]
        # and refer to the correct pvc name
        try:
            link_spec_volume(
                job_dict["spec"]["template"]["spec"],
                "logwriter-cephfs-volume",
                self.pvc_dict["metadata"]["name"],
            )
        except (exceptions.NotFoundError, KeyError) as ex:
            logger.warning(
                "Failed to link the deployment with the pvc. We may need to check if the "
                "LOGWRITER_CEPHFS_REPRODUCER still matches the code of this test"
            )
            raise ex

        # prepare k8s yaml file for the job
        job_file = ObjectConfFile("log_reader", [job_dict], self.project,
                                  self.tmp_path)
        # deploy the job, starting the log reader pods
        logger.info(
            "starting log reader data validation job to fully check the log data",
        )
        job_file.create()
        # wait for the logreader job to complete (this should be rather quick)
        try:
            job.wait_for_job_completion(
                job_name=job_dict["metadata"]["name"],
                namespace=self.project.namespace,
                timeout=300,
                sleep_time=30,
            )
        except exceptions.TimeoutExpiredError:
            error_msg = "verification failed to complete in time: probably data loss or broken cluster"
            raise Exception(error_msg)
        # and then check that the job completed with success
        logger.info("checking the result of data validation job")
        logger.debug(job_file.describe())
        ocp_job = ocp.OCP(
            kind="Job",
            namespace=self.project.namespace,
            resource_name=job_dict["metadata"]["name"],
        )
        job_status = ocp_job.get()["status"]
        logger.info("last status of data verification job: %s", job_status)
        if ("failed" in job_status or job_status["succeeded"] !=
                self.deploy_dict["spec"]["replicas"]):
            error_msg = "possible data corruption: data verification job failed!"
            logger.error(error_msg)
            job.log_output_of_job_pods(job_name=job_dict["metadata"]["name"],
                                       namespace=self.project.namespace)
            raise Exception(error_msg)
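
The success check above relies on the standard Kubernetes Job status, which reports succeeded and failed pod counts. A tiny sketch of the condition being tested (values are illustrative only):

def verification_failed(job_status, expected_completions):
    # mirrors the condition used above: any failure, or fewer completions than expected
    return "failed" in job_status or job_status.get("succeeded") != expected_completions

# verification_failed({"succeeded": 3}, 3)               -> False
# verification_failed({"succeeded": 2, "failed": 1}, 3)  -> True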
Example No. 13
    def test_all_4_type_pvc_creation_deletion_scale(self, namespace, tmp_path):
        """
        Measure PVC creation time while scaling PVCs of all 4 types.
        A total of 500 PVCs times the number of worker nodes
        will be created, i.e. 375 of each PVC type on a 3-worker cluster.
        Measure PVC deletion time in scale env
        """
        scale_pvc_count = scale_lib.get_max_pvc_count()
        log.info(f"Start creating {scale_pvc_count} PVC of all 4 types")
        cephfs_sc_obj = constants.DEFAULT_STORAGECLASS_CEPHFS
        rbd_sc_obj = constants.DEFAULT_STORAGECLASS_RBD

        # Get pvc_dict_list, append all the pvc.yaml dict to pvc_dict_list
        rbd_pvc_dict_list, cephfs_pvc_dict_list = ([] for i in range(2))
        for mode in [constants.ACCESS_MODE_RWO, constants.ACCESS_MODE_RWX]:
            rbd_pvc_dict_list.extend(
                scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
                    no_of_pvc=int(scale_pvc_count / 4),
                    access_mode=mode,
                    sc_name=rbd_sc_obj,
                ))
            cephfs_pvc_dict_list.extend(
                scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
                    no_of_pvc=int(scale_pvc_count / 4),
                    access_mode=mode,
                    sc_name=cephfs_sc_obj,
                ))

        # There are 2 kube_jobs, one for cephfs and one for rbd PVCs
        job_file_rbd = ObjectConfFile(
            name="rbd_pvc_job",
            obj_dict_list=rbd_pvc_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        job_file_cephfs = ObjectConfFile(
            name="cephfs_pvc_job",
            obj_dict_list=cephfs_pvc_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )

        # Create kube_job
        job_file_rbd.create(namespace=self.namespace)
        job_file_cephfs.create(namespace=self.namespace)

        # Check all the PVC reached Bound state
        rbd_pvc_name = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
            kube_job_obj=job_file_rbd,
            namespace=self.namespace,
            no_of_pvc=int(scale_pvc_count / 2),
        )
        fs_pvc_name = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
            kube_job_obj=job_file_cephfs,
            namespace=self.namespace,
            no_of_pvc=int(scale_pvc_count / 2),
        )

        # Get pvc objs from namespace, which is used to identify backend pv
        rbd_pvc_obj, cephfs_pvc_obj = ([] for i in range(2))
        pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
        for pvc_obj in pvc_objs:
            if pvc_obj.backed_sc == constants.DEFAULT_STORAGECLASS_RBD:
                rbd_pvc_obj.append(pvc_obj)
            elif pvc_obj.backed_sc == constants.DEFAULT_STORAGECLASS_CEPHFS:
                cephfs_pvc_obj.append(pvc_obj)

        # Get PVC creation time
        fs_pvc_create_time = helpers.measure_pvc_creation_time_bulk(
            interface=constants.CEPHFS_INTERFACE, pvc_name_list=fs_pvc_name)
        rbd_pvc_create_time = helpers.measure_pvc_creation_time_bulk(
            interface=constants.CEPHBLOCKPOOL, pvc_name_list=rbd_pvc_name)
        fs_pvc_create_time.update(rbd_pvc_create_time)

        # TODO: Update below code with google API, to record value in spreadsheet
        # TODO: For now observing Google API limit to write more than 100 writes
        log_path = f"{ocsci_log_path()}/All-type-PVC"
        with open(f"{log_path}-creation-time.csv", "w") as fd:
            csv_obj = csv.writer(fd)
            for k, v in fs_pvc_create_time.items():
                csv_obj.writerow([k, v])
        log.info(f"Create data present in {log_path}-creation-time.csv file")

        # Get pv_name, require pv_name to fetch deletion time data from log
        rbd_pv_list, fs_pv_list = ([] for i in range(2))
        get_rbd_kube_job = job_file_rbd.get(namespace=self.namespace)
        for i in range(int(scale_pvc_count / 2)):
            rbd_pv_list.append(
                get_rbd_kube_job["items"][i]["spec"]["volumeName"])

        get_fs_kube_job = job_file_cephfs.get(namespace=self.namespace)
        for i in range(int(scale_pvc_count / 2)):
            fs_pv_list.append(
                get_fs_kube_job["items"][i]["spec"]["volumeName"])

        # Delete kube_job
        job_file_rbd.delete(namespace=self.namespace)
        job_file_cephfs.delete(namespace=self.namespace)

        # Adding 1min wait time for PVC deletion logs to be updated
        # Observed failure when we immediately check the logs for pvc delete time
        # https://github.com/red-hat-storage/ocs-ci/issues/3371
        time.sleep(60)

        # Get PV deletion time
        fs_pvc_deletion_time = helpers.measure_pv_deletion_time_bulk(
            interface=constants.CEPHFS_INTERFACE, pv_name_list=fs_pv_list)
        rbd_pvc_deletion_time = helpers.measure_pv_deletion_time_bulk(
            interface=constants.CEPHBLOCKPOOL, pv_name_list=rbd_pv_list)
        fs_pvc_deletion_time.update(rbd_pvc_deletion_time)

        # TODO: Update below code with google API, to record value in spreadsheet
        # TODO: For now observing Google API limit to write more than 100 writes
        with open(f"{log_path}-deletion-time.csv", "w") as fd:
            csv_obj = csv.writer(fd)
            for k, v in fs_pvc_deletion_time.items():
                csv_obj.writerow([k, v])
        log.info(f"Delete data present in {log_path}-deletion-time.csv file")
        end_time = default_timer()
        log.info(f"Elapsed time -- {end_time - self.start_time} seconds")
Example No. 14
    def test_scale_obc_create_delete_time(self, tmp_path):
        """
        MCG OBC creation and deletion using Noobaa MCG storage class

        """

        log.info(f"Start creating  {self.scale_obc_count} "
                 f"OBCs in a batch of {self.num_obc_batch}")
        obc_create = dict()
        obc_delete = dict()
        for i in range(int(self.scale_obc_count / self.num_obc_batch)):
            obc_dict_list = (
                scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                    no_of_obc=self.num_obc_batch,
                    sc_name=constants.NOOBAA_SC,
                    namespace=self.namespace,
                ))
            # Create job profile
            job_file = ObjectConfFile(
                name="job_profile",
                obj_dict_list=obc_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )
            # Create kube_job
            job_file.create(namespace=self.namespace)

            # Check all the OBCs reached Bound state
            obc_bound_list = (
                scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                    kube_job_obj=job_file,
                    namespace=self.namespace,
                    no_of_obc=self.num_obc_batch,
                ))
            log.info(f"Number of OBCs in Bound state {len(obc_bound_list)}")

            # Measure OBC creation time
            obc_creation_time = scale_noobaa_lib.measure_obc_creation_time(
                obc_name_list=obc_bound_list)
            obc_create.update(obc_creation_time)

        # Delete all OBCs in batches
        obc_name_list = list(oc_get_all_obc_names())
        new_list = [
            obc_name_list[i:i + self.num_obc_batch]
            for i in range(0, len(obc_name_list), self.num_obc_batch)
        ]

        for i in range(len(new_list)):
            scale_noobaa_lib.cleanup(self.namespace, obc_count=new_list[i])
            obc_deletion_time = scale_noobaa_lib.measure_obc_deletion_time(
                obc_name_list=new_list[i])
            obc_delete.update(obc_deletion_time)

        # Store obc creation time on csv file
        log_path = f"{ocsci_log_path()}/obc-creation"
        with open(f"{log_path}-{constants.NOOBAA_SC}.csv", "w") as fd:
            csv_obj = csv.writer(fd)
            for k, v in obc_create.items():
                csv_obj.writerow([k, v])
        log.info(
            f"OBC creation data present in {log_path}-{constants.NOOBAA_SC}.csv"
        )

        # Store obc deletion time on csv file
        log_path = f"{ocsci_log_path()}/obc-deletion"
        with open(f"{log_path}-{constants.NOOBAA_SC}.csv", "w") as fd:
            csv_obj = csv.writer(fd)
            for k, v in obc_delete.items():
                csv_obj.writerow([k, v])
        log.info(
            f"OBC deletion data present in {log_path}-{constants.NOOBAA_SC}.csv"
        )
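
The deletion loop above splits the OBC names into batches of num_obc_batch before calling cleanup. A standalone sketch of that chunking pattern:

def chunked(items, batch_size):
    """Split a list into consecutive batches of at most batch_size items."""
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

# chunked(["obc-0", "obc-1", "obc-2", "obc-3", "obc-4"], 2)
# -> [["obc-0", "obc-1"], ["obc-2", "obc-3"], ["obc-4"]]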
Example No. 15
    def test_bulk_clone_performance(self, tmp_path, interface_iterate):
        """
        Creates number of PVCs in a bulk using kube job
        Write 60% of PVC capacity to each one of the created PVCs
        Creates 1 clone per each PVC altogether in a bulk
        Measuring total and csi creation times for bulk of clones

        """
        self.interface = interface_iterate
        job_pod_file, job_pvc_file, job_clone_file = [None, None, None]
        log.info(f"Start creating {self.interface} {self.pvc_count} PVC")

        try:
            pvc_dict_list = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
                no_of_pvc=self.pvc_count,
                access_mode=Interfaces_info[self.interface]["accessmode"],
                sc_name=Interfaces_info[self.interface]["sc_name"],
                pvc_size=self.vol_size,
            )

            job_pvc_file = ObjectConfFile(
                name="job_profile_pvc",
                obj_dict_list=pvc_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )

            # Create kube_job
            job_pvc_file.create(namespace=self.namespace)

            # Check all the PVC reached Bound state
            performance_lib.wait_for_resource_bulk_status(
                resource="pvc",
                resource_count=self.pvc_count,
                namespace=self.namespace,
                status=constants.STATUS_BOUND,
                timeout=120,
                sleep_time=5,
            )
            log.info(
                f"All the PVCs ({self.pvc_count}) were created and are in Bound state"
            )

            # Getting the list of the PVC names
            pvc_bound_list = [
                p.name for p in pvc.get_all_pvc_objs(namespace=self.namespace)
            ]

            # Kube_job to Create pod
            log.info(
                "Attaching PODs to the PVCs and filling them with data (60%)")
            pod_dict_list = self.attach_pvcs_to_pod_dict(pvc_bound_list)
            job_pod_file = ObjectConfFile(
                name="job_profile_pod",
                obj_dict_list=pod_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )
            job_pod_file.create(namespace=self.namespace)

            # Check all PODs are in Completed state
            performance_lib.wait_for_resource_bulk_status(
                resource="pod",
                resource_count=self.pvc_count,
                namespace=self.namespace,
                status=constants.STATUS_COMPLETED,
                timeout=1200,
                sleep_time=30,
            )
            log.info("All the PODs completed writing data to the PVC's")

            clone_dict_list = scale_lib.construct_pvc_clone_yaml_bulk_for_kube_job(
                pvc_dict_list,
                Interfaces_info[self.interface]["clone_yaml"],
                Interfaces_info[self.interface]["sc_name"],
            )

            log.info("Created clone dict list")

            csi_bulk_start_time = self.get_time(time_format="csi")

            job_clone_file = ObjectConfFile(
                name="job_profile_clone",
                obj_dict_list=clone_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )

            # Create kube_job that creates clones
            job_clone_file.create(namespace=self.namespace)

            log.info("Going to check bound status for clones")
            # Check all the clones reached Bound state
            try:
                performance_lib.wait_for_resource_bulk_status(
                    resource="pvc",
                    resource_count=self.pvc_count * 2,
                    namespace=self.namespace,
                    status=constants.STATUS_BOUND,
                    timeout=1200,
                    sleep_time=30,
                )
            except Exception as ex:
                log.error("Failed to cvreate clones for PVCs")
                raise ex

            log.info(
                f"All the Clones ({self.pvc_count}) were created and are in Bound state"
            )

            all_pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
            clone_objs = [
                cl for cl in all_pvc_objs if re.match("clone", cl.name)
            ]
            for clone_yaml in clone_dict_list:
                name = clone_yaml["metadata"]["name"]
                size = clone_yaml["spec"]["resources"]["requests"]["storage"]
                log.info(f"Clone {name} of size {size} created")

            start_time = get_provision_time(self.interface,
                                            clone_objs,
                                            status="start")
            end_time = get_provision_time(self.interface,
                                          clone_objs,
                                          status="end")
            total_time = (end_time - start_time).total_seconds()
            speed = round(self.total_files_size / total_time, 2)

            csi_creation_time = performance_lib.csi_bulk_pvc_time_measure(
                self.interface, clone_objs, "create", csi_bulk_start_time)

            log.info(
                f"Total creation time = {total_time} secs, csi creation time = {csi_creation_time},"
                f" data size = {self.total_files_size} MB, speed = {speed} MB/sec "
                f"for {self.interface} clone in bulk of {self.pvc_count} clones."
            )

            # Produce ES report
            # Collecting environment information
            self.get_env_info()

            # Initialize the results' doc file.
            full_results = self.init_full_results(
                ResultsAnalyse(
                    self.uuid,
                    self.crd_data,
                    self.full_log_path,
                    "bulk_clone_perf_fullres",
                ))

            full_results.add_key("interface", self.interface)
            full_results.add_key("bulk_size", self.pvc_count)
            full_results.add_key("clone_size", self.vol_size)
            full_results.add_key("bulk_creation_time", total_time)
            full_results.add_key("bulk_csi_creation_time", csi_creation_time)
            full_results.add_key("data_size(MB)", self.total_files_size)
            full_results.add_key("speed", speed)
            full_results.add_key("es_results_link",
                                 full_results.results_link())

            # Write the test results into the ES server
            full_results.es_write()
            self.results_path = get_full_test_logs_path(cname=self)
            res_link = full_results.results_link()
            # write the ES link to the test results in the test log.
            log.info(f"The result can be found at : {res_link}")

            # Create a text file with the results of all subtests (3, according to the parameters)
            self.write_result_to_file(res_link)

        # The finally block is used to clean up the resources created.
        # It is executed irrespective of whether the try block passes or fails.
        finally:
            # Cleanup activities
            log.info(
                "Cleanup of all the resources created during test execution")
            for object_file in [job_pod_file, job_clone_file, job_pvc_file]:
                if object_file:
                    object_file.delete(namespace=self.namespace)
                    try:
                        object_file.wait_for_delete(
                            resource_name=object_file.name,
                            namespace=self.namespace)
                    except Exception:
                        log.error(f"{object_file['name']} didnt deleted !")

            # Check ceph health status
            utils.ceph_health_check(tries=20)
def test_workload_with_checksum_verify(
    tmp_path,
    project,
    fio_pvc_dict,
    fio_job_dict,
    fio_configmap_dict,
):
    """
    Verify that the data written by fio during the workload storageutilization
    fixture is still present on the persistent volume.

    This test case assumes that test case ``test_workload_with_checksum``
    (which uses the fixture) has been executed already, and that the PV it
    created is still around (the PV is identified via its label, which
    references the fixture). There is no direct binding between these tests or
    fixtures, so one can run ``test_workload_with_checksum`` first,
    then perform some cluster wide temporary disruptive operation such as a reboot,
    temporary shutdown or upgrade, and finally after that run this verification
    test to check that the data is still there.

    Note/TODO: this test doesn't delete the PV created by the previous test
    on purpose, so that this test can be executed multiple times (which is
    an important feature of this test, e.g. it is possible to run it at different
    stages of the cluster wide disruptions). We may need to come up with a way
    to track the PV and delete it when it's no longer needed, though.
    """
    fixture_name = "workload_storageutilization_checksum_rbd"
    storage_class_name = "ocs-storagecluster-ceph-rbd"
    pv_label = f'fixture={fixture_name}'

    # find the volume where the data are stored
    ocp_pv = ocp.OCP(kind=constants.PV, namespace=project.namespace)
    logger.info("Searching for PV with label %s, where fio stored data",
                pv_label)
    pv_data = ocp_pv.get(selector=pv_label)
    assert pv_data['kind'] == "List"
    pv_exists_msg = (f"Single PV with label {pv_label} should exists, "
                     "so that test can identify where to verify the data.")
    assert len(pv_data['items']) == 1, pv_exists_msg
    pv_dict = pv_data['items'][0]
    pv_name = pv_dict['metadata']['name']
    logger.info("PV %s was identified, test can continue.", pv_name)

    # We need to check the PV size so that we can ask for the same via PVC
    capacity = pv_dict['spec']['capacity']['storage']
    logger.info("Capacity of PV %s is %s.", pv_name, capacity)

    # Convert the storage capacity spec into number of GiB
    unit = capacity[-2:]
    assert unit in ("Gi", "Ti"), "PV size should be within reasonable range"
    if capacity.endswith("Gi"):
        pvc_size = int(capacity[0:-2])
    elif capacity.endswith("Ti"):
        pvc_size = int(capacity[0:-2]) * 2**10
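    # e.g. a "10Gi" PV yields pvc_size == 10, while a "1Ti" PV yields
    # pvc_size == 1024 (illustrative values, not taken from a real cluster)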

    # And we need to drop claimRef, so that the PV will become available again
    if "claimRef" in pv_dict['spec']:
        logger.info("Dropping claimRef from PV %s.", pv_name)
        patch_success = ocp_pv.patch(
            resource_name=pv_name,
            params='[{ "op": "remove", "path": "/spec/claimRef" }]',
            format_type='json')
        patch_error_msg = (
            "claimRef should be dropped with success, "
            f"otherwise the test can't continue to reuse PV {pv_name}")
        assert patch_success, patch_error_msg
    else:
        logger.info("PV %s is already without claimRef.", pv_name)

    # The job won't be running fio, it will run sha1sum check only.
    container = fio_job_dict['spec']['template']['spec']['containers'][0]
    container['command'] = [
        "/usr/bin/sha1sum", "-c", "/mnt/target/fio.sha1sum"
    ]
    # we need to use the same PVC configuration to reuse the PV
    fio_pvc_dict["spec"]["storageClassName"] = storage_class_name
    fio_pvc_dict["spec"]["resources"]["requests"]["storage"] = capacity
    # put the dicts together into yaml file of the Job
    fio_objs = [fio_pvc_dict, fio_configmap_dict, fio_job_dict]
    job_file = ObjectConfFile(fixture_name, fio_objs, project, tmp_path)

    # compute timeout based on the minimal write speed
    fio_min_mbps = config.ENV_DATA['fio_storageutilization_min_mbps']
    job_timeout = fiojob.get_timeout(fio_min_mbps, pvc_size)
    # expand the job timeout because during execution of this test there is a high
    # probability that additional workloads are running (e.g. from upgrade tests)
    # which slow down the write time
    # TODO(fbalak): calculate this from the actual work being executed
    job_timeout = job_timeout * 4
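    # e.g. if get_timeout() returned 600 seconds for this PV size, the job now
    # gets 2400 seconds before being reported as failed (illustrative numbers)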

    # deploy the Job to the cluster and start it
    job_file.create()

    # Wait for the job to verify data on the volume. If this fails in any way
    # the job won't finish with success in given time, and the error message
    # below will be reported via exception.
    error_msg = (
        "Checksum verification job failed. We weren't able to verify that "
        "data previously written on the PV are still there.")
    pod_name = fiojob.wait_for_job_completion(project.namespace, job_timeout,
                                              error_msg)

    # provide clear evidence of the verification in the logs
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    sha1sum_output = ocp_pod.exec_oc_cmd(f"logs {pod_name}",
                                         out_yaml_format=False)
    logger.info("sha1sum output: %s", sha1sum_output)
    def test_multiple_pvc_creation_deletion_scale(self, namespace, tmp_path,
                                                  access_mode, interface):
        """
        Measuring PVC creation time while scaling PVC
        Measure PVC deletion time after creation test
        """
        scale_pvc_count = scale_lib.get_max_pvc_count()
        log.info(
            f"Start creating {access_mode}-{interface} {scale_pvc_count} PVC")
        if interface == constants.CEPHBLOCKPOOL:
            sc_name = constants.DEFAULT_STORAGECLASS_RBD
        elif interface == constants.CEPHFS_INTERFACE:
            sc_name = constants.DEFAULT_STORAGECLASS_CEPHFS

        # Get pvc_dict_list, append all the pvc.yaml dict to pvc_dict_list
        pvc_dict_list1 = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
            no_of_pvc=int(scale_pvc_count / 2),
            access_mode=access_mode,
            sc_name=sc_name)
        pvc_dict_list2 = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
            no_of_pvc=int(scale_pvc_count / 2),
            access_mode=access_mode,
            sc_name=sc_name)

        # There are 2 kube_jobs to reduce the load; time_out problems were observed
        # during the delete process of a single kube_job under heavy load.
        job_file1 = ObjectConfFile(
            name="job_profile_1",
            obj_dict_list=pvc_dict_list1,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        job_file2 = ObjectConfFile(
            name="job_profile_2",
            obj_dict_list=pvc_dict_list2,
            project=self.namespace,
            tmp_path=tmp_path,
        )

        # Create kube_job
        job_file1.create(namespace=self.namespace)
        job_file2.create(namespace=self.namespace)

        # Check all the PVC reached Bound state
        pvc_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
            kube_job_obj=job_file1,
            namespace=self.namespace,
            no_of_pvc=int(scale_pvc_count / 2),
        )
        pvc_bound_list.extend(
            scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
                kube_job_obj=job_file2,
                namespace=self.namespace,
                no_of_pvc=int(scale_pvc_count / 2),
            ))

        log.info(f"Number of PVCs in Bound state {len(pvc_bound_list)}")

        # Get PVC creation time
        pvc_create_time = helpers.measure_pvc_creation_time_bulk(
            interface=interface,
            pvc_name_list=pvc_bound_list,
            wait_time=300,
        )

        # TODO: Update the code below with the Google API, to record the values in a spreadsheet
        # TODO: For now a Google API limit is observed when doing more than 100 writes
        log_path = f"{ocsci_log_path()}/{interface}-{access_mode}"
        with open(f"{log_path}-creation-time.csv", "w") as fd:
            csv_obj = csv.writer(fd)
            for k, v in pvc_create_time.items():
                csv_obj.writerow([k, v])
        log.info(f"Create data present in {log_path}-creation-time.csv file")

        # Get pv_name, require pv_name to fetch deletion time data from log
        pv_name_list = list()
        get_kube_job_1 = job_file1.get(namespace=self.namespace)
        for i in range(int(scale_pvc_count / 2)):
            pv_name_list.append(
                get_kube_job_1["items"][i]["spec"]["volumeName"])

        get_kube_job_2 = job_file2.get(namespace=self.namespace)
        for i in range(int(scale_pvc_count / 2)):
            pv_name_list.append(
                get_kube_job_2["items"][i]["spec"]["volumeName"])

        # Delete kube_job
        job_file1.delete(namespace=self.namespace)
        job_file2.delete(namespace=self.namespace)

        # Adding a 1 min wait for the PVC deletion logs to be updated
        # Failures were observed when checking the logs for pvc delete time immediately
        # https://github.com/red-hat-storage/ocs-ci/issues/3371
        time.sleep(60)

        # Get PVC deletion time
        pvc_deletion_time = helpers.measure_pv_deletion_time_bulk(
            interface=interface, pv_name_list=pv_name_list)

        # Update result to csv file.
        # TODO: Update the code below with the Google API, to record the values in a spreadsheet
        # TODO: For now a Google API limit is observed when doing more than 100 writes
        with open(f"{log_path}-deletion-time.csv", "w") as fd:
            csv_obj = csv.writer(fd)
            for k, v in pvc_deletion_time.items():
                csv_obj.writerow([k, v])
        log.info(f"Delete data present in {log_path}-deletion-time.csv file")
        end_time = default_timer()
        log.info(f"Elapsed time -- {end_time - self.start_time} seconds")
    def test_bulk_clone_performance(self, namespace, tmp_path):
        """
        Creates a number of PVCs in bulk using a kube job
        Writes 60% of the PVC capacity to each of the created PVCs
        Creates 1 clone per PVC, all together in bulk
        Measures total and csi creation times for the bulk of clones

        """
        pvc_count = 50
        vol_size = "5Gi"
        job_pod_file, job_pvc_file, job_clone_file = [None, None, None]
        log.info(f"Start creating {self.interface} {pvc_count} PVC")
        if self.interface == constants.CEPHBLOCKPOOL:
            sc_name = constants.DEFAULT_STORAGECLASS_RBD
            clone_yaml = constants.CSI_RBD_PVC_CLONE_YAML
        elif self.interface == constants.CEPHFILESYSTEM:
            sc_name = constants.DEFAULT_STORAGECLASS_CEPHFS
            clone_yaml = constants.CSI_CEPHFS_PVC_CLONE_YAML

        try:
            pvc_dict_list = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
                no_of_pvc=pvc_count,
                access_mode=constants.ACCESS_MODE_RWO,
                sc_name=sc_name,
                pvc_size=vol_size,
            )

            job_pvc_file = ObjectConfFile(
                name="job_profile_pvc",
                obj_dict_list=pvc_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )

            # Create kube_job
            job_pvc_file.create(namespace=self.namespace)

            # Check all the PVC reached Bound state
            pvc_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
                kube_job_obj=job_pvc_file,
                namespace=self.namespace,
                no_of_pvc=pvc_count,
            )

            log.info(f"Number of PVCs in Bound state {len(pvc_bound_list)}")

            # Kube_job to Create pod
            pod_dict_list = scale_lib.attach_multiple_pvc_to_pod_dict(
                pvc_list=pvc_bound_list,
                namespace=self.namespace,
                pvcs_per_pod=1,
                start_io=False,
                pod_yaml=constants.NGINX_POD_YAML,
            )
            job_pod_file = ObjectConfFile(
                name="job_profile_pod",
                obj_dict_list=pod_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )
            job_pod_file.create(namespace=self.namespace)

            # Check all PODs in Running state
            scale_lib.check_all_pod_reached_running_state_in_kube_job(
                kube_job_obj=job_pod_file,
                namespace=self.namespace,
                no_of_pod=len(pod_dict_list),
                timeout=90,
            )
            log.info(f"Number of PODs in Running state {len(pod_dict_list)}")

            total_files_size = self.run_fio_on_pvcs(vol_size)

            clone_dict_list = scale_lib.construct_pvc_clone_yaml_bulk_for_kube_job(
                pvc_dict_list, clone_yaml, sc_name)

            log.info("Created clone dict list")

            csi_bulk_start_time = self.get_time(time_format="csi")

            job_clone_file = ObjectConfFile(
                name="job_profile_clone",
                obj_dict_list=clone_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )

            # Create kube_job that creates clones
            job_clone_file.create(namespace=self.namespace)

            log.info("Going to check bound status for clones")
            # Check all the clones reached Bound state
            clone_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
                kube_job_obj=job_clone_file,
                namespace=self.namespace,
                no_of_pvc=pvc_count,
                timeout=180,
            )

            log.info(
                f"Number of clones in Bound state {len(clone_bound_list)}")

            clone_objs = []
            all_pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
            for clone_yaml in clone_dict_list:
                name = clone_yaml["metadata"]["name"]
                size = clone_yaml["spec"]["resources"]["requests"]["storage"]
                log.info(f"Clone {name} of size {size} created")
                for pvc_obj in all_pvc_objs:
                    if pvc_obj.name == name:
                        clone_objs.append(pvc_obj)

            assert len(clone_bound_list) == len(
                clone_objs
            ), "Not all clones reached BOUND state, cannot measure time"
            start_time = helpers.get_provision_time(self.interface,
                                                    clone_objs,
                                                    status="start")
            end_time = helpers.get_provision_time(self.interface,
                                                  clone_objs,
                                                  status="end")
            total_time = (end_time - start_time).total_seconds()
            speed = round(total_files_size / total_time, 2)

            csi_creation_time = performance_lib.csi_bulk_pvc_time_measure(
                self.interface, clone_objs, "create", csi_bulk_start_time)

            log.info(
                f"Total creation time = {total_time} secs, csi creation time = {csi_creation_time},"
                f" data size = {total_files_size} MB, speed = {speed} MB/sec "
                f"for {self.interface} clone in bulk of {pvc_count} clones.")

            # Produce ES report
            # Collecting environment information
            self.get_env_info()

            # Initialize the results doc file.
            full_results = self.init_full_results(
                ResultsAnalyse(
                    self.uuid,
                    self.crd_data,
                    self.full_log_path,
                    "bulk_clone_perf_fullres",
                ))

            full_results.add_key("interface", self.interface)
            full_results.add_key("bulk_size", pvc_count)
            full_results.add_key("clone_size", vol_size)
            full_results.add_key("bulk_creation_time", total_time)
            full_results.add_key("bulk_csi_creation_time", csi_creation_time)
            full_results.add_key("data_size(MB)", total_files_size)
            full_results.add_key("speed", speed)
            full_results.add_key("es_results_link",
                                 full_results.results_link())

            # Write the test results into the ES server
            full_results.es_write()
            self.results_path = get_full_test_logs_path(cname=self)
            res_link = full_results.results_link()
            # write the ES link to the test results in the test log.
            log.info(f"The result can be found at : {res_link}")

            # Create a text file with the results of all subtests (3, according to the parameters)
            self.write_result_to_file(res_link)

        # The finally block is used to clean up the resources created.
        # It is executed irrespective of whether the try block passes or fails.
        finally:
            # Cleanup activities
            log.info(
                "Cleanup of all the resources created during test execution")
            if job_pod_file:
                job_pod_file.delete(namespace=self.namespace)
                job_pod_file.wait_for_delete(resource_name=job_pod_file.name,
                                             namespace=self.namespace)

            if job_clone_file:
                job_clone_file.delete(namespace=self.namespace)
                job_clone_file.wait_for_delete(
                    resource_name=job_clone_file.name,
                    namespace=self.namespace)

            if job_pvc_file:
                job_pvc_file.delete(namespace=self.namespace)
                job_pvc_file.wait_for_delete(resource_name=job_pvc_file.name,
                                             namespace=self.namespace)

            # Check ceph health status
            utils.ceph_health_check(tries=20)
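For reference, the throughput reported by this test is simply the total data written by fio divided by the bulk provisioning time. A sketch of that calculation with hypothetical numbers matching the parameters above (50 PVCs, 60% of a 5Gi volume written to each):

# Illustrative recomputation of the reported speed; all values are assumed.
pvc_count = 50
data_per_pvc_mb = 3072                            # 60% of a 5Gi volume, in MB
total_files_size = pvc_count * data_per_pvc_mb    # 153600 MB
total_time = 480.0                                # seconds between first "start" and last "end" event
speed = round(total_files_size / total_time, 2)   # 320.0 MB/sec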
Exemplo n.º 19
0
class LogReaderWriterParallel(object):
    """
    Write and read logfile stored on cephfs volume, from all worker nodes of a
    cluster via k8s Deployment, while fetching content of the stored data via
    oc rsync to check the data locally.

    TODO: Update the test after the issue https://github.com/red-hat-storage/ocs-ci/issues/5724
    is completed.

    """
    def __init__(
        self,
        project,
        tmp_path,
        storage_size=2,
    ):
        """
        Init of the LogReaderWriterParallel object

        Args:
            project (pytest fixture): The project fixture.
            tmp_path (pytest fixture): The tmp_path fixture.
            storage_size (int): The size of the storage in GiB. The default value is 2 GiB.

        """
        self.project = project
        self.tmp_path = tmp_path

        self.pvc_dict = get_pvc_dict()
        # we need to mount the volume on every worker node, so RWX/cephfs
        self.pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
        self.pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
        if storagecluster_independent_check(
        ) and not is_managed_service_cluster():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
        else:
            sc_name = constants.CEPHFILESYSTEM_SC
        logger.info(f"Storage class name = {sc_name}")
        self.pvc_dict["spec"]["storageClassName"] = sc_name
        self.pvc_dict["spec"]["resources"]["requests"][
            "storage"] = f"{storage_size}Gi"

        self.deploy_dict = {}
        self.workload_file = None
        self.ocp_pod = None

        self.local_dir = self.tmp_path / "logwriter"
        self.local_dir.mkdir()

    def log_reader_writer_parallel(self):
        """
        Write and read logfile stored on cephfs volume, from all worker nodes of a
        cluster via k8s Deployment.

        Raises:
            NotFoundError: When the given volume is not found in the given spec
            UnexpectedBehaviour: When an unexpected problem with starting the workload occurs

        """

        # get deployment dict for the reproducer logwriter workload
        with open(constants.LOGWRITER_CEPHFS_REPRODUCER,
                  "r") as deployment_file:
            self.deploy_dict = yaml.safe_load(deployment_file.read())
        # if we are running in disconnected environment, we need to mirror the
        # container image first, and then use the mirror instead of the original
        if config.DEPLOYMENT.get("disconnected"):
            update_container_with_mirrored_image(
                self.deploy_dict["spec"]["template"])
        # we need to match deployment replicas with number of worker nodes
        self.deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
        # drop topology spread constraints related to zones
        topology.drop_topology_constraint(
            self.deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL)
        # and link the deployment with the pvc
        try:
            link_spec_volume(
                self.deploy_dict["spec"]["template"]["spec"],
                "logwriter-cephfs-volume",
                self.pvc_dict["metadata"]["name"],
            )
        except (exceptions.NotFoundError, KeyError) as ex:
            logger.warning(
                "Failed to link the deployment with the pvc. We may need to check if the "
                "LOGWRITER_CEPHFS_REPRODUCER still matches the code of this test"
            )
            raise ex

        # prepare k8s yaml file for deployment
        self.workload_file = ObjectConfFile(
            "log_reader_writer_parallel",
            [self.pvc_dict, self.deploy_dict],
            self.project,
            self.tmp_path,
        )
        # deploy the workload, starting the log reader/writer pods
        logger.info(
            "starting log reader/writer workload via Deployment, one pod per worker"
        )
        self.workload_file.create()

        logger.info("waiting for all pods of the workload Deployment to run")
        self.ocp_pod = ocp.OCP(kind="Pod", namespace=self.project.namespace)
        try:
            self.ocp_pod.wait_for_resource(
                resource_count=self.deploy_dict["spec"]["replicas"],
                condition=constants.STATUS_RUNNING,
                error_condition=constants.STATUS_ERROR,
                timeout=300,
                sleep=30,
            )
        except Exception as ex:
            # this is not a problem with feature under test, but with infra,
            # cluster configuration or unrelated bug which must have happened
            # before this test case
            error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
            logger.exception(error_msg)
            logger.debug(self.workload_file.describe())
            raise exceptions.UnexpectedBehaviour(error_msg) from ex

    def fetch_and_validate_data(self):
        """
        While the workload is running, try to validate the data
        from the cephfs volume of the workload.

        Raises:
            NotFoundError: When the given volume is not found in the given spec
            Exception: When the data verification job fails

        """
        # if no obvious problem was detected, run the logreader job to validate
        # checksums in the log files (so that we are 100% sure that nothing went
        # wrong with the IO or the data)
        with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
            job_dict = yaml.safe_load(job_file.read())
        # if we are running in disconnected environment, we need to mirror the
        # container image first, and then use the mirror instead of the original
        if config.DEPLOYMENT.get("disconnected"):
            update_container_with_mirrored_image(
                self.deploy_dict["spec"]["template"])
        # drop topology spread constraints related to zones
        topology.drop_topology_constraint(job_dict["spec"]["template"]["spec"],
                                          topology.ZONE_LABEL)
        # we need to match number of jobs with the number used in the workload
        job_dict["spec"]["completions"] = self.deploy_dict["spec"]["replicas"]
        job_dict["spec"]["parallelism"] = self.deploy_dict["spec"]["replicas"]
        # and refer to the correct pvc name
        try:
            link_spec_volume(
                job_dict["spec"]["template"]["spec"],
                "logwriter-cephfs-volume",
                self.pvc_dict["metadata"]["name"],
            )
        except (exceptions.NotFoundError, KeyError) as ex:
            logger.warning(
                "Failed to link the deployment with the pvc. We may need to check if the "
                "LOGWRITER_CEPHFS_REPRODUCER still matches the code of this test"
            )
            raise ex

        # prepare k8s yaml file for the job
        job_file = ObjectConfFile("log_reader", [job_dict], self.project,
                                  self.tmp_path)
        # deploy the job, starting the log reader pods
        logger.info(
            "starting log reader data validation job to fully check the log data",
        )
        job_file.create()
        # wait for the logreader job to complete (this should be rather quick)
        try:
            job.wait_for_job_completion(
                job_name=job_dict["metadata"]["name"],
                namespace=self.project.namespace,
                timeout=300,
                sleep_time=30,
            )
        except exceptions.TimeoutExpiredError:
            error_msg = "verification failed to complete in time: probably data loss or broken cluster"
            raise Exception(error_msg)
        # and then check that the job completed with success
        logger.info("checking the result of data validation job")
        logger.debug(job_file.describe())
        ocp_job = ocp.OCP(
            kind="Job",
            namespace=self.project.namespace,
            resource_name=job_dict["metadata"]["name"],
        )
        job_status = ocp_job.get()["status"]
        logger.info("last status of data verification job: %s", job_status)
        if ("failed" in job_status or job_status["succeeded"] !=
                self.deploy_dict["spec"]["replicas"]):
            error_msg = "possible data corruption: data verification job failed!"
            logger.error(error_msg)
            job.log_output_of_job_pods(job_name=job_dict["metadata"]["name"],
                                       namespace=self.project.namespace)
            raise Exception(error_msg)
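A minimal usage sketch of the class above, assuming the ``project`` and ``tmp_path`` pytest fixtures are available; the real test around it may insert disruptive steps (node reboot, upgrade) between the two calls:

def test_log_reader_writer_parallel(project, tmp_path):
    # Illustrative driver only; the method names come from the class above.
    workload = LogReaderWriterParallel(project, tmp_path, storage_size=2)
    workload.log_reader_writer_parallel()   # deploy the writer Deployment, one pod per worker
    workload.fetch_and_validate_data()      # run the logreader Job and verify checksums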
Exemplo n.º 20
0
def setup_netsplit(tmp_path,
                   master_zones,
                   worker_zones,
                   x_addr_list=None,
                   arbiter_zone=None):
    """
    Deploy machineconfig with network split scripts and configuration, tailored
    for the current cluster state.

    Args:
        tmp_path(pathlib.Path): Directory where a temporary yaml file will
                be created. In test context, use pytest fixture ``tmp_path``.
        master_zones(list[str]): zones where master nodes are placed
        worker_zones(list[str]): zones where worker nodes are placed
        x_addr_list(list[str]): IP addresses of external services (zone x)
        arbiter_zone(str): name of arbiter zone if arbiter deployment is used

    Raises:
        UnexpectedDeploymentConfiguration: in case of invalid cluster
            configuration, which prevents deployment of network split scripts
        ValueError: in case given zone configuration doesn't make any sense
    """
    logger.info("going to deploy ocpnetsplit scripts")
    # checking assumptions: each node has a zone label
    if not are_zone_labels_present():
        msg = "to use network_split_setup, all nodes needs a zone label"
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
    # check zone assumptions: all worker zones are master zones as well
    worker_zones_without_master = set(worker_zones).difference(
        set(master_zones))
    if len(worker_zones_without_master) != 0:
        msg = ("there are zones which contains worker nodes, "
               f"but no master nodes: {worker_zones_without_master}")
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
    if (arbiter_zone is not None) and (arbiter_zone not in master_zones):
        msg = "given arbiter zone not found among master zones"
        logger.error(msg)
        raise ValueError(msg)
    if len(master_zones) == 3:
        zone_a, zone_b, zone_c = master_zones
        # handle arbiter (so that zone a is always arbiter) if specified
        if arbiter_zone is not None:
            zone_a = arbiter_zone
            other_zones = master_zones.copy()
            other_zones.remove(arbiter_zone)
            zone_b, zone_c = other_zones
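            # e.g. master_zones ["z1", "z2", "z3"] with arbiter_zone "z2" gives
            # zone_a="z2" (arbiter), zone_b="z1", zone_c="z3" (illustrative names)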
    else:
        msg = "ocpnetsplit can handle only 3 zones, setup can't continue"
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
    # we assume that there are just 2 machine config pools: master and worker
    mcp_h = OCP(kind="MachineConfigPool", namespace="openshift-config")
    mcp_objects = mcp_h.get()
    mcp_names = [i["metadata"]["name"] for i in mcp_objects["items"]]
    if len(mcp_names) != 2:
        msg = ("ocpnetsplit can handle only 2 machine config pools, "
               f"but there are {mcp_names}")
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
    for exp_pool in ("master", "worker"):
        if exp_pool not in mcp_names:
            msg = f"MachineConfigPool/{exp_pool} not found"
            logger.error(msg)
            raise exceptions.UnexpectedDeploymentConfiguration(msg)
    # generate zone config (list of node ip addresses for each zone)
    zone_config = ocpnetsplit.main.get_zone_config(zone_a, zone_b, zone_c,
                                                   x_addr_list)
    zone_env = zone_config.get_env_file()
    # get machineconfig for the network split firewall scripts
    mc = ocpnetsplit.main.get_networksplit_mc_spec(zone_env)
    # deploy it within openshift-config namespace
    mc_file = ObjectConfFile("network-split", mc, None, tmp_path)
    mc_file.create(namespace="openshift-config")
    # now let's make sure the MCO (machine config operator) noticed the just
    # deployed network-split machineconfig and started to process it
    logger.info("waiting for both machineconfigpools to be updating "
                "as a result of deployment of network-split machineconfig")
    mcp_h.wait_for_resource(
        resource_count=2,
        condition="True",
        column="UPDATING",
        sleep=5,
        timeout=120,
    )
    # and now wait for MachineConfigPools to be updated and ready
    logger.info("waiting for both machineconfigpools to be updated and ready")
    mcp_h.wait_for_resource(
        resource_count=2,
        condition="True",
        column="UPDATED",
        sleep=60,
        timeout=1800,
    )
    # also check that no pools are degraded
    mcp_h.wait_for_resource(
        resource_count=2,
        condition="False",
        column="DEGRADED",
        sleep=10,
        timeout=120,
    )