Python measure_operation示例，ocs_ci.utility.workloadfixture.measure_operation Python示例

示例#1

0

显示文件

文件： conftest.py 项目： wusui/ocs-ci

def workload_idle(measurement_dir):
    """
    This workload represents a relative long timeframe when nothing special is
    happening, for test cases checking default status of various components
    (eg. no error alert is reported out of sudden, ceph should be healthy ...).

    Besides sheer waiting, this workload also checks that the number of ceph
    components (OSD and MON only) is the same at start and end of this wait,
    and passess the numbers to the test. If the number changes, something not
    exactly expected was happening with the cluster (eg. some node got offline,
    or cluster was expanded, ...) which doesn't match the idea of idle waiting
    and *invalidates the expectations of this workload*. Running test cases
    which expects idle workload in such case would be misleading, so we fail
    the workload in such case.
    """
    def count_ceph_components():
        ct_pod = pod.get_ceph_tools_pod()
        ceph_osd_ls_list = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd ls")
        logger.debug(f"ceph osd ls output: {ceph_osd_ls_list}")
        # the "+ 1" is a WORKAROUND for a bug in exec_ceph_cmd()
        # https://github.com/red-hat-storage/ocs-ci/issues/1152
        osd_num = len(ceph_osd_ls_list) + 1
        mon_num = len(ct_pod.exec_ceph_cmd(ceph_cmd="ceph mon metadata"))
        logger.info(f"There are {osd_num} OSDs, {mon_num} MONs")
        return osd_num, mon_num

    def do_nothing():
        sleep_time = 60 * 15  # seconds
        logger.info(f"idle workload is about to sleep for {sleep_time} s")
        osd_num_1, mon_num_1 = count_ceph_components()
        time.sleep(sleep_time)
        osd_num_2, mon_num_2 = count_ceph_components()
        # If this fails, we are likely observing an infra error or unsolicited
        # interference with test cluster from the outside. It could also be a
        # product bug, but this is less likely. See also docstring of this
        # workload fixture.
        msg = ("Assumption that nothing serious is happening not met, "
               "number of selected ceph components should be the same")
        assert osd_num_1 == osd_num_2, msg
        assert mon_num_1 == mon_num_2, msg
        assert osd_num_1 >= 3, "OCS cluster should have at least 3 OSDs"
        result = {'osd_num': osd_num_1, 'mon_num': mon_num_1}
        return result

    test_file = os.path.join(measurement_dir, 'measure_workload_idle.json')
    measured_op = measure_operation(do_nothing, test_file)
    return measured_op

示例#2

0

显示文件

文件： conftest.py 项目： sidhant-agrawal/ocs-ci

def measure_stop_ceph_mgr(measurement_dir):
    """
    Downscales Ceph Manager deployment, measures the time when it was
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            Ceph Manager pod
    """
    oc = ocp.OCP(
        kind=constants.DEPLOYMENT, namespace=config.ENV_DATA["cluster_namespace"]
    )
    mgr_deployments = oc.get(selector=constants.MGR_APP_LABEL)["items"]
    mgr = mgr_deployments[0]["metadata"]["name"]

    def stop_mgr():
        """
        Downscale Ceph Manager deployment for 6 minutes. First 5 minutes
        the alert should be in 'Pending'.
        After 5 minutes it should be 'Firing'.
        This configuration of monitoring can be observed in ceph-mixins which
        are used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L25

        Returns:
            str: Name of downscaled deployment
        """
        # run_time of operation
        run_time = 60 * 6
        nonlocal oc
        nonlocal mgr
        logger.info(f"Downscaling deployment {mgr} to 0")
        oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mgr}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return oc.get(mgr)

    test_file = os.path.join(measurement_dir, "measure_stop_ceph_mgr.json")
    measured_op = measure_operation(stop_mgr, test_file)
    logger.info(f"Upscaling deployment {mgr} back to 1")
    oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mgr}")

    # wait for ceph to return into HEALTH_OK state after mgr deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op

示例#3

0

显示文件

文件： conftest.py 项目： sidhant-agrawal/ocs-ci

def measure_stop_worker_node(measurement_dir, nodes):
    """
    Stop one worker node, measure the time when it was stopped and monitors
    alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            worker node

    """
    node = get_nodes(node_type="worker")[0]

    def stop_node():
        """
        Turn off one worker node for 6 minutes.

        Returns:
            str: Node that was turned down

        """
        # run_time of operation
        run_time = 60 * 6
        nonlocal node
        logger.info(f"Turning off node {node.name}")
        nodes.stop_nodes(nodes=[node])
        # Validate node reached NotReady state
        wait_for_nodes_status(node_names=[node.name], status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node.name

    test_file = os.path.join(measurement_dir, "measure_stop_node.json")
    measured_op = measure_operation(stop_node, test_file)
    logger.info(f"Turning on node {node.name}")
    nodes.start_nodes(nodes=[node])
    # Validate all nodes are in READY state and up
    retry((CommandFailed, ResourceWrongStatusException,), tries=60, delay=15,)(
        wait_for_nodes_status
    )(timeout=900)

    # wait for ceph to return into HEALTH_OK state after mgr deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op

示例#4

0

显示文件

文件： conftest.py 项目： sidhant-agrawal/ocs-ci

def measure_stop_rgw(measurement_dir, request, rgw_deployments):
    """
    Downscales RGW deployments, measures the time when it was
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            RGW pods

    """
    oc = ocp.OCP(
        kind=constants.DEPLOYMENT, namespace=config.ENV_DATA["cluster_namespace"]
    )

    def stop_rgw():
        """
        Downscale RGW interface deployments for 5 minutes.

        Returns:
            str: Name of downscaled deployment

        """
        # run_time of operation
        run_time = 60 * 5
        nonlocal oc
        nonlocal rgw_deployments
        for rgw_deployment in rgw_deployments:
            rgw = rgw_deployment["metadata"]["name"]
            logger.info(f"Downscaling deployment {rgw} to 0")
            oc.exec_oc_cmd(f"scale --replicas=0 deployment/{rgw}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return rgw_deployments

    test_file = os.path.join(measurement_dir, "measure_stop_rgw.json")
    measured_op = measure_operation(stop_rgw, test_file)

    logger.info("Return RGW pods")
    for rgw_deployment in rgw_deployments:
        rgw = rgw_deployment["metadata"]["name"]
        logger.info(f"Upscaling deployment {rgw} to 1")
        oc.exec_oc_cmd(f"scale --replicas=1 deployment/{rgw}")

    return measured_op

示例#5

0

显示文件

文件： fiojob.py 项目： ekuric/ocs-ci

def workload_fio_storageutilization(
    fixture_name,
    project,
    fio_pvc_dict,
    fio_job_dict,
    fio_configmap_dict,
    measurement_dir,
    tmp_path,
    target_percentage=None,
    target_size=None,
    with_checksum=False,
):
    """
    This function implements core functionality of fio storage utilization
    workload fixtures. This is necessary because we can't parametrize single
    general fixture over multiple parameters (it would mess with test case id
    and polarion test case tracking).

    It works as a workload fixture, as understood by
    :py:mod:`ocs_ci.utility.workloadfixture` module.

    When ``target_percentage`` is specified, the goal of the fixture is to fill
    whatever is left so that total cluster utilization reaches the target
    percentage. This means that in this mode, number of data written depends
    on both total capacity and current utilization. If the current storage
    utilization already exceeds the target, the test is skipped.

    On the other hand with ``target_size``, you can specify the size of data
    written by fio directly.

    Args:
        fixture_name (str): name of the fixture using this function (for
            logging and k8s object labeling purposes)
        project (ocs_ci.ocs.ocp.OCP): OCP object of project in which the Job is
            deployed, as created by ``project_factory`` or ``project`` fixture
        fio_pvc_dict (dict): PVC k8s struct for fio target volume
        fio_job_dict (dict): Job k8s struct for fio job
        fio_configmap_dict (dict): configmap k8s struct with fio config file
        measurement_dir (str): reference to a fixture which represents a
            directory where measurement results are stored, see also
            :py:func:`ocs_ci.utility.workloadfixture.measure_operation()`
        tmp_path (pathlib.PosixPath): reference to pytest ``tmp_path`` fixture
        target_percentage (float): target utilization as percentage wrt all
            usable OCS space, eg. 0.50 means a request to reach 50% of total
            OCS storage utilization (wrt usable space)
        target_size (int): target size of the PVC for fio to use, eg. 10 means
            a request for fio to write 10GiB of data
        with_checksum (bool): if true, sha1 checksum of the data written by
            fio is stored on the volume, and reclaim policy of the volume is
            changed to ``Retain`` so that the volume is not removed during test
            teardown for later verification runs

    Returns:
        dict: measurement results with timestamps and other medatada from
            :py:func:`ocs_ci.utility.workloadfixture.measure_operation()`

    """
    val_err_msg = "Specify either target_size or target_percentage"
    if target_size is None and target_percentage is None:
        raise ValueError(
            val_err_msg +
            ", it's not clear how much storage space should be used.")
    if target_size is not None and target_percentage is not None:
        raise ValueError(val_err_msg + ", not both.")

    # TODO: move out storage class names
    if fixture_name.endswith("rbd"):
        storage_class_name = "ocs-storagecluster-ceph-rbd"
        ceph_pool_name = "ocs-storagecluster-cephblockpool"
    elif fixture_name.endswith("cephfs"):
        storage_class_name = "ocs-storagecluster-cephfs"
        ceph_pool_name = "ocs-storagecluster-cephfilesystem-data0"
    else:
        raise UnexpectedVolumeType(
            "unexpected volume type, ocs-ci code is wrong")

    # make sure we communicate what is going to happen
    logger.info((f"starting {fixture_name} fixture, "
                 f"using {storage_class_name} storage class "
                 f"backed by {ceph_pool_name} ceph pool"))

    # log ceph mon_osd_*_ratio values for QE team to understand behaviour of
    # ceph cluster during high utilization levels (for expected values, consult
    # BZ 1775432 and check that there is no more recent BZ or JIRA in this
    # area)
    ceph_full_ratios = [
        'mon_osd_full_ratio',
        'mon_osd_backfillfull_ratio',
        'mon_osd_nearfull_ratio',
    ]
    ct_pod = pod.get_ceph_tools_pod()
    for ceph_ratio in ceph_full_ratios:
        logger.info("checking value of %s", ceph_ratio)
        value = ct_pod.exec_ceph_cmd(f'ceph config get mon.* {ceph_ratio}')
        logger.info(f"{ceph_ratio} is {value}")

    if target_size is not None:
        pvc_size = target_size
    else:
        pvc_size = get_storageutilization_size(target_percentage,
                                               ceph_pool_name)

    # To handle use case of test_workload_rbd_cephfs_minimal which writes data
    # to reach a small fraction of the total capacity only (eg. 5%), the test
    # is going increase the target 2x and try again.
    if pvc_size <= 0 and target_percentage is not None and target_percentage <= 0.10:
        new_target_percentage = 2 * target_percentage
        logger.info(
            "increasing storage utilization target percentage from %.2f to %.2f",
            target_percentage, new_target_percentage)
        target_percentage = new_target_percentage
        pvc_size = get_storageutilization_size(target_percentage,
                                               ceph_pool_name)
    # If this is still not enough, the test will be skipped, because the idea
    # of tests reaching a small total utilization is to do just that.
    # Moreover this will also skip this test case for any other utilization
    # level, which is easier to read in the test report than the actual
    # failure with negative pvc size.
    if pvc_size <= 0 and target_percentage is not None:
        skip_msg = (
            "current total storage utilization is too high, "
            f"the target utilization {target_percentage*100}% is already met")
        logger.warning(skip_msg)
        pytest.skip(skip_msg)

    fio_conf = textwrap.dedent("""
        [simple-write]
        readwrite=write
        buffered=1
        blocksize=4k
        ioengine=libaio
        directory=/mnt/target
        """)

    # When we ask for checksum to be generated for all files written in the
    # /mnt/target directory, we need to keep some space free so that the
    # checksum file would fit there. We overestimate this free space so that
    # it works both with CephFS and RBD volumes, as with RBD volumes actuall
    # usable capacity is smaller because of filesystem overhead (pvc size
    # defines size of a block device, on which local ext4 filesystem is
    # formatted).
    if with_checksum:
        # assume 4% fs overhead, and double to it make it safe
        fs_overhead = 0.08
        # size of file created by fio in MiB
        fio_size = int((pvc_size * (1 - fs_overhead)) * 2**10)
        fio_conf += f"size={fio_size}M\n"
    # Otherwise, we are tryting to write as much data as possible and fill the
    # persistent volume entirely.
    # For cephfs we can't use fill_fs because of BZ 1763808 (the process
    # will get *Disk quota exceeded* error instead of *No space left on
    # device* error).
    # On the other hand, we can't use size={pvc_size} for rbd, as we can't
    # write pvc_size bytes to a filesystem on a block device of {pvc_size}
    # size (obviously, some space is used by filesystem metadata).
    elif fixture_name.endswith("rbd"):
        fio_conf += "fill_fs=1\n"
    else:
        fio_conf += f"size={pvc_size}G\n"

    # When we ask for checksum to be generated for all files written in the
    # /mnt/target directory, we change the command of the container to run
    # both fio and sha1 checksum tool in the target directory. To do that,
    # we use '/bin/sh -c' hack.
    if with_checksum:
        container = fio_job_dict['spec']['template']['spec']['containers'][0]
        fio_command = " ".join(container['command'])
        sha_command = ("sha1sum /mnt/target/simple-write.*"
                       " > /mnt/target/fio.sha1sum"
                       " 2> /mnt/target/fio.stderr")
        shell_command = fio_command + " && " + sha_command
        container['command'] = ["/bin/bash", "-c", shell_command]

    # put the dicts together into yaml file of the Job
    fio_configmap_dict["data"]["workload.fio"] = fio_conf
    fio_pvc_dict["spec"]["storageClassName"] = storage_class_name
    fio_pvc_dict["spec"]["resources"]["requests"]["storage"] = f"{pvc_size}Gi"
    fio_objs = [fio_pvc_dict, fio_configmap_dict, fio_job_dict]
    fio_job_file = ObjectConfFile(fixture_name, fio_objs, project, tmp_path)

    fio_min_mbps = config.ENV_DATA['fio_storageutilization_min_mbps']
    write_timeout = get_timeout(fio_min_mbps, pvc_size)

    test_file = os.path.join(measurement_dir, f"{fixture_name}.json")

    measured_op = measure_operation(lambda: write_data_via_fio(
        fio_job_file, write_timeout, pvc_size, target_percentage),
                                    test_file,
                                    measure_after=True,
                                    minimal_time=480)

    # we don't need to delete anything if this fixture has been already
    # executed
    if not measured_op['first_run']:
        return measured_op

    # measure MAX AVAIL value just before reclamaion of data written by fio
    _, max_avail_before_delete = get_ceph_storage_stats(ceph_pool_name)

    def is_storage_reclaimed():
        """
        Check whether data created by the Job were actually deleted.
        """
        _, max_avail = get_ceph_storage_stats(ceph_pool_name)
        reclaimed_size = round((max_avail - max_avail_before_delete) / 2**30)
        logger.info("%d Gi of %d Gi (PVC size) seems already reclaimed",
                    reclaimed_size, pvc_size)
        result = reclaimed_size >= pvc_size * 0.9
        if result:
            logger.info("Storage for the PVC was at least 90% reclaimed.")
        else:
            logger.info("Storage for the PVC was not yet reclaimed enough.")
        return result

    if with_checksum:
        # Let's get the name of the PV via the PVC.
        ocp_pvc = ocp.OCP(kind=constants.PVC, namespace=project.namespace)
        pvc_data = ocp_pvc.get()
        # Explicit list of assumptions, if these assumptions are not met, the
        # code won't work and it either means that something went terrible
        # wrong or that the code needs to be changed.
        assert pvc_data['kind'] == "List"
        assert len(pvc_data['items']) == 1
        pvc_dict = pvc_data['items'][0]
        assert pvc_dict['kind'] == constants.PVC
        pv_name = pvc_dict['spec']['volumeName']
        logger.info("Identified PV of the finished fio Job: %s", pv_name)
        # We change reclaim policy of the volume, so that we can reuse it
        # later, while everyting but the volume will be deleted during project
        # teardown. Note that while a standard way of doing this would be via
        # custom storage class with redefined reclaim policy, we need to do
        # this on this single volume only here, so editing volume directly is
        # more straightforward.
        logger.info("Changing persistentVolumeReclaimPolicy of %s", pv_name)
        ocp_pv = ocp.OCP(kind=constants.PV)
        patch_success = ocp_pv.patch(
            resource_name=pv_name,
            params='{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}')
        if patch_success:
            logger.info('Reclaim policy of %s was changed.', pv_name)
        else:
            logger.error('Reclaim policy of %s failed to be changed.', pv_name)
        label = f'fixture={fixture_name}'
        ocp_pv.add_label(pv_name, label)
    else:
        # Without checksum, we just need to make sure that data were deleted
        # and wait for this to happen to avoid conflicts with tests executed
        # right after this one.
        delete_fio_data(fio_job_file, is_storage_reclaimed)

    return measured_op

示例#6

0

显示文件