Example #1
    def test_add_capacity(self, node_multiplier, capacity):
        """
        Test to add variable capacity to the OSD cluster while IOs are running

        Args:
            node_multiplier: the number of OSDs to add per worker node
            capacity: the storage capacity of each OSD
        """
        dt = config.ENV_DATA['deployment_type']
        if dt == 'ipi':
            storage_cluster = machine_utils.get_storage_cluster(
                namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            worker_nodes = len(get_typed_nodes())
            machine_utils.add_capacity(storagecluster_name=storage_cluster,
                                       count=worker_nodes * node_multiplier)
            machine_utils.add_storage_capacity(
                storagecluster_name=storage_cluster, capacity=capacity)
            pod_obj = ocp.OCP(kind=constants.POD,
                              namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            assert pod_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.OSD_APP_LABEL,
                resource_count=worker_nodes * node_multiplier,
                timeout=600), "OSD pods failed to reach RUNNING state"
        else:
            pytest.skip("UPI not yet supported")
Example #2
def initialize_data():
    """
    Initialize the data dictionary with cluster data

    Returns:
        dict: A dictionary containing the data to push to the dashboard
    """
    worker_type = get_typed_nodes(
        num_of_nodes=1
    )[0].data['metadata']['labels']['beta.kubernetes.io/instance-type']

    (ocs_ver_info, _) = get_ocs_version()
    ocs_ver_full = ocs_ver_info['status']['desired']['version']
    m = re.match(r"(\d.\d).(\d)-", ocs_ver_full)
    if m and m.group(1) is not None:
        ocs_ver = m.group(1)
    platform = config.ENV_DATA['platform']
    if platform.lower() == 'aws':
        platform = platform.upper() + " " + worker_type
    data_template['commitid'] = ocs_ver_full
    data_template['project'] = f"OCS{ocs_ver}"
    data_template['branch'] = ocs_ver_info['spec']['channel']
    data_template['executable'] = ocs_ver
    data_template['environment'] = platform

    return data_template
Example #3
def initialize_data():
    """
    Initialize the data dictionary with cluster data

    Returns:
        dict: A dictionary containing the data to push to the dashboard
    """

    # worker type is relevant only for cloud instances.
    log.info('Initializing the dashboard data')
    worker_lbl = get_typed_nodes(num_of_nodes=1)[0].data['metadata']['labels']
    if 'beta.kubernetes.io/instance-type' in worker_lbl:
        worker_type = worker_lbl['beta.kubernetes.io/instance-type']
    else:
        # TODO: Maybe for non-cloud platforms we can add the arch?
        #   worker_type = worker_lbl['kubernetes.io/arch']
        worker_type = ""
    log.info(f'The worker type is {worker_type}')

    (ocs_ver_info, _) = get_ocs_version()
    ocs_ver_full = ocs_ver_info['status']['desired']['version']
    m = re.match(r"(\d.\d).(\d)", ocs_ver_full)
    if m and m.group(1) is not None:
        ocs_ver = m.group(1)
    log.info(f'The full OCS version is {ocs_ver_full}')
    platform = config.ENV_DATA['platform']
    if platform.lower() not in ['vsphere', 'baremetal']:
        platform = f'{platform.upper()} {worker_type}'
    data_template['commitid'] = ocs_ver_full
    data_template['project'] = f"OCS{ocs_ver}"
    data_template['branch'] = ocs_ver_info['spec']['channel']
    data_template['executable'] = ocs_ver
    data_template['environment'] = platform

    return data_template
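
The version extraction in the two initialize_data() variants above hinges on the regex match succeeding; guarding on m avoids an AttributeError when the build string has an unexpected format. A minimal standalone sketch of the same extraction, assuming a full build string like "4.3.0-418.ci" (the sample value is only an illustration):

import re

ocs_ver_full = "4.3.0-418.ci"  # hypothetical full build string, for illustration
m = re.match(r"(\d.\d).(\d)", ocs_ver_full)
# group(1) is the "major.minor" part, group(2) is the patch digit
if m and m.group(1) is not None:
    ocs_ver = m.group(1)
    print(ocs_ver, m.group(2))  # -> 4.3 0
else:
    raise ValueError(f"Unexpected version string: {ocs_ver_full}")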
Example #4
    def test_2_nodes_maintenance_same_type(
        self, pvc_factory, pod_factory, nodes_type
    ):
        """
        OCS-1273/OCS-1271:
        - Maintenance (mark as unschedulable and drain) 2 worker/master nodes
        - Mark the nodes as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 2 nodes
        typed_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Put the nodes into maintenance (unschedule and drain)
        node.drain_nodes(typed_node_names)

        # Mark the nodes back to schedulable
        node.schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
Example #5
    def node_operations_entry_criteria(self,
                                       node_type,
                                       number_of_nodes,
                                       operation_name="Node Operation",
                                       network_fail_time=None):
        """
        Entry criteria function for node related operations

        Args:
            node_type (str): Type of node
            number_of_nodes (int): Number of nodes
            operation_name (str): Name of the node operation
            network_fail_time (int): Total time to fail the network in a node

        Returns:
            tuple: The parameters used in the node operation

        """
        self.validate_cluster(node_status=True, operation_name=operation_name)

        logger.info(f"Getting parameters related to: {operation_name}")
        typed_nodes = node.get_typed_nodes(node_type=node_type,
                                           num_of_nodes=number_of_nodes)
        if network_fail_time:
            return typed_nodes, network_fail_time
        else:
            return typed_nodes
Example #6
    def test_run_couchbase_node_drain(self, cb_setup, node_type='master'):
        """
        Test couchbase workload with node drain
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(
            node_type='worker', print_table=True
        )

        # Node drain with specific node type
        typed_nodes = node.get_typed_nodes(
            node_type=node_type, num_of_nodes=1
        )
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        for sample in TimeoutSampler(300, 5, self.cb.result.done):
            if sample:
                break
            else:
                logging.info(
                    "#### ....Waiting for couchbase threads to complete..."
                )
        utils.ceph_health_check()
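
The loop above repeatedly samples self.cb.result.done through TimeoutSampler and breaks as soon as it returns a truthy value. A simplified, self-contained stand-in for that polling pattern is sketched below; it only assumes the (timeout, sleep, func) calling convention visible in the test and is not the ocs-ci implementation:

import time

def timeout_sampler(timeout, sleep, func, *args, **kwargs):
    # Yield func() results until the timeout expires (simplified sketch)
    end = time.time() + timeout
    while time.time() < end:
        yield func(*args, **kwargs)
        time.sleep(sleep)
    raise TimeoutError(f"{func.__name__} did not succeed within {timeout}s")

# Usage mirroring the test: break out once the sampled call reports completion
for sample in timeout_sampler(300, 5, lambda: True):
    if sample:
        break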
Example #7
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            node.get_typed_nodes(
                node_type=node_type, num_of_nodes=1
            )[0] for node_type in ['worker', 'master']
        ]
        assert nodes, "Failed to find the nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Put the nodes into maintenance (unschedule and drain)
        node.drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        node.schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #8
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node
        typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        typed_node_name = typed_nodes[0].name

        # Put the node into maintenance (unschedule and drain)
        node.drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #9
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get a list of 2 nodes. Pick the one that doesn't have the
        # rook operator running on it
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        typed_node_name = typed_nodes[0].name
        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get('name') == typed_node_name:
            typed_node_name = typed_nodes[1].name
        # End of workaround for BZ 1778488

        # Put the node into maintenance (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #10
    def test_scale_osds_reboot_nodes(self, interface, project_factory,
                                     multi_pvc_factory, dc_pod_factory):
        """
        Check storage utilization and run IO until OSD utilization reaches 50%,
        then scale OSDs from 3 to 6, check for rebalance and reboot worker nodes
        """
        current_osd_count = count_cluster_osd()
        proj_obj = project_factory()
        if current_osd_count == 3:
            while not validate_osd_utilization(osd_used=50):
                # Create pvc
                pvc_objs = multi_pvc_factory(project=proj_obj,
                                             interface=interface,
                                             size=self.pvc_size,
                                             num_of_pvc=self.num_of_pvcs)

                dc_pod_objs = list()
                for pvc_obj in pvc_objs:
                    dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))

                wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs)

                for pod_obj in dc_pod_objs:
                    pod_obj.run_io(storage_type='fs',
                                   size='3G',
                                   runtime='60',
                                   fio_filename=f'{pod_obj.name}_io')

        # Add capacity
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod = OCP(kind=constants.POD,
                  namespace=config.ENV_DATA['cluster_namespace'])
        pod.wait_for_resource(timeout=300,
                              condition=constants.STATUS_RUNNING,
                              selector='app=rook-ceph-osd',
                              resource_count=count * 3)
        assert ceph_health_check(), "Ceph health check failed after adding capacity"

        cluster = CephCluster()

        # Get rebalance status
        rebalance_status = cluster.get_rebalance_status()
        logger.info(rebalance_status)
        if rebalance_status:
            time_taken = cluster.time_taken_to_complete_rebalance()
            logger.info(f"The time taken to complete rebalance {time_taken}")

        # Rolling reboot on worker nodes
        worker_nodes = get_typed_nodes(node_type='worker')

        factory = platform_nodes.PlatformNodesFactory()
        nodes = factory.get_nodes_platform()

        for node in worker_nodes:
            nodes.restart_nodes(nodes=[node])
            wait_for_nodes_status()

        assert ceph_health_check(
            delay=180), "Ceph health check failed after node reboots"
Example #11
    def get_node_name_where_jenkins_pod_not_hosted(
        self, node_type=constants.WORKER_MACHINE, num_of_nodes=1
    ):
        """
        Get names of nodes on which no Jenkins pod is hosted

        Args:
            node_type (str): The node type  (e.g. worker, master)
            num_of_nodes (int): The number of nodes to be returned

        Returns:
            list: List of node names
        """
        if node_type == constants.MASTER_MACHINE:
            nodes_drain = [node.name for node in get_typed_nodes(
                node_type=node_type, num_of_nodes=num_of_nodes
            )]
        elif node_type == constants.WORKER_MACHINE:
            pod_objs = []
            for project in self.projects:
                pod_names = get_pod_name_by_pattern(
                    pattern='jenkins', namespace=project
                )
                pod_obj = [get_pod_obj(name=pod_name, namespace=project) for pod_name in pod_names]
                pod_objs += pod_obj
            nodes_app_name = set(get_app_pod_running_nodes(pod_objs))
            nodes_worker_name = set(get_worker_nodes())
            nodes_drain = nodes_worker_name - nodes_app_name
        else:
            raise ValueError('The node type must be worker or master')
        return list(nodes_drain)[:num_of_nodes]
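
For worker nodes, the selection above is a set difference: all worker node names minus the names of nodes that host a Jenkins pod, truncated to num_of_nodes. A self-contained sketch of that selection with toy data (node names are illustrative only):

# Toy stand-ins for get_worker_nodes() and get_app_pod_running_nodes()
nodes_worker_name = {"worker-0", "worker-1", "worker-2"}
nodes_app_name = {"worker-1"}   # nodes currently hosting Jenkins pods
num_of_nodes = 1

# Drain candidates are workers that do not run any Jenkins pod
nodes_drain = nodes_worker_name - nodes_app_name
print(sorted(nodes_drain)[:num_of_nodes])   # e.g. ['worker-0']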
Example #12
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Put the node into maintenance (unschedule and drain). The function logs its progress
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
Example #13
    def test_node_maintenance(self, reduce_cluster_load, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Put the node into maintenance (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=90)
Example #14
def verify_image_versions(old_images, upgrade_version):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS

    """
    number_of_worker_nodes = len(get_typed_nodes())
    osd_count = get_osd_count()
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    # In 4.3 the noobaa app selector matches 3 pods: noobaa-core-ID,
    # noobaa-db-ID and noobaa-operator-ID; in 4.2 only 2: noobaa-core-ID
    # and noobaa-operator-ID
    nooba_pods = 2 if upgrade_version < parse_version('4.3') else 3
    verify_pods_upgraded(old_images,
                         selector=constants.NOOBAA_APP_LABEL,
                         count=nooba_pods)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(old_images, selector=constants.MON_APP_LABEL, count=3)
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    # OSD upgrade has a 10 minute timeout before a new attempt if the cluster
    # is not healthy. https://bugzilla.redhat.com/show_bug.cgi?id=1840729
    # Setting the timeout to 12.5 minutes per OSD
    verify_pods_upgraded(
        old_images,
        selector=constants.OSD_APP_LABEL,
        count=osd_count,
        timeout=750 * osd_count,
    )
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS or (
            config.ENV_DATA.get('platform') == constants.AZURE_PLATFORM):
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 - RGW count is 1
        # post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        # TODO: uncomment the below 1 line:
        # rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 else 1
        # TODO: Delete the below 1 line
        rgw_count = 1
        verify_pods_upgraded(old_images,
                             selector=constants.RGW_APP_LABEL,
                             count=rgw_count)
Example #15
def get_new_device_paths(device_sets_required, osd_size_capacity_requested):
    """
    Get new device paths to add capacity over Baremetal cluster

    Args:
        device_sets_required (int) : Count of device sets to be added
        osd_size_capacity_requested (int) : Requested OSD size capacity

    Returns:
        list : List of existing and newly added device paths

    """
    ocp_obj = OCP(kind='localvolume',
                  namespace=config.ENV_DATA['local_storage_namespace'])
    workers = get_typed_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    config.ENV_DATA['worker_replicas'] = len(worker_names)
    output = ocp_obj.get(resource_name='local-block')
    # Fetch device paths present in the current LVCR
    cur_device_list = output["spec"]["storageClassDevices"][0]["devicePaths"]
    # Clone repo and run playbook to fetch all device paths from each node
    path = os.path.join(constants.EXTERNAL_DIR, "device-by-id-ocp")
    clone_repo(constants.OCP_QE_DEVICEPATH_REPO, path)
    os.chdir(path)
    run_cmd("ansible-playbook devices_by_id.yml")
    # Filter unused/unallocated device paths
    with open("local-storage-block.yaml", "r") as cloned_file:
        with open("local-block.yaml", "w") as our_file:
            device_from_worker = [1] * config.ENV_DATA['worker_replicas']
            cur_line = cloned_file.readline()
            while "devicePaths:" not in cur_line:
                our_file.write(cur_line)
                cur_line = cloned_file.readline()
            our_file.write(cur_line)
            cur_line = cloned_file.readline()
            # Add required number of device path from each worker node
            while cur_line:
                if str(osd_size_capacity_requested) in cur_line:
                    for i in range(len(worker_names)):
                        if device_from_worker[i] and (str(worker_names[i])
                                                      in cur_line):
                            if not any(s in cur_line for s in cur_device_list):
                                our_file.write(cur_line)
                                device_from_worker[
                                    i] = device_from_worker[i] - 1
                cur_line = cloned_file.readline()
    with open("local-block.yaml") as local_block_yaml:
        lvcr = yaml.load(local_block_yaml, Loader=yaml.FullLoader)
    new_dev_paths = lvcr["spec"]["storageClassDevices"][0]["devicePaths"]
    logger.info(f"Newly added devices are: {new_dev_paths}")
    if new_dev_paths:
        assert len(new_dev_paths) == (
            len(worker_names) * device_sets_required), (
                f"Current devices available = {len(new_dev_paths)}")
        os.chdir(constants.TOP_DIR)
        shutil.rmtree(path)
        # Return list of old device paths and newly added device paths
        cur_device_list.extend(new_dev_paths)
    return cur_device_list
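
The filtering loop above keeps a per-worker counter so that at most device_sets_required new device lines are copied for each worker, and any line whose device path already appears in the current LVCR is skipped. A standalone sketch of that bookkeeping with toy data (worker names and device paths are illustrative only):

# Toy stand-ins for the playbook output and the current LVCR device list
worker_names = ["worker-0", "worker-1"]
device_sets_required = 1
cur_device_list = ["/dev/disk/by-id/already-used-0"]
candidate_lines = [
    "  - /dev/disk/by-id/already-used-0   # worker-0",
    "  - /dev/disk/by-id/new-disk-a       # worker-0",
    "  - /dev/disk/by-id/new-disk-b       # worker-1",
]

# One counter per worker: how many more devices may still be taken from it
quota = {name: device_sets_required for name in worker_names}
selected = []
for line in candidate_lines:
    for name in worker_names:
        if quota[name] and name in line:
            # Skip paths that are already part of the current device list
            if not any(dev in line for dev in cur_device_list):
                selected.append(line)
                quota[name] -= 1
print(selected)   # one new device line per worker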
Example #16
def get_new_device_paths(device_sets_required, osd_size_capacity_requested):
    """
    Get new device paths to add capacity over Baremetal cluster

    Args:
        device_sets_required (int) : Count of device sets to be added
        osd_size_capacity_requested (int) : Requested OSD size capacity

    Returns:
        cur_device_list (list) : List of existing and newly added device paths

    """
    ocp_obj = OCP()
    workers = get_typed_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    output = ocp_obj.exec_oc_cmd("get localvolume local-block -n local-storage -o yaml")
    cur_device_list = output["spec"]["storageClassDevices"][0]["devicePaths"]
    path = os.path.join(constants.EXTERNAL_DIR, "device-by-id-ocp")
    utils.clone_repo(constants.OCP_QE_DEVICEPATH_REPO, path)
    os.chdir(path)
    utils.run_cmd("ansible-playbook devices_by_id.yml")
    with open("local-storage-block.yaml", "r") as cloned_file:
        with open("local-block.yaml", "w") as our_file:
            device_from_worker1 = device_sets_required
            device_from_worker2 = device_sets_required
            device_from_worker3 = device_sets_required
            cur_line = cloned_file.readline()
            while "devicePaths:" not in cur_line:
                our_file.write(cur_line)
                cur_line = cloned_file.readline()
            our_file.write(cur_line)
            cur_line = cloned_file.readline()
            # Add required number of device path from each node
            while cur_line:
                if str(osd_size_capacity_requested) in cur_line:
                    if device_from_worker1 and (str(worker_names[0]) in cur_line):
                        if not any(s in cur_line for s in cur_device_list):
                            our_file.write(cur_line)
                            device_from_worker1 = device_from_worker1 - 1
                    if device_from_worker2 and (str(worker_names[1]) in cur_line):
                        if not any(s in cur_line for s in cur_device_list):
                            our_file.write(cur_line)
                            device_from_worker2 = device_from_worker2 - 1
                    if device_from_worker3 and (str(worker_names[2]) in cur_line):
                        if not any(s in cur_line for s in cur_device_list):
                            our_file.write(cur_line)
                            device_from_worker3 = device_from_worker3 - 1
                cur_line = cloned_file.readline()
    with open("local-block.yaml") as local_block_yaml:
        lvcr = yaml.load(local_block_yaml, Loader=yaml.FullLoader)
    new_dev_paths = lvcr["spec"]["storageClassDevices"][0]["devicePaths"]
    log.info(f"Newly added devices are: {new_dev_paths}")
    assert len(new_dev_paths) == (len(worker_names) * device_sets_required), (
        f"Current devices available = {len(new_dev_paths)}"
    )
    os.chdir(constants.TOP_DIR)
    shutil.rmtree(path)
    cur_device_list.extend(new_dev_paths)
    return cur_device_list
Example #17
def get_environment_info():
    """
    Get the environment information. The information that will be collected:

    Versions:
        OCP - version / build / channel
        OCS - version / build
        Ceph - version
        Rook - version

    Platform:
        BM / VmWare / Cloud provider etc.
        Instance type / architecture
        Cluster name
        User name that run the test

    Returns:
        dict: Dictionary that contains the environment information

    """
    results = {}
    # Get the name and email of the user running the test
    try:
        user = utils.run_cmd('git config --get user.name').strip()
        email = utils.run_cmd('git config --get user.email').strip()
        results['user'] = f'{user} <{email}>'
    except CommandFailed:
        # If no git user is defined, the default user is empty
        results['user'] = ''

    results['clustername'] = ocp.get_clustername()
    results['platform'] = node.get_provider()
    if results['platform'].lower() not in constants.ON_PREM_PLATFORMS:
        results['platform'] = results['platform'].upper()

    results['ocp_build'] = ocp.get_build()
    results['ocp_channel'] = ocp.get_ocp_channel()
    results['ocp_version'] = utils.get_ocp_version()

    results['ceph_version'] = utils.get_ceph_version()
    results['rook_version'] = utils.get_rook_version()

    results['ocs_build'] = ocp.get_ocs_version()
    # Extracting the version number x.y.z from full build name
    m = re.match(r"(\d.\d).(\d)", results['ocs_build'])
    if m and m.group(1) is not None:
        results['ocs_version'] = m.group(1)

    # Get the instance type for cloud platforms, or the arch type otherwise
    worker_lbl = node.get_typed_nodes(
        num_of_nodes=1)[0].data['metadata']['labels']
    if 'beta.kubernetes.io/instance-type' in worker_lbl:
        results['worker_type'] = worker_lbl['beta.kubernetes.io/instance-type']
    else:
        results['worker_type'] = worker_lbl['kubernetes.io/arch']

    return results
Example #18
def verify_image_versions(old_images, upgrade_version):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS

    """
    namespace = config.ENV_DATA['cluster_namespace']
    number_of_worker_nodes = len(get_typed_nodes())
    storage_cluster = StorageCluster(
        resource_name=config.ENV_DATA['storage_cluster_name'],
        namespace=namespace)
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica']))
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    # In 4.3 the noobaa app selector matches 3 pods: noobaa-core-ID,
    # noobaa-db-ID and noobaa-operator-ID; in 4.2 only 2: noobaa-core-ID
    # and noobaa-operator-ID
    nooba_pods = 2 if upgrade_version < parse_version('4.3') else 3
    verify_pods_upgraded(old_images,
                         selector=constants.NOOBAA_APP_LABEL,
                         count=nooba_pods)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    verify_pods_upgraded(old_images, selector=constants.MON_APP_LABEL, count=3)
    verify_pods_upgraded(
        old_images,
        selector=constants.OSD_APP_LABEL,
        count=osd_count,
        timeout=1800,
    )
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        verify_pods_upgraded(old_images,
                             selector=constants.RGW_APP_LABEL,
                             count=1)
Example #19
def verify_image_versions(old_images, upgrade_version):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS

    """
    number_of_worker_nodes = len(get_typed_nodes())
    osd_count = get_osd_count()
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    # In 4.3 the noobaa app selector matches 3 pods: noobaa-core-ID,
    # noobaa-db-ID and noobaa-operator-ID; in 4.2 only 2: noobaa-core-ID
    # and noobaa-operator-ID
    nooba_pods = 2 if upgrade_version < parse_version('4.3') else 3
    verify_pods_upgraded(old_images,
                         selector=constants.NOOBAA_APP_LABEL,
                         count=nooba_pods)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(old_images, selector=constants.MON_APP_LABEL, count=3)
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    # OSD upgrade has a 10 minute timeout before a new attempt if the cluster
    # is not healthy. https://bugzilla.redhat.com/show_bug.cgi?id=1840729
    # Setting the timeout to 12.5 minutes per OSD
    verify_pods_upgraded(
        old_images,
        selector=constants.OSD_APP_LABEL,
        count=osd_count,
        timeout=750 * osd_count,
    )
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        verify_pods_upgraded(old_images,
                             selector=constants.RGW_APP_LABEL,
                             count=1)
Example #20
    def test_run_pgsql_node_drain(self,
                                  pgsql,
                                  transactions=900,
                                  node_type='master'):
        """
        Test pgsql workload with node drain
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)

        # Node drain with specific node type
        typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
Example #21
def add_worker_based_on_pods_count_per_node(node_count,
                                            expected_count,
                                            role_type=None,
                                            machineset_name=None):
    """
    Evaluate the number of pods running on each node and add new nodes accordingly.

    Args:
        machineset_name (list): Machineset names to add more nodes to if required
        node_count (int): Additional nodes to be added
        expected_count (int): Expected pod count on one node
        role_type (str): Type of the nodes being added

    Returns:
        bool: True if nodes get added, else False.

    """
    # Check the running pod count on each node
    if config.ENV_DATA['deployment_type'] == 'ipi' and config.ENV_DATA[
            'platform'].lower() == 'aws':
        app_nodes = node.get_typed_nodes(node_type=role_type)
        pod_count_dict = node.get_running_pod_count_from_node(
            node_type=role_type)
        high_count_nodes, less_count_nodes = ([] for i in range(2))
        for node_obj in app_nodes:
            count = pod_count_dict[f"{node_obj.name}"]
            if count >= expected_count:
                high_count_nodes.append(node_obj.name)
            else:
                less_count_nodes.append(node_obj.name)
        if len(less_count_nodes) <= 1:
            for name in machineset_name:
                count = machine.get_replica_count(machine_set=name)
                machine.add_node(machine_set=name, count=(count + node_count))
                machine.wait_for_new_node_to_be_ready(name)
            return True
        else:
            logging.info(
                f"Enough pods can be created with available nodes {pod_count_dict}"
            )
            return False
    elif config.ENV_DATA['deployment_type'] == 'upi' and config.ENV_DATA[
            'platform'].lower() in ('vsphere', 'baremetal', 'azure'):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
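
The AWS/IPI branch above splits the application nodes into those at or above the expected pod count and those below it, and scales the machinesets only when at most one node still has spare capacity. A standalone sketch of that decision with toy pod counts (node names and counts are illustrative only):

# Toy pod counts per node, standing in for get_running_pod_count_from_node()
pod_count_dict = {"app-node-0": 28, "app-node-1": 30, "app-node-2": 12}
expected_count = 25

high_count_nodes, less_count_nodes = [], []
for name, count in pod_count_dict.items():
    if count >= expected_count:
        high_count_nodes.append(name)
    else:
        less_count_nodes.append(name)

# Scale up only when at most one node still has room for new pods
scale_up = len(less_count_nodes) <= 1
print(high_count_nodes, less_count_nodes, scale_up)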
Example #22
def add_worker_based_on_cpu_utilization(node_count,
                                        expected_percent,
                                        role_type=None,
                                        machineset_name=None):
    """
    Evaluate CPU utilization of nodes and add nodes if required.

    Args:
        machineset_name (list): Machineset names to add more nodes to if required
        node_count (int): Additional nodes to be added
        expected_percent (int): Expected utilization percent
        role_type (str): Type of the nodes being added

    Returns:
        bool: True if nodes get added, else False.

    """
    # Check CPU utilization on each node
    if config.ENV_DATA['deployment_type'] == 'ipi' and config.ENV_DATA[
            'platform'].lower() == 'aws':
        app_nodes = node.get_typed_nodes(node_type=role_type)
        uti_dict = node.get_node_resource_utilization_from_oc_describe(
            node_type=role_type)
        uti_high_nodes, uti_less_nodes = ([] for i in range(2))
        for node_obj in app_nodes:
            utilization_percent = uti_dict[f"{node_obj.name}"]['cpu']
            if utilization_percent > expected_percent:
                uti_high_nodes.append(node_obj.name)
            else:
                uti_less_nodes.append(node_obj.name)
        if len(uti_less_nodes) <= 1:
            for name in machineset_name:
                count = machine.get_replica_count(machine_set=name)
                machine.add_node(machine_set=name, count=(count + node_count))
                machine.wait_for_new_node_to_be_ready(name)
            return True
        else:
            logging.info(
                f"Enough resource available for more pod creation {uti_dict}")
            return False
    elif config.ENV_DATA['deployment_type'] == 'upi' and config.ENV_DATA[
            'platform'].lower() in ('vsphere', 'baremetal', 'azure'):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
Example #23
def verify_image_versions(old_images):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images

    """
    namespace = config.ENV_DATA['cluster_namespace']
    number_of_worker_nodes = len(get_typed_nodes())
    storage_cluster = StorageCluster(
        resource_name=config.ENV_DATA['storage_cluster_name'],
        namespace=namespace)
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica']))
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    verify_pods_upgraded(old_images,
                         selector=constants.NOOBAA_APP_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(old_images,
                         selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
                         count=2)
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    verify_pods_upgraded(old_images, selector=constants.MON_APP_LABEL, count=3)
    verify_pods_upgraded(old_images,
                         selector=constants.OSD_APP_LABEL,
                         count=osd_count)
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        verify_pods_upgraded(old_images,
                             selector=constants.RGW_APP_LABEL,
                             count=1)
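
In the verify_image_versions variants that do not use get_osd_count(), the OSD count is derived from the StorageCluster spec as count multiplied by replica of the first storageDeviceSets entry. A toy illustration of that arithmetic with an assumed spec fragment:

# Hypothetical fragment of a StorageCluster resource, for illustration only
storage_cluster_data = {
    "spec": {"storageDeviceSets": [{"count": 1, "replica": 3}]}
}

device_set = storage_cluster_data["spec"]["storageDeviceSets"][0]
osd_count = int(device_set["count"]) * int(device_set["replica"])
print(osd_count)   # 1 device set replicated 3 ways -> 3 OSDs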
Example #24
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type='master',
                                                   print_table=True)

        if pod_name_of_node == 'couchbase':
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == 'osd':
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == 'master':
            master_node = get_typed_nodes(pod_name_of_node, num_of_nodes=1)

        # Restart relevant node
        if pod_name_of_node == 'master':
            nodes.restart_nodes(master_node, wait=False)
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
        else:
            restart_node = get_node_objs(node_list[random.randint(
                0,
                len(node_list) - 1)])
            nodes.restart_nodes(restart_node)

        # Validate all nodes and services are in READY state and up

        retry((CommandFailed, TimeoutError, AssertionError,
               ResourceWrongStatusException),
              tries=60,
              delay=15)(ocp.wait_for_cluster_connectivity)(tries=400)
        retry((CommandFailed, TimeoutError, AssertionError,
               ResourceWrongStatusException),
              tries=60,
              delay=15)(wait_for_nodes_status)(timeout=1800)
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        retry((CommandFailed), tries=60,
              delay=15)(bg_handler.wait_for_bg_operations)(bg_ops, timeout=3600)
        self.sanity_helpers.health_check()
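
retry(...) here is a decorator factory: retry(exceptions, tries, delay) returns a decorator that wraps a function object, and only the wrapped function is then called with its arguments. Passing an already-evaluated call such as retry(...)(wait_for_nodes_status(timeout=1800)) would run the function once before any retry logic applies, which is why the calls above wrap the function first. A minimal self-contained sketch of the pattern; this retry helper is a simplified stand-in, not the ocs-ci implementation:

import functools
import time

def retry(exceptions, tries=3, delay=1):
    # Return a decorator that retries func on the given exceptions (sketch)
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == tries:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator

calls = {"count": 0}

def flaky(threshold):
    # Toy function that fails until it has been called `threshold` times
    calls["count"] += 1
    if calls["count"] < threshold:
        raise TimeoutError("not ready yet")
    return "ready"

# Correct usage: wrap the function object, then call it with its arguments
print(retry((TimeoutError,), tries=5, delay=0)(flaky)(threshold=3))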
Example #25
    def test_monitoring_after_rebooting_master_node(self, nodes, pods):
        """
        Test case to validate rebooting master node shouldn't delete
        the data collected on prometheus pod

        """

        # Get the master node list
        master_nodes = get_typed_nodes(node_type='master')

        # Reboot the master nodes one by one
        for node in master_nodes:
            nodes.restart_nodes([node])

            wait_for_nodes_status_and_prometheus_health_check(pods)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()
Example #26
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get a list of 2 nodes. Pick the one that doesn't have the
        # rook operator running on it
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get('name') == typed_node_name:
            typed_node_name = typed_nodes[1].name
        # End of workaround for BZ 1778488

        # Put the node into maintenance (unschedule and drain). The function logs its progress
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
Example #27
    def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup):
        """
        Test case to validate that rebooting a node shouldn't affect
        amq workloads running in the background

        """
        # Get all amq pods
        pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

        # Get the node list
        node = get_typed_nodes(node_type, num_of_nodes=1)

        # Reboot the node
        nodes.restart_nodes(node, wait=False)

        # Wait some time after rebooting the node
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate all nodes and services are in READY state and up
        retry((CommandFailed, TimeoutError, AssertionError,
               ResourceWrongStatusException),
              tries=60,
              delay=15)(ocp.wait_for_cluster_connectivity)(tries=400)
        retry((CommandFailed, TimeoutError, AssertionError,
               ResourceWrongStatusException),
              tries=60,
              delay=15)(wait_for_nodes_status)(timeout=1800)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all amq pods are up and running
        assert POD.wait_for_resource(condition='Running',
                                     resource_count=len(pod_obj_list),
                                     timeout=300)

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)
Example #28
    def test_detach_attach_worker_volume(self, aws_obj, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Attach back the volume to the node
        - Restart the node so the volume will get re-mounted

        """
        # Requesting 1 worker node for the test as this case includes detach and
        # attach of data volume of 1 worker node
        worker = node.get_typed_nodes(num_of_nodes=1)
        assert worker, "Failed to find a worker node for the test"
        worker = worker[0]

        # Get the worker node's ec2 instance ID and name
        instance = aws.get_instances_ids_and_names([worker])
        assert instance, f"Failed to get ec2 instances for node {worker.name}"

        instance_id = [*instance][0]

        # Get the ec2 instance data volume Volume instance
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Validate cluster is still functional
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instance so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # Cluster health check
        self.sanity_helpers.health_check()
Example #29
    def test_detach_attach_2_workers_volumes(self, aws_obj, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach the data volume from 2 of the worker nodes
        - Attach back the volume to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Requesting 2 worker nodes for the test as this case includes
        # detach and attach of data volumes of 2 worker nodes
        workers = node.get_typed_nodes(num_of_nodes=2)
        assert workers, "Failed to find worker nodes for the test"

        # Get the worker nodes ec2 instance IDs and names
        instances = aws.get_instances_ids_and_names(workers)
        assert instances, (
            f"Failed to get ec2 instances for nodes {[w.name for w in workers]}"
        )

        for instance_id in instances:

            # Get the ec2 instance data volume Volume instance
            ec2_volume = aws.get_data_volumes(instance_id)[0]

            # Detach volume (logging is done inside the function)
            aws_obj.detach_volume(ec2_volume)

            # Attach volume (logging is done inside the function)
            aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instances so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instances, wait=True)

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Example #30
    def test_detach_attach_worker_volume(self, aws_obj, resources):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Attach back the volume to the node
        - Restart the node so the volume will get re-mounted

        """
        # Requesting 1 worker node for the test as this case includes detach and
        # attach of data volume of 1 worker node
        worker = node.get_typed_nodes(num_of_nodes=1)[0]

        # Get the worker node's ec2 instance ID and name
        instance = aws.get_instances_ids_and_names([worker])
        instance_id = [*instance][0]

        # Get the ec2 instance data volume Volume instance
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Validate cluster is still functional
        self.validate_cluster(resources=resources,
                              nodes=list(instance.values()),
                              health_check=False)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instance so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # Cluster health check
        self.health_check()