def test_monitoring_after_rebooting_node_where_mgr_is_running(self):
        """
        Test case to validate that rebooting the node where the mgr pod
        is running does not delete the data collected on the Prometheus pod
        """

        aws_obj = aws.AWS()

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        instances = aws.get_instances_ids_and_names([mgr_node_obj])
        aws_obj.restart_ec2_instances(instances=instances,
                                      wait=True,
                                      force=True)

        # Validate all nodes are in READY state
        wait_for_nodes_status()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check that the Ceph health metrics are updated with the new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check the created PVC metrics after rebooting the node where the mgr pod was running
        for pod_obj in self.pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
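
The `check_pvcdata_collected_on_prometheus` helper used above is not shown on this page. For orientation, a check of this kind can be approximated with a direct query against the Prometheus HTTP API. A minimal sketch, assuming a reachable Prometheus route, a bearer token, and the kubelet volume-stats metric (all assumptions, not the helper's actual implementation):

import requests

def pvc_metric_present(prom_url, token, pvc_name):
    """Return True if Prometheus holds volume-stats samples for the PVC.

    Illustrative sketch only: the real ocs-ci helper may differ.
    """
    query = f'kubelet_volume_stats_used_bytes{{persistentvolumeclaim="{pvc_name}"}}'
    resp = requests.get(
        f"{prom_url}/api/v1/query",
        params={"query": query},
        headers={"Authorization": f"Bearer {token}"},
        verify=False,  # test-cluster routes often use self-signed certs
    )
    resp.raise_for_status()
    # A non-empty result vector means samples were collected for the PVC
    return len(resp.json()["data"]["result"]) > 0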
Example #2
    def delete_pods(self):
        """
        Try to delete pods:
            - Rook operator
            - OSD
            - MGR
            - MON
        """
        pod_list = []
        rook_operator_pod = pod.get_ocs_operator_pod(
            ocs_label=constants.OPERATOR_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        pod_list.append(rook_operator_pod)

        osd_pods = pod.get_osd_pods()
        pod_list.extend(osd_pods)

        mgr_pods = pod.get_mgr_pods()
        pod_list.extend(mgr_pods)

        mon_pods = pod.get_mon_pods()
        pod_list.extend(mon_pods)

        logger.info(f"Deleting pods: {[p.name for p in pod_list]}")
        pod.delete_pods(pod_objs=pod_list)
Example #3
    def mgr_pod_node_restart(self):
        """
        Restart node that runs mgr pod
        """
        mgr_pod_obj = pod.get_mgr_pods()
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        self.nodes.restart_nodes([mgr_node_obj])

        wait_for_nodes_status()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(
            condition="Running", selector="app=rook-ceph-mgr", timeout=600
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )
Example #4
 def set_resource(self, resource):
     self.resource = resource
     if self.resource == 'mgr':
         self.resource_obj = pod.get_mgr_pods()
         self.type = 'rook-ceph'
     if self.resource == 'mon':
         self.resource_obj = pod.get_mon_pods()
         self.type = 'rook-ceph'
     if self.resource == 'osd':
         self.resource_obj = pod.get_osd_pods()
         self.type = 'rook-ceph'
     if self.resource == 'mds':
         self.resource_obj = pod.get_mds_pods()
         self.type = 'rook-ceph'
     if self.resource == 'cephfsplugin':
         self.resource_obj = pod.get_plugin_pods(
             interface=constants.CEPHFILESYSTEM
         )
         self.type = 'csi'
     if self.resource == 'rbdplugin':
         self.resource_obj = pod.get_plugin_pods(
             interface=constants.CEPHBLOCKPOOL
         )
         self.type = 'csi'
     self.resource_count = len(self.resource_obj)
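
As a style note, the repeated `if` chain in `set_resource` can be collapsed into a dispatch table. A minimal sketch of such a rewrite, assuming the same `pod` helpers and `constants` (a hypothetical refactor, not ocs-ci code):

RESOURCE_GETTERS = {
    'mgr': (pod.get_mgr_pods, 'rook-ceph'),
    'mon': (pod.get_mon_pods, 'rook-ceph'),
    'osd': (pod.get_osd_pods, 'rook-ceph'),
    'mds': (pod.get_mds_pods, 'rook-ceph'),
    'cephfsplugin': (
        lambda: pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM), 'csi'),
    'rbdplugin': (
        lambda: pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL), 'csi'),
}

def set_resource(self, resource):
    # Look up the getter and pod type instead of walking an if chain;
    # unknown resource names raise KeyError
    getter, self.type = RESOURCE_GETTERS[resource]
    self.resource = resource
    self.resource_obj = getter()
    self.resource_count = len(self.resource_obj)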
Example #5
    def scan_cluster(self):
        """
        Get accurate info on current state of pods
        """
        self._ceph_pods = pod.get_all_pods(self._namespace)
        # TODO: Workaround for BZ1748325:
        mons = pod.get_mon_pods(self.mon_selector, self.namespace)
        for mon in mons:
            if mon.ocp.get_resource_status(
                    mon.name) == constants.STATUS_RUNNING:
                self.mons.append(mon)
        # TODO: End of workaround for BZ1748325
        self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
        self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
        self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
        self.toolbox = pod.get_ceph_tools_pod()

        # set port attrib on mon pods
        self.mons = list(map(self.set_port, self.mons))
        self.cluster.reload()
        if self.cephfs:
            self.cephfs.reload()
        else:
            try:
                self.cephfs_config = self.CEPHFS.get().get('items')[0]
                self.cephfs = ocs.OCS(**self.cephfs_config)
                self.cephfs.reload()
            except IndexError as e:
                logging.warning(e)
                logging.warning("No CephFS found")

        self.mon_count = len(self.mons)
        self.mds_count = len(self.mdss)
        self.mgr_count = len(self.mgrs)
        self.osd_count = len(self.osds)
Example #6
    def scan_cluster(self):
        """
        Get accurate info on current state of pods
        """
        self._ceph_pods = pod.get_all_pods(self._namespace)
        self.mons = pod.get_mon_pods(self.mon_selector, self.namespace)
        self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
        self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
        self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
        self.toolbox = pod.get_ceph_tools_pod()

        # set port attrib on mon pods
        self.mons = list(map(self.set_port, self.mons))
        self.cluster.reload()
        if self.cephfs:
            self.cephfs.reload()
        else:
            try:
                self.cephfs_config = self.CEPHFS.get().get('items')[0]
                self.cephfs = ocs.OCS(**self.cephfs_config)
                self.cephfs.reload()
            except IndexError as e:
                logging.warning(e)
                logging.warning("No CephFS found")

        self.mon_count = len(self.mons)
        self.mds_count = len(self.mdss)
        self.mgr_count = len(self.mgrs)
        self.osd_count = len(self.osds)
Example #7
def verify_multus_network():
    """
    Verify Multus network(s) created successfully and are present on relevant pods.
    """
    with open(constants.MULTUS_YAML, mode="r") as f:
        multus_public_data = yaml.safe_load(f)
        multus_namespace = multus_public_data["metadata"]["namespace"]
        multus_name = multus_public_data["metadata"]["name"]
        multus_public_network_name = f"{multus_namespace}/{multus_name}"

    log.info("Verifying multus NetworkAttachmentDefinitions")
    ocp.OCP(
        resource_name=multus_public_network_name,
        kind="network-attachment-definitions",
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    # TODO: also check if private NAD exists

    log.info("Verifying multus public network exists on ceph pods")
    osd_pods = get_osd_pods()
    for _pod in osd_pods:
        assert (_pod.data["metadata"]["annotations"]
                ["k8s.v1.cni.cncf.io/networks"] == multus_public_network_name)
    # TODO: also check private network if it exists on OSD pods

    mon_pods = get_mon_pods()
    mds_pods = get_mds_pods()
    mgr_pods = get_mgr_pods()
    rgw_pods = get_rgw_pods()
    ceph_pods = [*mon_pods, *mds_pods, *mgr_pods, *rgw_pods]
    for _pod in ceph_pods:
        assert (_pod.data["metadata"]["annotations"]
                ["k8s.v1.cni.cncf.io/networks"] == multus_public_network_name)

    log.info("Verifying multus public network exists on CSI pods")
    csi_pods = []
    interfaces = [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]
    for interface in interfaces:
        plugin_pods = get_plugin_pods(interface)
        csi_pods += plugin_pods

    cephfs_provisioner_pods = get_cephfsplugin_provisioner_pods()
    rbd_provisioner_pods = get_rbdfsplugin_provisioner_pods()

    csi_pods += cephfs_provisioner_pods
    csi_pods += rbd_provisioner_pods

    for _pod in csi_pods:
        assert (_pod.data["metadata"]["annotations"]
                ["k8s.v1.cni.cncf.io/networks"] == multus_public_network_name)

    log.info("Verifying StorageCluster multus network data")
    sc = get_storage_cluster()
    sc_data = sc.get().get("items")[0]
    network_data = sc_data["spec"]["network"]
    assert network_data["provider"] == "multus"
    selectors = network_data["selectors"]
    assert selectors[
        "public"] == f"{defaults.ROOK_CLUSTER_NAMESPACE}/ocs-public"
Example #8
    def set_resource(self, resource, leader_type="provisioner"):
        self.resource = resource
        if (config.ENV_DATA["platform"]
                in constants.MANAGED_SERVICE_PLATFORMS) and (resource
                                                             in CEPH_PODS):
            # If the platform is Managed Services, then the ceph pods will be present in the provider cluster.
            # Consumer cluster will be the primary cluster context in a multicluster run. Setting 'cluster_kubeconfig'
            # attribute to use as the value of the parameter '--kubeconfig' in the 'oc' commands to get ceph pods.
            provider_kubeconfig = os.path.join(
                config.clusters[
                    config.get_provider_index()].ENV_DATA["cluster_path"],
                config.clusters[config.get_provider_index()].RUN.get(
                    "kubeconfig_location"),
            )
            self.cluster_kubeconfig = provider_kubeconfig
        resource_count = 0
        if self.resource == "mgr":
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == "mon":
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == "osd":
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == "mds":
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == "cephfsplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == "rbdplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == "cephfsplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM,
                    leader_type=leader_type)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == "rbdplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL, leader_type=leader_type)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == "operator":
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
Example #9
 def set_resource(self, resource):
     self.resource = resource
     if self.resource == 'mgr':
         self.resource_obj = pod.get_mgr_pods()
     if self.resource == 'mon':
         self.resource_obj = pod.get_mon_pods()
     if self.resource == 'osd':
         self.resource_obj = pod.get_osd_pods()
     if self.resource == 'mds':
         self.resource_obj = pod.get_mds_pods()
     self.resource_count = len(self.resource_obj)
Example #10
    def test_monitoring_after_rebooting_node_where_mgr_is_running(
            self, nodes, pods):
        """
        Test case to validate that rebooting the node where the mgr pod
        is running does not delete the data collected on the Prometheus pod

        """

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        nodes.restart_nodes([mgr_node_obj])

        # Validate all nodes are in READY state
        retry((CommandFailed, ResourceWrongStatusException),
              tries=20,
              delay=15)(wait_for_nodes_status)()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(condition="Running",
                                         selector="app=rook-ceph-mgr",
                                         timeout=600)
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check(tries=40)

        # Check that the Ceph health metrics are updated with the new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check the created PVC metrics after rebooting the node where the mgr pod was running
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
Example #11
def test_pod_log_after_upgrade():
    """
    Check OSD/MON/MGR pod logs after upgrade and verify that the expected log exists

    """
    pod_objs = get_osd_pods() + get_mon_pods() + get_mgr_pods()
    pod_names = [pod_obj.name for pod_obj in pod_objs]
    expected_log_after_upgrade = "set uid:gid to 167:167 (ceph:ceph)"
    logging.info(f"Check that the log '{expected_log_after_upgrade}' "
                 f"appears after the osd/mon/mg pod is initialized")
    for pod_name in pod_names:
        pod_logs = get_pod_logs(pod_name=pod_name, all_containers=True)
        assert expected_log_after_upgrade in pod_logs, (
            f"The expected log after upgrade '{expected_log_after_upgrade}' does not exist"
            f" on pod {pod_name}")
    logging.info(
        f"The log '{expected_log_after_upgrade}' appears in all relevant pods."
    )
Example #12
def get_node_pods_to_scale_down(node_name):
    """
    Get the pods of a node to scale down, as described in the node
    replacement with LSO documentation

    Args:
        node_name (str): The node name

    Returns:
        list: The node's pods to scale down

    """
    pods_to_scale_down = [
        *pod.get_mon_pods(),
        *pod.get_osd_pods(),
        *pod.get_mgr_pods(),
    ]

    return get_node_pods(node_name, pods_to_scale_down)
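
A hedged usage sketch follows: the deployments that own the returned pods can be scaled to zero with `oc`. Deriving the deployment name by stripping the replicaset and pod hash suffixes is an assumption that holds for Rook's deployment-owned mon/mgr/osd pods, not a general rule, and the `run_cmd` import path is assumed:

from ocs_ci.utility.utils import run_cmd  # assumed import path

def scale_down_node_pods(node_name):
    for p in get_node_pods_to_scale_down(node_name):
        # e.g. 'rook-ceph-mon-a-7c9fb8b6bd-x2x9p' -> 'rook-ceph-mon-a'
        deployment = p.name.rsplit("-", 2)[0]
        run_cmd(
            f"oc -n openshift-storage scale deployment/{deployment} --replicas=0"
        )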
Example #13
    def set_resource(self, resource, leader_type="provisioner"):
        self.resource = resource
        resource_count = 0
        if self.resource == "mgr":
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == "mon":
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == "osd":
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == "mds":
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == "cephfsplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == "rbdplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == "cephfsplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM,
                    leader_type=leader_type)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == "rbdplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL, leader_type=leader_type)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == "operator":
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
Example #14
    def set_resource(self, resource):
        self.resource = resource
        resource_count = 0
        if self.resource == 'mgr':
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == 'mon':
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == 'osd':
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == 'mds':
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == 'cephfsplugin':
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == 'rbdplugin':
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == 'cephfsplugin_provisioner':
            self.resource_obj = [
                pod.plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == 'rbdplugin_provisioner':
            self.resource_obj = [
                pod.plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == 'operator':
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
Example #15
File: cluster.py, Project: xhtheking/ocs-ci
    def scan_cluster(self):
        """
        Get accurate info on current state of pods
        """
        self._ceph_pods = pod.get_all_pods(self._namespace)
        self.mons = pod.get_mon_pods(self.mon_selector, self.namespace)
        self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
        self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
        self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
        self.toolbox = pod.get_ceph_tools_pod()

        # set port attrib on mon pods
        self.mons = list(map(self.set_port, self.mons))
        self.cluster.reload()
        if self.cephfs_config:
            self.cephfs.reload()

        self.mon_count = len(self.mons)
        self.mds_count = len(self.mdss)
        self.mgr_count = len(self.mgrs)
        self.osd_count = len(self.osds)
Example #16
def measure_stop_worker_nodes(request, measurement_dir, nodes):
    """
    Stop worker nodes that don't contain RGW (so that alerts are triggered
    correctly), measure the time when they were stopped, and monitor alerts
    that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            the worker nodes

    """
    mgr_pod = pod.get_mgr_pods()[0]
    mgr_node = pod.get_pod_node(mgr_pod)
    test_nodes = [
        worker_node
        for worker_node in get_nodes(node_type=constants.WORKER_MACHINE)
        if worker_node.name != mgr_node.name
    ]

    def stop_nodes():
        """
        Turn off test nodes for 5 minutes.

        Returns:
            list: Names of nodes that were turned down

        """
        # run_time of operation
        run_time = 60 * 5
        nonlocal test_nodes
        node_names = [node.name for node in test_nodes]
        logger.info(f"Turning off nodes {node_names}")
        nodes.stop_nodes(nodes=test_nodes)
        # Validate node reached NotReady state
        wait_for_nodes_status(node_names=node_names, status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node_names

    def finalizer():
        nodes.restart_nodes_by_stop_and_start_teardown()
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    request.addfinalizer(finalizer)

    test_file = os.path.join(measurement_dir, "measure_stop_nodes.json")
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 3 extra minutes
        measured_op = measure_operation(stop_nodes, test_file, minimal_time=60 * 8)
    else:
        measured_op = measure_operation(stop_nodes, test_file)
    logger.info("Turning on nodes")
    try:
        nodes.start_nodes(nodes=test_nodes)
    except CommandFailed:
        logger.warning(
            "Nodes were not found: they were probably recreated. Check ceph health below"
        )
    # Validate all nodes are in READY state and up
    retry((CommandFailed, ResourceWrongStatusException,), tries=60, delay=15,)(
        wait_for_nodes_status
    )(timeout=900)

    # wait for ceph to return into HEALTH_OK state after mgr deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
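
The `retry(...)` call above is a decorator factory: `retry(exceptions, tries, delay)` returns a decorator, the decorator wraps the function, and only the wrapped function is then called with its real arguments. A minimal sketch of the pattern (an illustration, not ocs-ci's actual implementation):

import functools
import time

def retry(exceptions, tries=3, delay=1):
    """Decorator factory: re-invoke the function when `exceptions` are raised."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == tries:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator

# Call shape used above: decorate first, then invoke with the real arguments:
# retry((CommandFailed,), tries=60, delay=15)(wait_for_nodes_status)(timeout=900)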
Example #17
    def set_resource(self, resource, leader_type="provisioner", cluster_index=None):
        self.resource = resource
        if (config.ENV_DATA["platform"] in constants.MANAGED_SERVICE_PLATFORMS) and (
            resource in CEPH_PODS
        ):
            # If the platform is Managed Services, then the ceph pods will be present in the provider cluster.
            # Consumer cluster will be the primary cluster context in a multicluster run. Setting 'cluster_kubeconfig'
            # attribute to use as the value of the parameter '--kubeconfig' in the 'oc' commands to get ceph pods.
            provider_kubeconfig = os.path.join(
                config.clusters[config.get_provider_index()].ENV_DATA["cluster_path"],
                config.clusters[config.get_provider_index()].RUN.get(
                    "kubeconfig_location"
                ),
            )
            self.cluster_kubeconfig = provider_kubeconfig
        elif config.ENV_DATA["platform"] in constants.MANAGED_SERVICE_PLATFORMS:
            # cluster_index is used to identify the cluster in which the pod is residing. If cluster_index is not
            # passed, assume that the context is already changed to the cluster where the pod is residing.
            cluster_index = (
                cluster_index if cluster_index is not None else config.cur_index
            )
            self.cluster_kubeconfig = os.path.join(
                config.clusters[cluster_index].ENV_DATA["cluster_path"],
                config.clusters[cluster_index].RUN.get("kubeconfig_location"),
            )
        resource_count = 0
        if self.resource == "mgr":
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == "mon":
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == "osd":
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == "mds":
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == "cephfsplugin":
            self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == "rbdplugin":
            self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == "cephfsplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM, leader_type=leader_type
                )
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == "rbdplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL, leader_type=leader_type
                )
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == "operator":
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL
        if self.resource == "ocs_operator":
            self.resource_obj = [pod.get_ocs_operator_pod()]
            self.selector = constants.OCS_OPERATOR_LABEL
        if self.resource == "alertmanager_managed_ocs_alertmanager":
            self.resource_obj = pod.get_alertmanager_managed_ocs_alertmanager_pods()
            self.selector = constants.MANAGED_ALERTMANAGER_LABEL
        if self.resource == "ocs_osd_controller_manager":
            self.resource_obj = [pod.get_ocs_osd_controller_manager_pod()]
            self.selector = constants.MANAGED_CONTROLLER_LABEL
            # Setting resource_count because the odf-operator-controller-manager pod also has the same label.
            resource_count = len(
                pod.get_pods_having_label(
                    constants.MANAGED_CONTROLLER_LABEL,
                    config.ENV_DATA["cluster_namespace"],
                )
            )
        if self.resource == "prometheus_managed_ocs_prometheus":
            self.resource_obj = [pod.get_prometheus_managed_ocs_prometheus_pod()]
            self.selector = constants.MANAGED_PROMETHEUS_LABEL
        if self.resource == "prometheus_operator":
            self.resource_obj = [pod.get_prometheus_operator_pod()]
            self.selector = constants.PROMETHEUS_OPERATOR_LABEL
        if self.resource == "ocs_provider_server":
            self.resource_obj = [pod.get_ocs_provider_server_pod()]
            self.selector = constants.PROVIDER_SERVER_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
Example #18
    def test_coredump_check_for_ceph_daemon_crash(self):
        """
        Verify that the coredumpctl list is updated after killing a daemon

        """
        log.info("Get Node name where mon pod running")
        mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
        mon_pod_node_names = [node.name for node in mon_pod_nodes]

        log.info("Get Node name where mgr pod running")
        mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
        mgr_pod_node_names = [node.name for node in mgr_pod_nodes]

        log.info("Get Node name where osd pod running")
        osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
        osd_pod_node_names = [node.name for node in osd_pod_nodes]

        node_mgr_mon_osd_names = set(mgr_pod_node_names).intersection(
            osd_pod_node_names, mon_pod_node_names)
        node_mgr_osd_names = set(mgr_pod_node_names).intersection(
            osd_pod_node_names)
        node_mgr_mon_names = set(mgr_pod_node_names).intersection(
            mon_pod_node_names)

        if len(node_mgr_mon_osd_names) > 0:
            daemon_types = ["mgr", "osd", "mon"]
            node_name = list(node_mgr_mon_osd_names)[0]
        elif len(node_mgr_osd_names) > 0:
            daemon_types = ["mgr", "osd"]
            node_name = list(node_mgr_osd_names)[0]
        elif len(node_mgr_mon_names) > 0:
            daemon_types = ["mgr", "mon"]
            node_name = list(node_mgr_mon_names)[0]
        else:
            daemon_types = ["mgr"]
            node_name = mgr_pod_node_names[0]
        log.info(f"Test the daemon_types {daemon_types} on node {node_name}")

        log.info("Delete the contents of 'posted' directory "
                 "`/var/lib/rook/openshift-storage/crash/posted/`")
        cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c "
        cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
        cmd = cmd_bash + cmd_delete_files
        run_cmd(cmd=cmd)

        for daemon_type in daemon_types:
            log.info(f"find ceph-{daemon_type} process-id")
            cmd_pid = f"pidof ceph-{daemon_type}"
            cmd_gen = "oc debug node/" + node_name + " -- chroot /host "
            cmd = cmd_gen + cmd_pid
            out = run_cmd(cmd=cmd)
            pids = out.strip().split()
            pid = pids[0]
            if not pid.isnumeric():
                raise Exception(
                    f"The ceph-{daemon_type} process-id was not found.")

            log.info(f"Kill ceph-{daemon_type} process-id {pid}")
            disruptions_obj = Disruptions()
            disruptions_obj.daemon_pid = pid
            disruptions_obj.kill_daemon(node_name=node_name,
                                        check_new_pid=False,
                                        kill_signal="11")

        log.info(
            f"Verify that we have a crash event for ceph-{daemon_types} crash (tool pod)"
        )
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="ceph crash ls",
            expected_output_lst=daemon_types,
            cephtool_cmd=True,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"ceph-{daemon_types} process does not exist on crash list (tool pod)"
            )

        log.info(
            f"Verify coredumpctl list updated after killing {daemon_types} daemons on {node_name}"
        )
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="coredumpctl list",
            expected_output_lst=daemon_types,
            debug_node=node_name,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"coredump not getting generated for ceph-{daemon_types} daemon crash"
            )

        log.info(
            f"Verify the 'posted' crash directory is not empty on {node_name}"
        )
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/",
            expected_output_lst=[":"],
            debug_node=node_name,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"coredump not getting generated for {daemon_types} daemons crash"
            )

        log.info(
            "Verify ceph status moved to HEALTH_WARN state with the relevant "
            "information (daemons have recently crashed)")
        sample = TimeoutSampler(
            timeout=20,
            sleep=5,
            func=run_cmd_verify_cli_output,
            cmd="ceph health detail",
            expected_output_lst=daemon_types +
            ["HEALTH_WARN", "daemons have recently crashed"],
            cephtool_cmd=True,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                "The output of command ceph health detail did not show "
                "warning 'daemons have recently crashed'")
    def test_restart_mgr_while_two_mons_down(self, pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory):
        """
        Test Procedure:
        1. Scale down two mons:
           oc scale --replicas=0 deploy/rook-ceph-mon-a
           oc scale --replicas=0 deploy/rook-ceph-mon-b
        2. Restart the mgr:
           oc delete pod -l app=rook-ceph-mgr
        3. Sleep 5 seconds
        4. Scale the mons back up:
           oc scale --replicas=1 deploy/rook-ceph-mon-a
           oc scale --replicas=1 deploy/rook-ceph-mon-b
        5. Sleep 10 seconds
        6. Wait for the mgr pod to move to Running state:
           oc get pod -l app=rook-ceph-mgr

        """
        self.oc = ocp.OCP(kind=constants.DEPLOYMENT,
                          namespace=config.ENV_DATA["cluster_namespace"])
        mons = [
            mon["metadata"]["name"] for mon in get_deployments_having_label(
                constants.MON_APP_LABEL, defaults.ROOK_CLUSTER_NAMESPACE)
        ]
        self.mons_scale = mons[0:2]
        tries = 11
        for index in range(1, tries):
            log.info(f"Scaling down two mons {self.mons_scale}, index={index}")
            for mon_scale in self.mons_scale:
                self.oc.exec_oc_cmd(
                    f"scale --replicas=0 deployment/{mon_scale}")

            log.info(f"Restarting mgr pod, index={index}")
            mgr_pod = get_mgr_pods()
            mgr_pod[0].delete(wait=True)

            time.sleep(5)

            log.info(f"Scaling up two mons {self.mons_scale}, index={index}")
            for mon_scale in self.mons_scale:
                self.oc.exec_oc_cmd(
                    f"scale --replicas=1 deployment/{mon_scale}")

            time.sleep(10)

            log.info(
                f"Waiting for mgr pod move to Running state, index={index}")
            mgr_pod_obj = ocp.OCP(kind=constants.POD,
                                  namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            assert mgr_pod_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MGR_APP_LABEL,
                resource_count=1,
                timeout=100,
            ), f"Mgr pod did'nt move to Running state after 100 seconds, index={index}"

        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        log.info("Deleting Resources using sanity helpers")
        self.sanity_helpers.delete_resources()
Example #20
    def test_coredump_check_for_ceph_daemon_crash(self, daemon_type):
        """
        Verify that the coredumpctl list is updated after killing a daemon

        """
        log.info(f"Get Node name where {daemon_type} pod running")
        if daemon_type == "mon":
            mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
            node_obj = mon_pod_nodes[0]
        elif daemon_type == "mgr":
            mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
            node_obj = mgr_pod_nodes[0]
        elif daemon_type == "osd":
            osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
            node_obj = osd_pod_nodes[0]
        node_name = node_obj.name

        log.info("Delete the contents of 'posted' directory "
                 "`/var/lib/rook/openshift-storage/crash/posted/`")
        cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c "
        cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
        cmd = cmd_bash + cmd_delete_files
        run_cmd(cmd=cmd)

        log.info(f"find ceph-{daemon_type} process-id")
        cmd_pid = f"pidof ceph-{daemon_type}"
        cmd_gen = "oc debug node/" + node_name + " -- chroot /host "
        cmd = cmd_gen + cmd_pid
        out = run_cmd(cmd=cmd)
        # pidof may return several space-separated PIDs; take the first
        pid = out.strip().split()[0]
        if not pid.isnumeric():
            raise Exception(
                f"The ceph-{daemon_type} process-id was not found.")

        log.info(f"Kill ceph-{daemon_type} process-id {pid}")
        disruptions_obj = Disruptions()
        disruptions_obj.daemon_pid = pid
        disruptions_obj.kill_daemon(node_name=node_name,
                                    check_new_pid=False,
                                    kill_signal="11")

        log.info(
            f"Verify that we have a crash event for ceph-{daemon_type} crash")
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="ceph crash ls",
            expected_output_lst=[daemon_type],
            cephtool_cmd=True,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"ceph-{daemon_type} process does not exist on crash list (tool pod)"
            )

        log.info(
            f"Verify coredumpctl list updated after killing ceph-{daemon_type} daemon"
        )
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="coredumpctl list",
            expected_output_lst=[daemon_type],
            debug_node=node_name,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"coredump not getting generated for ceph-{daemon_type} daemon crash"
            )

        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/",
            expected_output_lst=[":"],
            debug_node=node_name,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"coredump not getting generated for ceph-{daemon_type} daemon crash"
            )