Example #1
def wait_for_replication_resources_deletion(namespace, timeout, check_state=True):
    """
    Wait for replication resources to be deleted

    Args:
        namespace (str): the namespace of the resources
        timeout (int): time in seconds to wait for resources to reach expected
            state or deleted
        check_state (bool): True for checking resources state before deletion, False otherwise

    Raises:
        TimeoutExpiredError: In case replication resources not deleted

    """
    if check_state:
        logger.info("Waiting for all VRs to reach secondary state")
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_vr_state,
            state="secondary",
            namespace=namespace,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = "One or more VR haven't reached expected state secondary within the time limit."
            logger.error(error_msg)
            raise TimeoutExpiredError(error_msg)

        logger.info("Waiting for VRG to reach secondary state")
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_vrg_state,
            state="secondary",
            namespace=namespace,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = (
                "VRG hasn't reached expected state secondary within the time limit."
            )
            logger.info(error_msg)
            raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for VRG to be deleted")
    sample = TimeoutSampler(
        timeout=timeout, sleep=5, func=check_vrg_existence, namespace=namespace
    )
    if not sample.wait_for_func_status(result=False):
        error_msg = "VRG resource not deleted"
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for all VRs to be deleted")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=get_vr_count,
        namespace=namespace,
    )
    sample.wait_for_func_value(0)
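A minimal usage sketch for the waiter above; the namespace value is hypothetical, and logger and TimeoutExpiredError are assumed to be imported as in the module the helper lives in:

# Hedged usage sketch; the namespace name below is an assumption.
try:
    wait_for_replication_resources_deletion(
        namespace="busybox-workloads-1", timeout=900, check_state=True
    )
except TimeoutExpiredError:
    logger.error("Replication resources were not cleaned up within 900 seconds")
    raise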
Example #2
    def detach_and_delete_vols(self, volumes):
        """
        Detach and delete volumes from the list

        Args:
            volumes (list): list of Volume objects

        """
        for v in volumes:
            if v.status == "in-use":
                v.detach()
                v.get()
                sample = TimeoutSampler(
                    100,
                    5,
                    self.check_expected_vol_status,
                    vol=v,
                    expected_state="available",
                )
                if not sample.wait_for_func_status(True):
                    logger.error(f"Volume {v.name} failed to detach")
                    raise exceptions.PSIVolumeNotInExpectedState()

            v.delete()
            sample = TimeoutSampler(100, 5, self.check_vol_deleted, vol=v)
            if not sample.wait_for_func_status(True):
                logger.error(f"Failed to delete Volume {v.name}")
                raise exceptions.PSIVolumeDeletionFailed()
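A hedged usage sketch for the method above; `psi_util` and `get_leftover_volumes` are hypothetical names used only for illustration:

# Hypothetical caller cleaning up leftover volumes after a test run.
leftover_vols = psi_util.get_leftover_volumes(prefix="test-cluster-")  # assumed helper
psi_util.detach_and_delete_vols(leftover_vols)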
Example #3
    def verify_obc(self):
        """
        OBC verification from the external cluster perspective;
        we will check 2 OBCs

        """
        sample = TimeoutSampler(300, 5, self.ceph_cluster.noobaa_health_check)
        sample.wait_for_func_status(True)
Example #4
def wait_for_replication_resources_creation(vr_count, namespace, timeout):
    """
    Wait for replication resources to be created

    Args:
        vr_count (int): Expected number of VR resources
        namespace (str): the namespace of the VR resources
        timeout (int): time in seconds to wait for VR resources to be created
            or reach expected state

    Raises:
        TimeoutExpiredError: In case replication resources not created

    """
    logger.info("Waiting for VRG to be created")
    sample = TimeoutSampler(
        timeout=timeout, sleep=5, func=check_vrg_existence, namespace=namespace
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "VRG resource is not created"
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info(f"Waiting for {vr_count} VRs to be created")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=get_vr_count,
        namespace=namespace,
    )
    sample.wait_for_func_value(vr_count)

    logger.info(f"Waiting for {vr_count} VRs to reach primary state")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=check_vr_state,
        state="primary",
        namespace=namespace,
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "One or more VR haven't reached expected state primary within the time limit."
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for VRG to reach primary state")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=check_vrg_state,
        state="primary",
        namespace=namespace,
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "VRG hasn't reached expected state primary within the time limit."
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)
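A hedged sketch of how the creation waiter above and the deletion waiter from Example #1 might be paired around a failover; the VR count, namespace, and timeouts are assumptions, not a confirmed DR procedure:

namespace = "busybox-workloads-1"  # hypothetical workload namespace

# On the cluster the workload moves to: wait for VRG/VRs to exist and reach primary
wait_for_replication_resources_creation(vr_count=3, namespace=namespace, timeout=900)

# On the cluster the workload moved away from: wait for the resources to be cleaned up
wait_for_replication_resources_deletion(namespace=namespace, timeout=900, check_state=True)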
Example #5
    def check_capacity_breakdown(self, project_name, pod_name):
        """
        Check Capacity Breakdown

        Args:
            project_name (str): The name of the project
            pod_name (str): The name of the pod

        Returns:
            bool: True if project_name and pod_name exist on capacity_breakdown, False otherwise

        """
        self.navigate_overview_page()
        if self.ocp_version == "4.7":
            self.do_click(self.validation_loc["persistent_storage_tab"])
        self.choose_expanded_mode(
            mode=True,
            locator=self.validation_loc["capacity_breakdown_options"])
        self.do_click(self.validation_loc["capacity_breakdown_projects"])
        self.take_screenshot()
        res = True
        sample = TimeoutSampler(
            timeout=30,
            sleep=2,
            func=self.check_element_text,
            expected_text=project_name,
        )
        if not sample.wait_for_func_status(result=True):
            logger.error(
                f"The project {project_name} not found on capacity_breakdown")
            res = False

        self.choose_expanded_mode(
            mode=True,
            locator=self.validation_loc["capacity_breakdown_options"])
        self.do_click(self.validation_loc["capacity_breakdown_pods"])
        self.take_screenshot()

        sample = TimeoutSampler(
            timeout=30,
            sleep=2,
            func=self.check_element_text,
            expected_text=pod_name,
        )
        if not sample.wait_for_func_status(result=True):
            logger.error(f"The pod {pod_name} not found on capacity_breakdown")
            res = False
        return res
Example #6
    def test_fio_with_block_storage(self):
        name = "test_workload"
        spec = self.pod_obj.data.get("spec")
        path = spec.get("containers")[0].get("volumeMounts")[0].get(
            "mountPath")
        work_load = "fio"
        storage_type = "fs"
        # few io parameters for Fio
        runtime = 10
        size = "200M"

        wl = workload.WorkLoad(name, path, work_load, storage_type,
                               self.pod_obj)
        assert wl.setup()
        io_params = templating.load_yaml(constants.FIO_IO_PARAMS_YAML)
        io_params["runtime"] = runtime
        io_params["size"] = size

        future_result = wl.run(**io_params)

        timeout = 1200
        sample = TimeoutSampler(timeout=timeout,
                                sleep=3,
                                func=future_result.done)
        assert sample.wait_for_func_status(result=True)

        try:
            logger.info(future_result.result())
        except exceptions.CommandFailed:
            logger.exception("FIO failed")
            raise
        except Exception:
            logger.exception("Found Exception")
            raise
Example #7
    def stop_powernodes_machines(self,
                                 powernode_machines,
                                 timeout=900,
                                 wait=True,
                                 force=True):
        """
        Stop PowerNode Machines

        Args:
            powernode_machines (list): PowerNode objects
            timeout (int): time in seconds to wait for node to reach 'not ready' state
            wait (bool): True if need to wait till the restarted node reaches timeout
                - for future use
            force (bool): True for PowerNode ungraceful power off, False for
                graceful PowerNode shutdown - for future use

        Raises:
            UnexpectedBehaviour: If PowerNode machine is still up

        """
        ocpversion = get_ocp_version("-")
        for node in powernode_machines:
            cmd = f"sudo virsh shutdown test-ocp{ocpversion}-{node.name}"
            result = exec_cmd(cmd)
            logger.info(f"Result of shutdown {result}")
            logger.info("Verifying node is down")
            ret = TimeoutSampler(
                timeout=timeout,
                sleep=3,
                func=self.verify_machine_is_down,
                node=node,
            )
            logger.info(ret)
            if not ret.wait_for_func_status(result=True):
                raise UnexpectedBehaviour("Node {node.name} is still Running")
Example #8
    def verify_ocs_operator_succeeded(self, timeout_install=300, sleep=20):
        """
        Verify OCS Installation

        Args:
            timeout_install (int): Time in seconds to wait
            sleep (int): Sampling time in seconds

        """
        self.navigate_operatorhub_page()
        self.navigate_installed_operators_page()

        self.do_send_keys(
            locator=self.dep_loc["search_ocs_install"],
            text="OpenShift Container Storage",
        )
        sample = TimeoutSampler(
            timeout=timeout_install,
            sleep=sleep,
            func=self.check_element_text,
            expected_text="Succeeded",
        )
        if not sample.wait_for_func_status(result=True):
            logger.error(
                f"OCS Installation status is not Succeeded after {timeout_install} seconds"
            )
            raise TimeoutExpiredError
Example #9
def wait_for_vr_state(state, namespace, timeout=300):
    """
    Wait for all VR resources to reach expected state in the given namespace

    Args:
        state (str): The VR state to check for (e.g. 'primary', 'secondary')
        namespace (str): the namespace of the VR resources
        timeout (int): time in seconds to wait for VR resources to be created
            or reach expected state

    Returns:
        bool: True if all VR are in expected state

    Raises:
        AssertionError: If VR resources are not in expected state

    """
    sample = TimeoutSampler(
        timeout=timeout, sleep=3, func=check_vr_state, state=state, namespace=namespace
    )
    assert sample.wait_for_func_status(
        result=True
    ), f"One or more VR haven't reached expected state {state} within the time limit."

    return True
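A minimal usage sketch for wait_for_vr_state; the namespace is hypothetical:

# Assumes the VRs in this namespace should become primary after a failover.
wait_for_vr_state(state="primary", namespace="busybox-workloads-1", timeout=600)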
Example #10
def wait_for_mirroring_status_ok(replaying_images=None, timeout=300):
    """
    Wait for mirroring status to reach health OK and expected number of replaying
    images for each of the ODF cluster

    Args:
        replaying_images (int): Expected number of images in replaying state
        timeout (int): time in seconds to wait for mirroring status reach OK

    Returns:
        bool: True if status contains expected health and states values

    Raises:
        AssertionError: In case of unexpected mirroring status

    """
    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        logger.info(
            f"Validating mirroring status on cluster {cluster.ENV_DATA['cluster_name']}"
        )
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_mirroring_status_ok,
            replaying_images=replaying_images,
        )
        assert sample.wait_for_func_status(result=True), (
            "The mirroring status does not have expected values within the time"
            f" limit on cluster {cluster.ENV_DATA['cluster_name']}"
        )

    return True
Example #11
    def deploy(self):
        """
        Deploy the benchmark-operator

        """
        log.info("Deploy the benchmark-operator project")
        try:
            run("make deploy", shell=True, check=True, cwd=self.dir)
        except Exception as ex:
            log.error(f"Failed to deploy benchmark operator : {ex}")

        log.info("Wait for the benchmark-operator deployment be available")
        try:
            cmd = f'wait --for=condition=available "{BMO_DEPLOYMENT}" -n {BMO_NAME} --timeout=300s'
            self.pod_obj.exec_oc_cmd(cmd)

            # At this point the benchmark operator pod is ready, but we need to
            # verify that all containers in the pod are ready (up to 30 sec.)
            sample = TimeoutSampler(timeout=30, sleep=3, func=self._is_ready)
            if not sample.wait_for_func_status(True):
                raise Exception("Not all the containers are ready")
        except Exception as ex:
            log.error(f"Failed to wait for benchmark operator : {ex}")

        log.info("the benchmark Operator is ready")
Example #12
    def wait_for_phase(self, phase, timeout=300, sleep=5):
        """
        Wait till phase of resource is the same as required one passed in
        the phase parameter.

        Args:
            phase (str): Desired phase of resource object
            timeout (int): Timeout in seconds to wait for desired phase
            sleep (int): Time in seconds to sleep between attempts

        Raises:
            ResourceInUnexpectedState: In case the resource is not in expected
                phase.
            NotSupportedFunctionError: If resource doesn't have phase!
            ResourceNameNotSpecifiedException: in case the name is not
                specified.

        """
        self.check_function_supported(self._has_phase)
        self.check_name_is_specified()
        sampler = TimeoutSampler(timeout, sleep, self.check_phase, phase=phase)
        if not sampler.wait_for_func_status(True):
            raise ResourceInUnexpectedState(
                f"Resource: {self.resource_name} is not in expected phase: "
                f"{phase}")
Example #13
    def start(self, node, timeout):
        """
        Start the given service using systemctl.

        Args:
            node (object): Node objects
            timeout (int): time in seconds to wait for service to start.

        Raises:
            UnexpectedBehaviour: If the service on the PowerNode machine is still not up
        """
        nodeip = self.nodes[node.name]
        cmd = f"ssh core@{nodeip} sudo systemctl start {self.service_name}.service"
        result = exec_cmd(cmd)
        logger.info(
            f"Result of start of service {self.service_name} is {result}")
        ret = TimeoutSampler(
            timeout=timeout,
            sleep=3,
            func=self.verify_service,
            node=node,
            action=ACTIVE,
        )
        if not ret.wait_for_func_status(result=True):
            raise UnexpectedBehaviour(
                "Service {self.service_name} on Node {node.name} is still not Running"
            )
Example #14
    def cluster_health_check(self, timeout=None):
        """
        Check overall cluster health.
        Relying on health reported by CephCluster.get()

        Args:
            timeout (int): in seconds. By default timeout value will be scaled
                based on number of ceph pods in the cluster. This is just a
                crude number. Its been observed that as the number of pods
                increases it takes more time for cluster's HEALTH_OK.

        Returns:
            bool: True if "HEALTH_OK"  else False

        Raises:
            CephHealthException: if cluster is not healthy
        """
        # Scale timeout only if user hasn't passed any value
        timeout = timeout or (10 * len(self.pods))
        sample = TimeoutSampler(timeout=timeout,
                                sleep=3,
                                func=self.is_health_ok)

        if not sample.wait_for_func_status(result=True):
            raise exceptions.CephHealthException("Cluster health is NOT OK")
        # This way of checking the health of different cluster entities and
        # raising only CephHealthException is not elegant.
        # TODO: add an attribute in CephHealthException, called "reason",
        # which should tell which exact cluster entity's health is not OK.
        expected_mon_count = self.mon_count
        expected_mds_count = self.mds_count

        self.scan_cluster()
        try:
            self.mon_health_check(expected_mon_count)
        except exceptions.MonCountException as e:
            logger.error(e)
            raise exceptions.CephHealthException("Cluster health is NOT OK")

        try:
            if not expected_mds_count:
                pass
            else:
                self.mds_health_check(expected_mds_count)
        except exceptions.MDSCountException as e:
            logger.error(e)
            raise exceptions.CephHealthException("Cluster health is NOT OK")

        # check noobaa health
        if not self.mcg_obj.status:
            raise exceptions.NoobaaHealthException("Cluster health is NOT OK")

        # TODO: OSD and MGR health check
        logger.info("Cluster HEALTH_OK")
        # This scan is for reconciliation of the *.count attributes,
        # because during the first scan in this function some of the
        # pods may not have been up yet, which would have set the counts too low
        self.scan_cluster()
        return True
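A minimal usage sketch for cluster_health_check; the cluster object construction is assumed to have happened elsewhere:

# `ceph_cluster` is assumed to be an already-constructed cluster object
# exposing cluster_health_check() as defined above.
try:
    ceph_cluster.cluster_health_check(timeout=120)
except exceptions.CephHealthException:
    logger.error("Ceph cluster did not report HEALTH_OK within 120 seconds")
    raise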
Example #15
    def verify_operator_succeeded(self,
                                  operator="OpenShift Container Storage",
                                  timeout_install=300,
                                  sleep=20):
        """
        Verify Operator Installation

        Args:
            operator (str): type of operator
            timeout_install (int): Time in seconds to wait
            sleep (int): Sampling time in seconds

        """
        self.search_operator_installed_operators_page(operator=operator)
        sample = TimeoutSampler(
            timeout=timeout_install,
            sleep=sleep,
            func=self.check_element_text,
            expected_text="Succeeded",
        )
        if not sample.wait_for_func_status(result=True):
            logger.error(
                f"{operator} Installation status is not Succeeded after {timeout_install} seconds"
            )
            raise TimeoutExpiredError
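A hedged usage sketch for verify_operator_succeeded; `validation_ui` is a hypothetical page object exposing the method, and the operator name and timeouts are just examples:

# Hypothetical caller; all values are assumptions.
validation_ui.verify_operator_succeeded(
    operator="OpenShift Container Storage", timeout_install=600, sleep=20
)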
Example #16
    def test_fio_with_block_storage(self):
        name = 'test_workload'
        spec = self.pod_obj.data.get('spec')
        path = (
            spec.get('containers')[0].get('volumeMounts')[0].get('mountPath'))
        work_load = 'fio'
        storage_type = 'fs'
        # few io parameters for Fio
        runtime = 10
        size = '200M'

        wl = workload.WorkLoad(name, path, work_load, storage_type,
                               self.pod_obj)
        assert wl.setup()
        io_params = templating.load_yaml(constants.FIO_IO_PARAMS_YAML)
        io_params['runtime'] = runtime
        io_params['size'] = size

        future_result = wl.run(**io_params)

        timeout = 1200
        sample = TimeoutSampler(timeout=timeout,
                                sleep=3,
                                func=future_result.done)
        assert sample.wait_for_func_status(result=True)

        try:
            logger.info(future_result.result())
        except exceptions.CommandFailed:
            logger.exception(f"FIO failed")
            raise
        except Exception:
            logger.exception(f"Found Exception")
            raise
Example #17
    def verify_disks_lso_attached(self, timeout=600, sleep=20):
        """
        Verify Disks Attached

        Args:
            timeout (int): Time in seconds to wait
            sleep (int): Sampling time in seconds

        """
        osd_size = config.ENV_DATA.get("device_size", defaults.DEVICE_SIZE)
        number_worker_nodes = get_worker_nodes()
        capacity = int(osd_size) * len(number_worker_nodes)
        if capacity >= 1024:
            capacity_str = str(
                capacity / 1024).rstrip("0").rstrip(".") + " TiB"
        else:
            capacity_str = str(capacity) + " GiB"
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=sleep,
            func=self.check_element_text,
            expected_text=capacity_str,
        )
        if not sample.wait_for_func_status(result=True):
            logger.error(f" after {timeout} seconds")
            raise TimeoutExpiredError
Example #18
    def stop(self, node, timeout):
        """
        Stop the given service using systemctl.

        Args:
            node (object): Node objects
            timeout (int): time in seconds to wait for service to stop.

        Raises:
            UnexpectedBehaviour: If service on PowerNode machine is still up
        """
        nodeip = self.nodes[node.name]
        cmd = (
            f"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@{self.bastion_ip} ssh core@{nodeip} "
            f"sudo systemctl stop {self.service_name}.service")
        if self.force:
            cmd += " -f"
        result = exec_cmd(cmd)
        logger.info(
            f"Result of shutdown {result}. Checking if service {self.service_name} went down."
        )
        ret = TimeoutSampler(
            timeout=timeout,
            sleep=3,
            func=self.verify_service,
            node=node,
            action=INACTIVE,
        )
        if not ret.wait_for_func_status(result=True):
            raise UnexpectedBehaviour(
                f"Service {self.service_name} on Node {node.name} is still Running"
            )
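A hedged sketch of pairing the stop and start helpers above to restart a node service; `svc` and `node` are hypothetical objects used only for illustration:

# `svc` is assumed to wrap a systemd service on the node and expose stop()/start().
svc.stop(node=node, timeout=300)   # waits until the service reports INACTIVE
svc.start(node=node, timeout=300)  # waits until the service reports ACTIVE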
Example #19
    def verify_nodes_added(self, hosts):
        """
        Verify RHEL workers are added

        Args:
             hosts (list): list of aws private hostnames

        Raises:
            FailedToAddNodeException: if node addition failed

        """
        timeout = 600
        ocp_obj = ocp.OCP(kind="node")
        node_info = ocp_obj.get()
        for i in range(len(hosts)):
            for entry in node_info["items"]:
                for each in entry["status"]["addresses"]:
                    if each["type"] == "Hostname":
                        if each["address"] in hosts:
                            logging.info(
                                f"Checking status for {each['address']}")
                            sample = TimeoutSampler(timeout, 3,
                                                    self.get_ready_status,
                                                    entry)
                            try:
                                assert sample.wait_for_func_status(result=True)
                            except AssertionError:
                                raise exceptions.FailedToAddNodeException(
                                    "Failed to add RHEL node")
Example #20
    def destroy(self, log_level="DEBUG"):
        """
        Destroy OCP cluster specific

        Args:
            log_level (str): log level openshift-installer (default: DEBUG)

        """
        cluster_details = ocm.get_cluster_details(self.cluster_name)
        cluster_id = cluster_details.get("id")
        delete_status = rosa.destroy_appliance_mode_cluster(self.cluster_name)
        if not delete_status:
            ocm.destroy_cluster(self.cluster_name)
        logger.info("Waiting for ROSA cluster to be uninstalled")
        sample = TimeoutSampler(
            timeout=7200,
            sleep=30,
            func=self.cluster_present,
            cluster_name=self.cluster_name,
        )
        if not sample.wait_for_func_status(result=False):
            err_msg = f"Failed to delete {self.cluster_name}"
            logger.error(err_msg)
            raise TimeoutExpiredError(err_msg)
        rosa.delete_operator_roles(cluster_id)
        rosa.delete_oidc_provider(cluster_id)
Example #21
    def delete(self):
        log.info(f"Cleaning up backingstore {self.name}")

        if self.method == "oc":
            OCP(kind="backingstore",
                namespace=config.ENV_DATA["cluster_namespace"]).delete(
                    resource_name=self.name)
        elif self.method == "cli":

            def _cli_deletion_flow():
                try:
                    self.mcg_obj.exec_mcg_cmd(
                        f"backingstore delete {self.name}")
                    return True
                except CommandFailed as e:
                    if "being used by one or more buckets" in str(e).lower():
                        log.warning(
                            f"Deletion of {self.name} failed because it's being used by a bucket. "
                            "Retrying...")
                        return False

            sample = TimeoutSampler(
                timeout=120,
                sleep=20,
                func=_cli_deletion_flow,
            )
            if not sample.wait_for_func_status(result=True):
                log.error(f"Failed to {self.name}")
                raise TimeoutExpiredError

        log.info(
            f"Verifying whether backingstore {self.name} exists after deletion"
        )
        bs_deleted_successfully = False

        try:
            if self.method == "oc":
                OCP(
                    kind="backingstore",
                    namespace=config.ENV_DATA["cluster_namespace"],
                    resource_name=self.name,
                ).get()
            elif self.method == "cli":
                self.mcg_obj.exec_mcg_cmd(f"backingstore status {self.name}")

        except CommandFailed as e:
            if "Not Found" in str(e) or "NotFound" in str(e):
                bs_deleted_successfully = True
            else:
                raise

        assert (bs_deleted_successfully
                ), f"Backingstore {self.name} was not deleted successfully"

        if "pv-backingstore" in self.name.lower():
            log.info(
                f"Waiting for backingstore {self.name} resources to be deleted"
            )
            wait_for_pv_backingstore_resource_deleted(self.name)
Example #22
    def wait_for_peer_ready_status(self):
        logger.info("Waiting for PeerReady status to be True")
        sample = TimeoutSampler(timeout=300,
                                sleep=10,
                                func=self.get_peer_ready_status)
        assert sample.wait_for_func_status(
            result=True
        ), "PeerReady status is not true, failover or relocate action can not be performed"
Example #23
    def finalizer():
        must_gather_pods = self.ocs.get_pods(label_selector='app=must-gather')
        logger.info(f"must_gather_pods: {must_gather_pods} ")
        sample_pods = TimeoutSampler(
            timeout=30, sleep=3, func=check_for_must_gather_pod,
        )
        sample_namespace = TimeoutSampler(
            timeout=30, sleep=3, func=check_for_must_gather_project,
        )
        if sample_pods.wait_for_func_status(result=True):
            for must_gather_pod in must_gather_pods:
                self.ocp_obj.wait_for_delete(resource_name=must_gather_pod)
                logger.info(f"deleted pods: {must_gather_pods}")
        if not sample_namespace.wait_for_func_status(result=False):
            must_gather_namespace = check_for_must_gather_project()
            logger.info(f"namespace to delete: {must_gather_namespace}")
            self.ocp_obj.wait_for_delete(resource_name=must_gather_namespace)
Example #24
    def stop_baremetal_machines(self, baremetal_machine, force=True):
        """
        Stop Baremetal Machines

        Args:
            baremetal_machine (list): BM objects
            force (bool): True for BM ungraceful power off, False for
                graceful BM shutdown

        Raises:
            UnexpectedBehaviour: If baremetal machine is still up

        """
        for node in baremetal_machine:
            if force:
                if self.mgmt_details[node.name]:
                    ipmi_ctx = self.get_ipmi_ctx(
                        host=self.mgmt_details[node.name]["mgmt_console"],
                        user=self.mgmt_details[node.name]["mgmt_username"],
                        password=self.mgmt_details[node.name]["mgmt_password"],
                    )
                    logger.info(f"Powering Off {node.name}")
                    ipmi_ctx.chassis_control_power_down()
            else:
                ocp = OCP(kind="node")
                ocp.exec_oc_debug_cmd(
                    node=node.name, cmd_list=["shutdown now"], timeout=60
                )
                if self.mgmt_details[node.name]:
                    ipmi_ctx = self.get_ipmi_ctx(
                        host=self.mgmt_details[node.name]["mgmt_console"],
                        user=self.mgmt_details[node.name]["mgmt_username"],
                        password=self.mgmt_details[node.name]["mgmt_password"],
                    )
                    for status in TimeoutSampler(
                        600, 5, self.get_power_status, ipmi_ctx
                    ):
                        logger.info(
                            f"Waiting for Baremetal Machine {node.name} to power off"
                            f"Current Baremetal status: {status}"
                        )
                        if status == VM_POWERED_OFF:
                            logger.info(
                                f"Baremetal Machine {node.name} reached poweredOff status"
                            )
                            break
        logger.info("Verifing machine is down")
        ret = TimeoutSampler(
            timeout=300,
            sleep=3,
            func=self.verify_machine_is_down,
            node=node,
        )
        logger.info(ret)
        if not ret.wait_for_func_status(result=True):
            raise UnexpectedBehaviour("Machine {node.name} is still Running")
Example #25
    def test_rook_ceph_operator_log_type(self):
        """
        Test the ability to change the log level in rook-ceph operator dynamically
        without rook-ceph operator pod restart.

        """
        set_configmap_log_level_rook_ceph_operator(value="DEBUG")
        last_log_date_time_obj = get_last_log_time_date()

        log.info("Respin OSD pod")
        osd_pod_objs = get_osd_pods()
        osd_pod_obj = random.choice(osd_pod_objs)
        osd_pod_obj.delete()

        sample = TimeoutSampler(
            timeout=400,
            sleep=20,
            func=check_osd_log_exist_on_rook_ceph_operator_pod,
            last_log_date_time_obj=last_log_date_time_obj,
            expected_strings=["D |", "osd"],
        )
        if not sample.wait_for_func_status(result=True):
            raise ValueError("OSD DEBUG Log does not exist")

        set_configmap_log_level_rook_ceph_operator(value="INFO")
        last_log_date_time_obj = get_last_log_time_date()

        log.info("Respin OSD pod")
        osd_pod_objs = get_osd_pods()
        osd_pod_obj = random.choice(osd_pod_objs)
        osd_pod_obj.delete()

        sample = TimeoutSampler(
            timeout=400,
            sleep=20,
            func=check_osd_log_exist_on_rook_ceph_operator_pod,
            last_log_date_time_obj=last_log_date_time_obj,
            expected_strings=["I |", "osd"],
            unexpected_strings=["D |"],
        )
        if not sample.wait_for_func_status(result=True):
            raise ValueError(
                "OSD INFO Log does not exist or DEBUG Log exist on INFO mode")
Example #26
    def _deploy_es(self):
        """
        Deploying the Elasticsearch server

        """

        # Creating PVC for the elasticsearch server and wait until it bound
        log.info("Creating 10 GiB PVC for the ElasticSearch cluster on")
        try:
            self.pvc_obj = create_pvc(
                sc_name=self.args.get("sc") or constants.CEPHBLOCKPOOL_SC,
                namespace=self.namespace,
                pvc_name="elasticsearch-data-quickstart-es-default-0",
                access_mode=constants.ACCESS_MODE_RWO,
                size="10Gi",
            )

            # Make sure the PVC bound, or delete it and raise exception
            wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
        except ResourceWrongStatusException:
            log.error("The PVC couldn't created")
            return False

        self.pvc_obj.reload()

        log.info("Deploy the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        sample = TimeoutSampler(
            timeout=300,
            sleep=10,
            func=self._pod_is_found,
            pattern="quickstart-es-default",
        )
        if not sample.wait_for_func_status(True):
            log.error("The ElasticSearch pod deployment Failed")
            return False

        self.espod = get_pod_name_by_pattern("quickstart-es-default",
                                             self.namespace)[0]
        log.info(f"The ElasticSearch pod {self.espod} Started")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to Run")
        if not es_pod.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=self.espod,
                sleep=30,
                timeout=600,
        ):
            log.error("TThe ElasticSearch pod is not running !")
            return False
        else:
            log.info("Elastic Search is ready !!!")
            return True
Example #27
    def _create_nss(method, nss_dict):
        """
        Tracks creation and cleanup of all the namespace stores that were created in the current scope

        Args:
            method (str): String for selecting method of backing store creation (CLI/OC)
            nss_dict (dict): Dictionary containing storage provider as key and a list of tuples
                as value.
                Namespace store dictionary examples - 'CloudName': [(amount, region), (amount, region)]
                i.e. - 'aws': [(3, us-west-1),(2, eu-west-2)]

        Returns:
            list: A list of the NamespaceStore objects created by the factory in the current scope

        """
        current_call_created_nss = []
        for platform, nss_lst in nss_dict.items():
            for nss_tup in nss_lst:
                # Create the actual namespace resource
                nss_name = create_unique_resource_name(constants.MCG_NSS,
                                                       platform)

                target_bucket_name = cmdMap[method.lower()](nss_name,
                                                            nss_tup[1],
                                                            cld_mgr,
                                                            cloud_uls_factory,
                                                            platform)

                # TODO: Check platform exists in endpointMap

                sample = TimeoutSampler(
                    timeout=60,
                    sleep=5,
                    func=mcg_obj.check_ns_resource_validity,
                    ns_resource_name=nss_name,
                    target_bucket_name=target_bucket_name,
                    endpoint=endpointMap[platform],
                )
                if not sample.wait_for_func_status(result=True):
                    log.error(f"{nss_name} failed its verification check")
                    raise TimeoutExpiredError

                nss_obj = NamespaceStore(
                    name=nss_name,
                    method=method.lower(),
                    mcg_obj=mcg_obj,
                    uls_name=target_bucket_name,
                )

                nss_obj.verify_health()

                created_nss.append(nss_obj)
                current_call_created_nss.append(nss_obj)

        return current_call_created_nss
Example #28
    def delete(self):
        """
        Deletes the current namespacestore by using OC/CLI commands

        """
        log.info(f"Cleaning up namespacestore {self.name}")

        if self.method == "oc":
            OCP(
                kind="namespacestore", namespace=config.ENV_DATA["cluster_namespace"]
            ).delete(resource_name=self.name)

        elif self.method == "cli":

            def _cli_deletion_flow():
                try:
                    self.mcg_obj.exec_mcg_cmd(f"namespacestore delete {self.name}")
                    return True
                except CommandFailed as e:
                    if "being used by one or more buckets" in str(e).lower():
                        log.warning(
                            f"Deletion of {self.name} failed because it's being used by a bucket. "
                            "Retrying..."
                        )
                    else:
                        log.warning(f"Deletion of self.name failed. Error:\n{str(e)}")
                    return False

            sample = TimeoutSampler(
                timeout=120,
                sleep=20,
                func=_cli_deletion_flow,
            )
            if not sample.wait_for_func_status(result=True):
                log.error(f"Failed to {self.name}")
                raise TimeoutExpiredError

        log.info(f"Verifying whether namespacestore {self.name} exists after deletion")
        ns_deleted_successfully = False

        if self.method == "oc":
            OCP(
                kind="namespacestore",
                namespace=config.ENV_DATA["cluster_namespace"],
                resource_name=self.name,
            ).get()
        elif self.method == "cli":
            if self.name not in self.mcg_obj.exec_mcg_cmd("namespacestore list"):
                ns_deleted_successfully = True

        assert (
            ns_deleted_successfully
        ), f"Namespacestore {self.name} was not deleted successfully"
Example #29
    def cluster_health_check(self, timeout=300):
        """
        This would be a comprehensive cluster health check
        which includes checking pods, external ceph cluster health.
        raise exceptions.CephHealthException("Cluster health is NOT OK")
        """
        sample = TimeoutSampler(timeout=timeout, sleep=3, func=self.is_health_ok)
        if not sample.wait_for_func_status(result=True):
            raise exceptions.CephHealthException("Cluster health is NOT OK")

        self.wait_for_noobaa_health_ok()
        self.validate_pvc()
Example #30
    def test_osd_heap_profile(self):
        """
        Generate heap profile dump file for OSDs and verify whether the file
        is created on '/var/log/ceph/'

        """
        strings_err = ["error", "fail"]
        osd_pods = get_osd_pods()
        osd_id = str(random.randint(0, len(osd_pods) - 1))

        log.info(f"Start heap profiler for osd-{osd_id}")
        pod_tool = get_ceph_tools_pod()
        out = pod_tool.exec_cmd_on_pod(
            command=f"ceph tell osd.{osd_id} heap start_profiler",
            out_yaml_format=False)
        logging.info(f"command output:{out}")
        for string_err in strings_err:
            assert (string_err not in out.lower()
                    ), f"{string_err} on the output command {out}"

        logging.info("Sleep 10 sec, for running heap profiler")
        time.sleep(10)

        log.info("Dump heap profile")
        out = pod_tool.exec_sh_cmd_on_pod(
            command=f"ceph tell osd.{osd_id} heap dump")
        logging.info(out)
        for string_err in strings_err:
            assert (string_err not in out.lower()
                    ), f"{string_err} on the output command {out}"

        log.info(f"Get osd-{osd_id} pod object")
        for osd_pod in osd_pods:
            if get_osd_pod_id(osd_pod) == osd_id:
                osd_pod_profile = osd_pod

        osd_profile_str = f"osd.{osd_id}.profile"
        log.info(f"Verify {osd_profile_str} log exist on /var/log/ceph/")
        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=self.verify_output_command_osd_pod,
            command="ls -ltr /var/log/ceph/",
            pod_obj=osd_pod_profile,
            str_to_check=osd_profile_str,
        )
        if not sample.wait_for_func_status(result=True):
            log.error(f"{osd_profile_str} log does not exist on /var/log/ceph")
            raise ValueError(
                f"{osd_profile_str} log does not exist on /var/log/ceph")

        log.info(f"osd.{osd_id}.profile log exist on /var/log/ceph")