def wait_for_replication_resources_deletion(namespace, timeout, check_state=True):
    """
    Wait for replication resources to be deleted

    Args:
        namespace (str): the namespace of the resources
        timeout (int): time in seconds to wait for resources to reach expected
            state or to be deleted
        check_state (bool): True for checking resources state before deletion, False otherwise

    Raises:
        TimeoutExpiredError: In case replication resources are not deleted

    """
    if check_state:
        logger.info("Waiting for all VRs to reach secondary state")
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_vr_state,
            state="secondary",
            namespace=namespace,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = "One or more VR haven't reached expected state secondary within the time limit."
            logger.error(error_msg)
            raise TimeoutExpiredError(error_msg)

        logger.info("Waiting for VRG to reach secondary state")
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_vrg_state,
            state="secondary",
            namespace=namespace,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = (
                "VRG hasn't reached expected state secondary within the time limit."
            )
            logger.error(error_msg)
            raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for VRG to be deleted")
    sample = TimeoutSampler(
        timeout=timeout, sleep=5, func=check_vrg_existence, namespace=namespace
    )
    if not sample.wait_for_func_status(result=False):
        error_msg = "VRG resource not deleted"
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for all VRs to be deleted")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=get_vr_count,
        namespace=namespace,
    )
    sample.wait_for_func_value(0)
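# All of the helpers in this collection poll through ocs_ci's TimeoutSampler. The class
# below is a minimal stand-in, written only to illustrate the interface these call sites
# rely on (timeout/sleep/func plus extra arguments, wait_for_func_status(result=...),
# wait_for_func_value(value), and plain iteration). It is an assumption for illustration;
# the real ocs_ci implementation differs in details such as the exception type it raises.
import time


class SimpleTimeoutSampler:
    def __init__(self, timeout, sleep, func, *func_args, **func_kwargs):
        self.timeout = timeout
        self.sleep = sleep
        self.func = func
        self.func_args = func_args
        self.func_kwargs = func_kwargs

    def __iter__(self):
        # Yield successive results of func until the timeout window closes.
        deadline = time.time() + self.timeout
        while time.time() < deadline:
            yield self.func(*self.func_args, **self.func_kwargs)
            time.sleep(self.sleep)

    def wait_for_func_status(self, result):
        # True as soon as func returns the expected truth value, False on timeout.
        return any(sample == result for sample in self)

    def wait_for_func_value(self, value):
        # Block until func returns the expected value; raise if the timeout expires first.
        for sample in self:
            if sample == value:
                return
        raise TimeoutError(f"func did not return {value} within {self.timeout}s")


# Example: poll a counter until it reaches zero, sampling every 0.1 s for at most 1 s.
if __name__ == "__main__":
    pending = [3]

    def drain():
        pending[0] -= 1
        return pending[0]

    SimpleTimeoutSampler(timeout=1, sleep=0.1, func=drain).wait_for_func_value(0)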
def detach_and_delete_vols(self, volumes):
    """
    Detach and delete volumes from the list

    Args:
        volumes (list): list of Volume objects

    """
    for v in volumes:
        if v.status == "in-use":
            v.detach()
            v.get()
            sample = TimeoutSampler(
                100,
                5,
                self.check_expected_vol_status,
                vol=v,
                expected_state="available",
            )
            if not sample.wait_for_func_status(True):
                logger.error(f"Volume {v.name} failed to detach")
                raise exceptions.PSIVolumeNotInExpectedState()

        v.delete()
        sample = TimeoutSampler(100, 5, self.check_vol_deleted, vol=v)
        if not sample.wait_for_func_status(True):
            logger.error(f"Failed to delete Volume {v.name}")
            raise exceptions.PSIVolumeDeletionFailed()
def verify_obc(self):
    """
    OBC verification from external cluster perspective,
    we will check 2 OBCs

    """
    sample = TimeoutSampler(300, 5, self.ceph_cluster.noobaa_health_check)
    sample.wait_for_func_status(True)
def wait_for_replication_resources_creation(vr_count, namespace, timeout):
    """
    Wait for replication resources to be created

    Args:
        vr_count (int): Expected number of VR resources
        namespace (str): the namespace of the VR resources
        timeout (int): time in seconds to wait for VR resources to be created
            or reach expected state

    Raises:
        TimeoutExpiredError: In case replication resources not created

    """
    logger.info("Waiting for VRG to be created")
    sample = TimeoutSampler(
        timeout=timeout, sleep=5, func=check_vrg_existence, namespace=namespace
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "VRG resource is not created"
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info(f"Waiting for {vr_count} VRs to be created")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=get_vr_count,
        namespace=namespace,
    )
    sample.wait_for_func_value(vr_count)

    logger.info(f"Waiting for {vr_count} VRs to reach primary state")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=check_vr_state,
        state="primary",
        namespace=namespace,
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "One or more VR haven't reached expected state primary within the time limit."
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)

    logger.info("Waiting for VRG to reach primary state")
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=5,
        func=check_vrg_state,
        state="primary",
        namespace=namespace,
    )
    if not sample.wait_for_func_status(result=True):
        error_msg = "VRG hasn't reached expected state primary within the time limit."
        logger.error(error_msg)
        raise TimeoutExpiredError(error_msg)
def check_capacity_breakdown(self, project_name, pod_name):
    """
    Check Capacity Breakdown

    Args:
        project_name (str): The name of the project
        pod_name (str): The name of the pod

    Returns:
        bool: True if project_name and pod_name exist on capacity_breakdown, False otherwise

    """
    self.navigate_overview_page()
    if self.ocp_version == "4.7":
        self.do_click(self.validation_loc["persistent_storage_tab"])
    self.choose_expanded_mode(
        mode=True, locator=self.validation_loc["capacity_breakdown_options"]
    )
    self.do_click(self.validation_loc["capacity_breakdown_projects"])
    self.take_screenshot()
    res = True
    sample = TimeoutSampler(
        timeout=30,
        sleep=2,
        func=self.check_element_text,
        expected_text=project_name,
    )
    if not sample.wait_for_func_status(result=True):
        logger.error(
            f"The project {project_name} was not found in the capacity breakdown"
        )
        res = False

    self.choose_expanded_mode(
        mode=True, locator=self.validation_loc["capacity_breakdown_options"]
    )
    self.do_click(self.validation_loc["capacity_breakdown_pods"])
    self.take_screenshot()
    sample = TimeoutSampler(
        timeout=30,
        sleep=2,
        func=self.check_element_text,
        expected_text=pod_name,
    )
    if not sample.wait_for_func_status(result=True):
        logger.error(f"The pod {pod_name} was not found in the capacity breakdown")
        res = False
    return res
def test_fio_with_block_storage(self): name = "test_workload" spec = self.pod_obj.data.get("spec") path = spec.get("containers")[0].get("volumeMounts")[0].get( "mountPath") work_load = "fio" storage_type = "fs" # few io parameters for Fio runtime = 10 size = "200M" wl = workload.WorkLoad(name, path, work_load, storage_type, self.pod_obj) assert wl.setup() io_params = templating.load_yaml(constants.FIO_IO_PARAMS_YAML) io_params["runtime"] = runtime io_params["size"] = size future_result = wl.run(**io_params) timeout = 1200 sample = TimeoutSampler(timeout=timeout, sleep=3, func=future_result.done) assert sample.wait_for_func_status(result=True) try: logger.info(future_result.result()) except exceptions.CommandFailed: logger.exception("FIO failed") raise except Exception: logger.exception("Found Exception") raise
def stop_powernodes_machines(
    self, powernode_machines, timeout=900, wait=True, force=True
):
    """
    Stop PowerNode Machines

    Args:
        powernode_machines (list): PowerNode objects
        timeout (int): time in seconds to wait for node to reach 'not ready' state
        wait (bool): True if need to wait till the restarted node reaches timeout
            - for future use
        force (bool): True for PowerNode ungraceful power off, False for graceful
            PowerNode shutdown - for future use

    Raises:
        UnexpectedBehaviour: If PowerNode machine is still up

    """
    ocpversion = get_ocp_version("-")
    for node in powernode_machines:
        cmd = f"sudo virsh shutdown test-ocp{ocpversion}-{node.name}"
        result = exec_cmd(cmd)
        logger.info(f"Result of shutdown {result}")

        logger.info("Verifying node is down")
        ret = TimeoutSampler(
            timeout=timeout,
            sleep=3,
            func=self.verify_machine_is_down,
            node=node,
        )
        logger.info(ret)
        if not ret.wait_for_func_status(result=True):
            raise UnexpectedBehaviour(f"Node {node.name} is still Running")
def verify_ocs_operator_succeeded(self, timeout_install=300, sleep=20):
    """
    Verify OCS Installation

    Args:
        timeout_install (int): Time in seconds to wait
        sleep (int): Sampling time in seconds

    """
    self.navigate_operatorhub_page()
    self.navigate_installed_operators_page()
    self.do_send_keys(
        locator=self.dep_loc["search_ocs_install"],
        text="OpenShift Container Storage",
    )
    sample = TimeoutSampler(
        timeout=timeout_install,
        sleep=sleep,
        func=self.check_element_text,
        expected_text="Succeeded",
    )
    if not sample.wait_for_func_status(result=True):
        logger.error(
            f"OCS Installation status is not Succeeded after {timeout_install} seconds"
        )
        raise TimeoutExpiredError
def wait_for_vr_state(state, namespace, timeout=300):
    """
    Wait for all VR resources to reach expected state in the given namespace

    Args:
        state (str): The VR state to check for (e.g. 'primary', 'secondary')
        namespace (str): the namespace of the VR resources
        timeout (int): time in seconds to wait for VR resources to be created
            or reach expected state

    Returns:
        bool: True if all VR are in expected state

    Raises:
        AssertionError: If VR resources are not in expected state

    """
    sample = TimeoutSampler(
        timeout=timeout, sleep=3, func=check_vr_state, state=state, namespace=namespace
    )
    assert sample.wait_for_func_status(
        result=True
    ), f"One or more VR haven't reached expected state {state} within the time limit."
    return True
def wait_for_mirroring_status_ok(replaying_images=None, timeout=300):
    """
    Wait for mirroring status to reach health OK and expected number of replaying
    images for each of the ODF clusters

    Args:
        replaying_images (int): Expected number of images in replaying state
        timeout (int): time in seconds to wait for mirroring status to reach OK

    Returns:
        bool: True if status contains expected health and states values

    Raises:
        AssertionError: In case of unexpected mirroring status

    """
    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        logger.info(
            f"Validating mirroring status on cluster {cluster.ENV_DATA['cluster_name']}"
        )
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_mirroring_status_ok,
            replaying_images=replaying_images,
        )
        assert sample.wait_for_func_status(result=True), (
            "The mirroring status does not have expected values within the time"
            f" limit on cluster {cluster.ENV_DATA['cluster_name']}"
        )
    return True
def deploy(self):
    """
    Deploy the benchmark-operator

    """
    log.info("Deploy the benchmark-operator project")
    try:
        run("make deploy", shell=True, check=True, cwd=self.dir)
    except Exception as ex:
        log.error(f"Failed to deploy benchmark operator : {ex}")

    log.info("Wait for the benchmark-operator deployment to be available")
    try:
        cmd = f'wait --for=condition=available "{BMO_DEPLOYMENT}" -n {BMO_NAME} --timeout=300s'
        self.pod_obj.exec_oc_cmd(cmd)
        # At this point the benchmark operator pod is ready, but we need to
        # verify that all containers in the pod are ready (up to 30 sec.)
        sample = TimeoutSampler(timeout=30, sleep=3, func=self._is_ready)
        if not sample.wait_for_func_status(True):
            raise Exception("Not all the containers are ready")
    except Exception as ex:
        log.error(f"Failed to wait for benchmark operator : {ex}")

    log.info("The benchmark-operator is ready")
def wait_for_phase(self, phase, timeout=300, sleep=5):
    """
    Wait till phase of resource is the same as required one passed in
    the phase parameter.

    Args:
        phase (str): Desired phase of resource object
        timeout (int): Timeout in seconds to wait for desired phase
        sleep (int): Time in seconds to sleep between attempts

    Raises:
        ResourceInUnexpectedState: In case the resource is not in expected phase.
        NotSupportedFunctionError: If resource doesn't have phase!
        ResourceNameNotSpecifiedException: In case the name is not specified.

    """
    self.check_function_supported(self._has_phase)
    self.check_name_is_specified()
    sampler = TimeoutSampler(timeout, sleep, self.check_phase, phase=phase)
    if not sampler.wait_for_func_status(True):
        raise ResourceInUnexpectedState(
            f"Resource: {self.resource_name} is not in expected phase: {phase}"
        )
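# Hypothetical call site for wait_for_phase() above. The pvc_obj resource and the
# "Bound" phase are illustrative assumptions, not taken from the snippet itself.
#
#     try:
#         pvc_obj.wait_for_phase(phase="Bound", timeout=120, sleep=5)
#     except ResourceInUnexpectedState:
#         logger.error(f"{pvc_obj.resource_name} never reached the Bound phase")
#         raise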
def start(self, node, timeout):
    """
    Start the given service using systemctl.

    Args:
        node (object): Node object
        timeout (int): time in seconds to wait for service to start.

    Raises:
        UnexpectedBehaviour: If service on PowerNode machine is still not up

    """
    nodeip = self.nodes[node.name]
    cmd = f"ssh core@{nodeip} sudo systemctl start {self.service_name}.service"
    result = exec_cmd(cmd)
    logger.info(f"Result of start of service {self.service_name} is {result}")
    ret = TimeoutSampler(
        timeout=timeout,
        sleep=3,
        func=self.verify_service,
        node=node,
        action=ACTIVE,
    )
    if not ret.wait_for_func_status(result=True):
        raise UnexpectedBehaviour(
            f"Service {self.service_name} on Node {node.name} is still not Running"
        )
def cluster_health_check(self, timeout=None):
    """
    Check overall cluster health.
    Relying on health reported by CephCluster.get()

    Args:
        timeout (int): in seconds. By default timeout value will be scaled
            based on number of ceph pods in the cluster. This is just a
            crude number. It's been observed that as the number of pods
            increases it takes more time for cluster's HEALTH_OK.

    Returns:
        bool: True if "HEALTH_OK" else False

    Raises:
        CephHealthException: if cluster is not healthy

    """
    # Scale timeout only if user hasn't passed any value
    timeout = timeout or (10 * len(self.pods))
    sample = TimeoutSampler(timeout=timeout, sleep=3, func=self.is_health_ok)

    if not sample.wait_for_func_status(result=True):
        raise exceptions.CephHealthException("Cluster health is NOT OK")
    # This way of checking health of different cluster entities and
    # raising only CephHealthException is not elegant.
    # TODO: add an attribute in CephHealthException, called "reason"
    # which should tell because of which exact cluster entity health
    # is not ok ?

    expected_mon_count = self.mon_count
    expected_mds_count = self.mds_count

    self.scan_cluster()
    try:
        self.mon_health_check(expected_mon_count)
    except exceptions.MonCountException as e:
        logger.error(e)
        raise exceptions.CephHealthException("Cluster health is NOT OK")

    try:
        if expected_mds_count:
            self.mds_health_check(expected_mds_count)
    except exceptions.MDSCountException as e:
        logger.error(e)
        raise exceptions.CephHealthException("Cluster health is NOT OK")

    # check noobaa health
    if not self.mcg_obj.status:
        raise exceptions.NoobaaHealthException("Cluster health is NOT OK")

    # TODO: OSD and MGR health check
    logger.info("Cluster HEALTH_OK")
    # This scan is for reconciliation on *.count
    # because during first scan in this function some of the
    # pods may not be up and would have set count to lesser number
    self.scan_cluster()
    return True
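# Worked example of the default timeout scaling in cluster_health_check() above: when no
# explicit timeout is passed, the wait grows at 10 seconds per ceph pod, so a cluster
# running 30 ceph pods (an illustrative count) is given up to 300 seconds to reach HEALTH_OK.
ceph_pod_count = 30
default_timeout = 10 * ceph_pod_count
assert default_timeout == 300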
def verify_operator_succeeded(
    self, operator="OpenShift Container Storage", timeout_install=300, sleep=20
):
    """
    Verify Operator Installation

    Args:
        operator (str): type of operator
        timeout_install (int): Time in seconds to wait
        sleep (int): Sampling time in seconds

    """
    self.search_operator_installed_operators_page(operator=operator)
    sample = TimeoutSampler(
        timeout=timeout_install,
        sleep=sleep,
        func=self.check_element_text,
        expected_text="Succeeded",
    )
    if not sample.wait_for_func_status(result=True):
        logger.error(
            f"{operator} Installation status is not Succeeded after {timeout_install} seconds"
        )
        raise TimeoutExpiredError
def verify_disks_lso_attached(self, timeout=600, sleep=20):
    """
    Verify Disks Attached

    Args:
        timeout (int): Time in seconds to wait
        sleep (int): Sampling time in seconds

    """
    osd_size = config.ENV_DATA.get("device_size", defaults.DEVICE_SIZE)
    number_worker_nodes = get_worker_nodes()
    capacity = int(osd_size) * len(number_worker_nodes)
    if capacity >= 1024:
        capacity_str = str(capacity / 1024).rstrip("0").rstrip(".") + " TiB"
    else:
        capacity_str = str(capacity) + " GiB"
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=sleep,
        func=self.check_element_text,
        expected_text=capacity_str,
    )
    if not sample.wait_for_func_status(result=True):
        logger.error(
            f"Expected capacity {capacity_str} not found after {timeout} seconds"
        )
        raise TimeoutExpiredError
def stop(self, node, timeout):
    """
    Stop the given service using systemctl.

    Args:
        node (object): Node object
        timeout (int): time in seconds to wait for service to stop.

    Raises:
        UnexpectedBehaviour: If service on PowerNode machine is still up

    """
    nodeip = self.nodes[node.name]
    cmd = (
        f"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "
        f"root@{self.bastion_ip} ssh core@{nodeip} "
        f"sudo systemctl stop {self.service_name}.service"
    )
    if self.force:
        cmd += " -f"
    result = exec_cmd(cmd)
    logger.info(
        f"Result of shutdown {result}. Checking if service {self.service_name} went down."
    )
    ret = TimeoutSampler(
        timeout=timeout,
        sleep=3,
        func=self.verify_service,
        node=node,
        action=INACTIVE,
    )
    if not ret.wait_for_func_status(result=True):
        raise UnexpectedBehaviour(
            f"Service {self.service_name} on Node {node.name} is still Running"
        )
def verify_nodes_added(self, hosts):
    """
    Verify RHEL workers are added

    Args:
        hosts (list): list of aws private hostnames

    Raises:
        FailedToAddNodeException: if node addition failed

    """
    timeout = 600
    ocp_obj = ocp.OCP(kind="node")
    node_info = ocp_obj.get()
    for i in range(len(hosts)):
        for entry in node_info["items"]:
            for each in entry["status"]["addresses"]:
                if each["type"] == "Hostname":
                    if each["address"] in hosts:
                        logging.info(f"Checking status for {each['address']}")
                        sample = TimeoutSampler(
                            timeout, 3, self.get_ready_status, entry
                        )
                        try:
                            assert sample.wait_for_func_status(result=True)
                        except AssertionError:
                            raise exceptions.FailedToAddNodeException(
                                "Failed to add RHEL node"
                            )
def destroy(self, log_level="DEBUG"): """ Destroy OCP cluster specific Args: log_level (str): log level openshift-installer (default: DEBUG) """ cluster_details = ocm.get_cluster_details(self.cluster_name) cluster_id = cluster_details.get("id") delete_status = rosa.destroy_appliance_mode_cluster(self.cluster_name) if not delete_status: ocm.destroy_cluster(self.cluster_name) logger.info("Waiting for ROSA cluster to be uninstalled") sample = TimeoutSampler( timeout=7200, sleep=30, func=self.cluster_present, cluster_name=self.cluster_name, ) if not sample.wait_for_func_status(result=False): err_msg = f"Failed to delete {self.cluster_name}" logger.error(err_msg) raise TimeoutExpiredError(err_msg) rosa.delete_operator_roles(cluster_id) rosa.delete_oidc_provider(cluster_id)
def delete(self): log.info(f"Cleaning up backingstore {self.name}") if self.method == "oc": OCP(kind="backingstore", namespace=config.ENV_DATA["cluster_namespace"]).delete( resource_name=self.name) elif self.method == "cli": def _cli_deletion_flow(): try: self.mcg_obj.exec_mcg_cmd( f"backingstore delete {self.name}") return True except CommandFailed as e: if "being used by one or more buckets" in str(e).lower(): log.warning( f"Deletion of {self.name} failed because it's being used by a bucket. " "Retrying...") return False sample = TimeoutSampler( timeout=120, sleep=20, func=_cli_deletion_flow, ) if not sample.wait_for_func_status(result=True): log.error(f"Failed to {self.name}") raise TimeoutExpiredError log.info( f"Verifying whether backingstore {self.name} exists after deletion" ) bs_deleted_successfully = False try: if self.method == "oc": OCP( kind="backingstore", namespace=config.ENV_DATA["cluster_namespace"], resource_name=self.name, ).get() elif self.method == "cli": self.mcg_obj.exec_mcg_cmd(f"backingstore status {self.name}") except CommandFailed as e: if "Not Found" in str(e) or "NotFound" in str(e): bs_deleted_successfully = True else: raise assert (bs_deleted_successfully ), f"Backingstore {self.name} was not deleted successfully" if "pv-backingstore" in self.name.lower(): log.info( f"Waiting for backingstore {self.name} resources to be deleted" ) wait_for_pv_backingstore_resource_deleted(self.name)
def wait_for_peer_ready_status(self):
    """
    Wait for the PeerReady status to become True

    """
    logger.info("Waiting for PeerReady status to be True")
    sample = TimeoutSampler(timeout=300, sleep=10, func=self.get_peer_ready_status)
    assert sample.wait_for_func_status(
        result=True
    ), "PeerReady status is not true, failover or relocate action can not be performed"
def finalizer():
    must_gather_pods = self.ocs.get_pods(label_selector="app=must-gather")
    logger.info(f"must_gather_pods: {must_gather_pods}")
    sample_pods = TimeoutSampler(
        timeout=30,
        sleep=3,
        func=check_for_must_gather_pod,
    )
    sample_namespace = TimeoutSampler(
        timeout=30,
        sleep=3,
        func=check_for_must_gather_project,
    )
    if sample_pods.wait_for_func_status(result=True):
        for must_gather_pod in must_gather_pods:
            self.ocp_obj.wait_for_delete(resource_name=must_gather_pod)
            logger.info(f"deleted pods: {must_gather_pods}")
    if not sample_namespace.wait_for_func_status(result=False):
        must_gather_namespace = check_for_must_gather_project()
        logger.info(f"namespace to delete: {must_gather_namespace}")
        self.ocp_obj.wait_for_delete(resource_name=must_gather_namespace)
def stop_baremetal_machines(self, baremetal_machine, force=True):
    """
    Stop Baremetal Machines

    Args:
        baremetal_machine (list): BM objects
        force (bool): True for BM ungraceful power off, False for graceful BM shutdown

    Raises:
        UnexpectedBehaviour: If baremetal machine is still up

    """
    for node in baremetal_machine:
        if force:
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                logger.info(f"Powering Off {node.name}")
                ipmi_ctx.chassis_control_power_down()
        else:
            ocp = OCP(kind="node")
            ocp.exec_oc_debug_cmd(
                node=node.name, cmd_list=["shutdown now"], timeout=60
            )
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                for status in TimeoutSampler(
                    600, 5, self.get_power_status, ipmi_ctx
                ):
                    logger.info(
                        f"Waiting for Baremetal Machine {node.name} to power off. "
                        f"Current Baremetal status: {status}"
                    )
                    if status == VM_POWERED_OFF:
                        logger.info(
                            f"Baremetal Machine {node.name} reached poweredOff status"
                        )
                        break

        logger.info("Verifying machine is down")
        ret = TimeoutSampler(
            timeout=300,
            sleep=3,
            func=self.verify_machine_is_down,
            node=node,
        )
        logger.info(ret)
        if not ret.wait_for_func_status(result=True):
            raise UnexpectedBehaviour(f"Machine {node.name} is still Running")
def test_rook_ceph_operator_log_type(self):
    """
    Test the ability to change the log level in rook-ceph operator dynamically
    without rook-ceph operator pod restart.

    """
    set_configmap_log_level_rook_ceph_operator(value="DEBUG")
    last_log_date_time_obj = get_last_log_time_date()

    log.info("Respin OSD pod")
    osd_pod_objs = get_osd_pods()
    osd_pod_obj = random.choice(osd_pod_objs)
    osd_pod_obj.delete()

    sample = TimeoutSampler(
        timeout=400,
        sleep=20,
        func=check_osd_log_exist_on_rook_ceph_operator_pod,
        last_log_date_time_obj=last_log_date_time_obj,
        expected_strings=["D |", "osd"],
    )
    if not sample.wait_for_func_status(result=True):
        raise ValueError("OSD DEBUG Log does not exist")

    set_configmap_log_level_rook_ceph_operator(value="INFO")
    last_log_date_time_obj = get_last_log_time_date()

    log.info("Respin OSD pod")
    osd_pod_objs = get_osd_pods()
    osd_pod_obj = random.choice(osd_pod_objs)
    osd_pod_obj.delete()

    sample = TimeoutSampler(
        timeout=400,
        sleep=20,
        func=check_osd_log_exist_on_rook_ceph_operator_pod,
        last_log_date_time_obj=last_log_date_time_obj,
        expected_strings=["I |", "osd"],
        unexpected_strings=["D |"],
    )
    if not sample.wait_for_func_status(result=True):
        raise ValueError(
            "OSD INFO Log does not exist or DEBUG Log exist on INFO mode"
        )
def _deploy_es(self):
    """
    Deploying the Elasticsearch server

    """
    # Creating PVC for the elasticsearch server and wait until it is bound
    log.info("Creating a 10 GiB PVC for the ElasticSearch cluster")
    try:
        self.pvc_obj = create_pvc(
            sc_name=self.args.get("sc") or constants.CEPHBLOCKPOOL_SC,
            namespace=self.namespace,
            pvc_name="elasticsearch-data-quickstart-es-default-0",
            access_mode=constants.ACCESS_MODE_RWO,
            size="10Gi",
        )

        # Make sure the PVC is bound, or delete it and raise exception
        wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
    except ResourceWrongStatusException:
        log.error("The PVC couldn't be created")
        return False

    self.pvc_obj.reload()

    log.info("Deploy the ElasticSearch cluster")
    self.ocp.apply(self.crd)

    sample = TimeoutSampler(
        timeout=300,
        sleep=10,
        func=self._pod_is_found,
        pattern="quickstart-es-default",
    )
    if not sample.wait_for_func_status(True):
        log.error("The ElasticSearch pod deployment failed")
        return False

    self.espod = get_pod_name_by_pattern("quickstart-es-default", self.namespace)[0]
    log.info(f"The ElasticSearch pod {self.espod} started")

    es_pod = OCP(kind="pod", namespace=self.namespace)
    log.info("Waiting for ElasticSearch to run")
    if not es_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=self.espod,
        sleep=30,
        timeout=600,
    ):
        log.error("The ElasticSearch pod is not running!")
        return False
    else:
        log.info("Elastic Search is ready!")
        return True
def _create_nss(method, nss_dict):
    """
    Tracks creation and cleanup of all the namespace stores that were created
    in the current scope

    Args:
        method (str): String for selecting method of namespace store creation (CLI/OC)
        nss_dict (dict): Dictionary containing storage provider as key and a list
            of tuples as value.
            Namespace store dictionary examples - 'CloudName': [(amount, region), (amount, region)]
            i.e. - 'aws': [(3, us-west-1),(2, eu-west-2)]

    Returns:
        list: A list of the NamespaceStore objects created by the factory in the current scope

    """
    current_call_created_nss = []
    for platform, nss_lst in nss_dict.items():
        for nss_tup in nss_lst:
            # Create the actual namespace resource
            nss_name = create_unique_resource_name(constants.MCG_NSS, platform)
            target_bucket_name = cmdMap[method.lower()](
                nss_name, nss_tup[1], cld_mgr, cloud_uls_factory, platform
            )
            # TODO: Check platform exists in endpointMap

            sample = TimeoutSampler(
                timeout=60,
                sleep=5,
                func=mcg_obj.check_ns_resource_validity,
                ns_resource_name=nss_name,
                target_bucket_name=target_bucket_name,
                endpoint=endpointMap[platform],
            )
            if not sample.wait_for_func_status(result=True):
                log.error(f"{nss_name} failed its verification check")
                raise TimeoutExpiredError

            nss_obj = NamespaceStore(
                name=nss_name,
                method=method.lower(),
                mcg_obj=mcg_obj,
                uls_name=target_bucket_name,
            )

            nss_obj.verify_health()

            created_nss.append(nss_obj)
            current_call_created_nss.append(nss_obj)

    return current_call_created_nss
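# Illustrative shape of the nss_dict argument consumed by _create_nss() above, following
# the docstring's 'CloudName': [(amount, region), ...] convention; the amounts and regions
# here are example values only.
example_nss_dict = {
    "aws": [(3, "us-west-1"), (2, "eu-west-2")],
}
# A hypothetical factory call would then be:
#     created_stores = _create_nss(method="oc", nss_dict=example_nss_dict)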
def delete(self): """ Deletes the current namespacestore by using OC/CLI commands """ log.info(f"Cleaning up namespacestore {self.name}") if self.method == "oc": OCP( kind="namespacestore", namespace=config.ENV_DATA["cluster_namespace"] ).delete(resource_name=self.name) elif self.method == "cli": def _cli_deletion_flow(): try: self.mcg_obj.exec_mcg_cmd(f"namespacestore delete {self.name}") return True except CommandFailed as e: if "being used by one or more buckets" in str(e).lower(): log.warning( f"Deletion of {self.name} failed because it's being used by a bucket. " "Retrying..." ) else: log.warning(f"Deletion of self.name failed. Error:\n{str(e)}") return False sample = TimeoutSampler( timeout=120, sleep=20, func=_cli_deletion_flow, ) if not sample.wait_for_func_status(result=True): log.error(f"Failed to {self.name}") raise TimeoutExpiredError log.info(f"Verifying whether namespacestore {self.name} exists after deletion") ns_deleted_successfully = False if self.method == "oc": OCP( kind="namespacestore", namespace=config.ENV_DATA["cluster_namespace"], resource_name=self.name, ).get() elif self.method == "cli": if self.name not in self.mcg_obj.exec_mcg_cmd("namespacestore list"): ns_deleted_successfully = True assert ( ns_deleted_successfully ), f"Namespacestore {self.name} was not deleted successfully"
def cluster_health_check(self, timeout=300):
    """
    This would be a comprehensive cluster health check which includes
    checking pods, external ceph cluster health.

    Raises:
        CephHealthException: if cluster health is NOT OK

    """
    sample = TimeoutSampler(timeout=timeout, sleep=3, func=self.is_health_ok)
    if not sample.wait_for_func_status(result=True):
        raise exceptions.CephHealthException("Cluster health is NOT OK")

    self.wait_for_noobaa_health_ok()
    self.validate_pvc()
def test_osd_heap_profile(self):
    """
    Generate heap profile dump file for OSDs and verify whether the file
    is created on '/var/log/ceph/'

    """
    strings_err = ["error", "fail"]
    osd_pods = get_osd_pods()
    osd_id = str(random.randint(0, len(osd_pods) - 1))
    log.info(f"Start heap profiler for osd-{osd_id}")
    pod_tool = get_ceph_tools_pod()
    out = pod_tool.exec_cmd_on_pod(
        command=f"ceph tell osd.{osd_id} heap start_profiler", out_yaml_format=False
    )
    log.info(f"command output:{out}")
    for string_err in strings_err:
        assert (
            string_err not in out.lower()
        ), f"{string_err} on the output command {out}"

    log.info("Sleep 10 sec, for running heap profiler")
    time.sleep(10)

    log.info("Dump heap profile")
    out = pod_tool.exec_sh_cmd_on_pod(command=f"ceph tell osd.{osd_id} heap dump")
    log.info(out)
    for string_err in strings_err:
        assert (
            string_err not in out.lower()
        ), f"{string_err} on the output command {out}"

    log.info(f"Get osd-{osd_id} pod object")
    for osd_pod in osd_pods:
        if get_osd_pod_id(osd_pod) == osd_id:
            osd_pod_profile = osd_pod

    osd_profile_str = f"osd.{osd_id}.profile"
    log.info(f"Verify {osd_profile_str} log exist on /var/log/ceph/")
    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=self.verify_output_command_osd_pod,
        command="ls -ltr /var/log/ceph/",
        pod_obj=osd_pod_profile,
        str_to_check=osd_profile_str,
    )
    if not sample.wait_for_func_status(result=True):
        log.error(f"{osd_profile_str} log does not exist on /var/log/ceph")
        raise ValueError(f"{osd_profile_str} log does not exist on /var/log/ceph")

    log.info(f"osd.{osd_id}.profile log exist on /var/log/ceph")