def validate_project_exists(self, pvc_obj):
    """
    Checks whether the new project exists in the EFK stack and is healthy
    """
    pod_list = get_all_pods(namespace='openshift-logging')
    elasticsearch_pod = [
        pod.name for pod in pod_list if pod.name.startswith('elasticsearch')
    ]
    elasticsearch_pod_obj = get_pod_obj(
        name=elasticsearch_pod[1], namespace='openshift-logging'
    )
    project_index = elasticsearch_pod_obj.exec_cmd_on_pod(
        command='indices', out_yaml_format=False
    )
    project = pvc_obj.project.namespace
    if project in project_index:
        logger.info(f'The project {project} exists in the EFK stack')
        for item in project_index.split("\n"):
            if project in item:
                logger.info(item.strip())
                # A healthy index reports 'green' in the indices listing
                assert 'green' in item.strip(), (
                    f"Project {project} is unhealthy"
                )
    else:
        raise ModuleNotFoundError(
            f"Project {project} does not exist in the EFK stack"
        )

def validate_image_exists(app="redis"):
    """
    Validate that an image exists on the registry path

    Args:
        app (str): Label or application name

    Returns:
        image_list (str): Dirs/files/images listed in string format

    Raises:
        Exception if the dir/folders are not found

    """
    if not config.DEPLOYMENT.get("disconnected"):
        pod_list = get_pod_name_by_pattern(
            pattern="image-registry",
            namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE,
        )
        for pod_name in pod_list:
            if "cluster" not in pod_name:
                pod_obj = pod.get_pod_obj(
                    name=pod_name,
                    namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE,
                )
                return pod_obj.exec_cmd_on_pod(
                    command=f"find /registry/docker/registry/v2/repositories/openshift/{app}"
                )

def validate_image_exists(namespace=None):
    """
    Validate that an image exists on the registry path

    Args:
        namespace (str): Namespace where the images/builds are created

    Returns:
        image_list (str): Dirs/files/images listed in string format

    Raises:
        Exception if the dir/folders are not found

    """
    if not config.DEPLOYMENT.get('disconnected'):
        pod_list = get_pod_name_by_pattern(
            pattern="image-registry",
            namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE,
        )
        for pod_name in pod_list:
            if "cluster" not in pod_name:
                pod_obj = pod.get_pod_obj(
                    name=pod_name,
                    namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE,
                )
                return pod_obj.exec_cmd_on_pod(
                    command=f"find /registry/docker/registry/v2/repositories/{namespace}"
                )

def get_node_name_where_jenkins_pod_not_hosted(
    self, node_type=constants.WORKER_MACHINE, num_of_nodes=1
):
    """
    Get names of nodes that do not host a Jenkins pod

    Args:
        node_type (str): The node type (e.g. worker, master)
        num_of_nodes (int): The number of nodes to be returned

    Returns:
        list: List of compute node names

    """
    if node_type == constants.MASTER_MACHINE:
        nodes_drain = [
            node.name for node in get_typed_nodes(
                node_type=node_type, num_of_nodes=num_of_nodes
            )
        ]
    elif node_type == constants.WORKER_MACHINE:
        pod_objs = []
        for project in self.projects:
            pod_names = get_pod_name_by_pattern(
                pattern='jenkins', namespace=project
            )
            pod_obj = [
                get_pod_obj(name=pod_name, namespace=project)
                for pod_name in pod_names
            ]
            pod_objs += pod_obj
        nodes_app_name = set(get_app_pod_running_nodes(pod_objs))
        nodes_worker_name = set(get_worker_nodes())
        nodes_drain = nodes_worker_name - nodes_app_name
    else:
        raise ValueError("The node type must be 'worker' or 'master'")
    return list(nodes_drain)[:num_of_nodes]

def check_health_of_clusterlogging():
    """
    Checks for the ElasticSearch, curator, fluentd and kibana pods in the
    openshift-logging namespace, and checks the health of cluster logging:
    a 'green' status means the cluster is healthy, 'red' means it is not.

    Returns:
        list: All the pods that are present in the namespace

    """
    pod_list = []
    pods = get_all_pods(namespace='openshift-logging')
    logger.info("Pods that are created by the instance")
    for pod in pods:
        pod_list.append(pod.name)
    logger.info(pod_list)
    elasticsearch_pod = [
        pod for pod in pod_list if pod.startswith('elasticsearch')
    ]
    logger.info(elasticsearch_pod)
    pod_obj = get_pod_obj(
        name=elasticsearch_pod[0], namespace='openshift-logging'
    )
    status_check = pod_obj.exec_cmd_on_pod(
        command='es_util --query=_cluster/health?pretty',
        out_yaml_format=False
    )
    logger.info(status_check)
    status_check = json.loads(status_check)
    if status_check['status'] == 'green':
        logger.info("Cluster logging is in Healthy state & Ready to use")
    else:
        logger.error("Cluster logging is in Bad state")
    return pod_list

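# A minimal usage sketch for check_health_of_clusterlogging() above; not part
# of the original suite. It assumes the function is in scope and that the EFK
# stack runs in 'openshift-logging'; the component prefixes checked here are
# illustrative assumptions.
def sanity_check_logging_pods():
    pod_names = check_health_of_clusterlogging()
    for component in ('elasticsearch', 'fluentd', 'kibana'):
        assert any(name.startswith(component) for name in pod_names), (
            f"No {component} pod found in openshift-logging"
        )
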
def validate_messages_are_produced(
    self, namespace=constants.AMQ_NAMESPACE, value="10000", since_time=1800
):
    """
    Validates that all messages were sent by the producer pod

    Args:
        namespace (str): Namespace of the pod
        value (str): Number of messages to be sent
        since_time (int): Number of seconds to look back in the logs
            for sent messages

    Raises exception on failures

    """
    # ToDo: Support multiple topics and users
    producer_pod_objs = [
        get_pod_obj(pod)
        for pod in get_pod_name_by_pattern("hello-world-produce", namespace)
    ]
    for pod in producer_pod_objs:
        for msg in TimeoutSampler(
            900, 30, self.validate_msg, pod.name, namespace, value, since_time
        ):
            if msg:
                break
    assert msg, "Few messages are not sent by producer"
    log.info("Producer sent all messages")

def get_new_pods(self, pod_list):
    """
    Fetches info about the respun pods in the cluster

    Args:
        pod_list (list): list of previous pod objects

    Returns:
        list: list of respun pod objects

    """
    new_pods = []
    for pod_obj in pod_list:
        # `prefix` replaces a loop variable that shadowed the builtin `str`
        if any(prefix in pod_obj.name for prefix in ['mon', 'osd']):
            pod_label = pod_obj.labels.get('pod-template-hash')
            label_selector = f'pod-template-hash={pod_label}'
        else:
            pod_label = pod_obj.labels.get('deploymentconfig')
            label_selector = f'deploymentconfig={pod_label}'
        pods_data = pod.get_pods_having_label(
            label_selector, pod_obj.namespace
        )
        for pod_data in pods_data:
            pod_name = pod_data.get('metadata').get('name')
            # Skip deploy pods and the pod that was replaced
            if '-deploy' not in pod_name and pod_name not in pod_obj.name:
                new_pods.append(
                    pod.get_pod_obj(pod_name, pod_obj.namespace)
                )
    logger.info(f"Previous pods: {[pod_obj.name for pod_obj in pod_list]}")
    logger.info(f"Respun pods: {[pod_obj.name for pod_obj in new_pods]}")
    return new_pods

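# A hedged sketch of how get_new_pods() above is typically driven: delete the
# old pods so their controllers respin them, then resolve the replacements.
# `disruption` is a hypothetical instance of the class defining get_new_pods;
# pod_obj.delete(wait=True) matches the delete() usage elsewhere in this file.
def respin_pods_and_fetch_replacements(disruption, pod_list):
    for pod_obj in pod_list:
        pod_obj.delete(wait=True)  # the owning controller recreates the pod
    new_pods = disruption.get_new_pods(pod_list)
    assert len(new_pods) >= len(pod_list), "Some pods were not respun"
    return new_pods
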
def test_create_new_project_to_verify_logging(
    self, create_pvc_and_deploymentconfig_pod
):
    """
    This function creates a new project to verify logging in the EFK stack:
    1. Creates a new project
    2. Creates a PVC
    3. Creates a deployment pod in the new project and runs IO on the app pod
    4. Logs into the EFK stack and checks for the new project
    5. Checks the file count of the new project in the EFK stack
    """
    pod_obj, pvc_obj = create_pvc_and_deploymentconfig_pod

    # Running IO on the app pod
    pod_obj.run_io(storage_type='fs', size=8000)

    # Searching for the new project in the EFK stack
    pod_list = get_all_pods(namespace='openshift-logging')
    elasticsearch_pod = [
        pod.name for pod in pod_list if pod.name.startswith('elasticsearch')
    ]
    elasticsearch_pod_obj = get_pod_obj(
        name=elasticsearch_pod[1], namespace='openshift-logging'
    )
    projects = elasticsearch_pod_obj.exec_cmd_on_pod(
        command='indices | grep project', out_yaml_format=True
    )
    logger.info(projects)
    if pvc_obj.project.namespace in projects:
        logger.info("The new project exists in the EFK stack")
    else:
        raise ModuleNotFoundError(
            "The new project does not exist in the EFK stack"
        )

def validate_messages_are_consumed(
    self, namespace=constants.AMQ_NAMESPACE, value="10000", since_time=1800
):
    """
    Validates that all messages were received by the consumer pod

    Args:
        namespace (str): Namespace of the pod
        value (str): Number of messages to be received
        since_time (int): Number of seconds to look back in the logs
            for received messages

    Raises exception on failures

    """
    # ToDo: Support multiple topics and users
    consumer_pod_objs = [
        get_pod_obj(pod)
        for pod in get_pod_name_by_pattern("hello-world-consumer", namespace)
    ]
    for pod in consumer_pod_objs:
        for msg in TimeoutSampler(
            900, 30, self.validate_msg, pod.name, namespace, value, since_time
        ):
            if msg:
                break
    assert msg, "Consumer didn't receive all messages"
    log.info("Consumer received all messages")

def get_couchbase_nodes(self):
    """
    Get nodes that contain a couchbase app pod

    Returns:
        list: List of nodes

    """
    app_pods_list = get_pod_name_by_pattern(
        "cb-example", constants.COUCHBASE_OPERATOR
    )
    app_pod_objs = list()
    for pod in app_pods_list:
        app_pod_objs.append(
            get_pod_obj(pod, namespace=constants.COUCHBASE_OPERATOR)
        )
    log.info("Create a list of nodes that contain a couchbase app pod")
    nodes_set = set()
    for pod in app_pod_objs:
        # Unified on the module-level `log` logger used above
        log.info(
            f"pod {pod.name} located on "
            f"node {pod.get().get('spec').get('nodeName')}"
        )
        nodes_set.add(pod.get().get("spec").get("nodeName"))
    return list(nodes_set)

def noobaa_running_node_restart(pod_name):
    """
    Restart the node on which the given noobaa pod is running

    Args:
        pod_name (str): Name of the noobaa pod

    """
    nb_pod_obj = pod.get_pod_obj(
        (
            get_pod_name_by_pattern(
                pattern=pod_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
        )[0],
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    nb_node_name = pod.get_pod_node(nb_pod_obj).name
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    nb_nodes = get_node_objs(node_names=nb_node_name)
    log.info(f"{pod_name} is running on {nb_node_name}")
    log.info(f"Restarting node: {nb_node_name}....")
    nodes.restart_nodes_by_stop_and_start(nodes=nb_nodes, force=True)

    # Validate nodes are up and running
    wait_for_nodes_status()
    ceph_health_check(tries=30, delay=60)
    helpers.wait_for_resource_state(
        nb_pod_obj, constants.STATUS_RUNNING, timeout=180
    )

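# Illustrative call of the helper above; 'noobaa-core' is an assumed pod-name
# pattern matching how get_pod_name_by_pattern is used inside the function.
noobaa_running_node_restart(pod_name='noobaa-core')
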
def test_delete_rook_ceph_mon_pod(self):
    for _ in range(5):
        rook_operator_pod = pod.get_ocs_operator_pod(
            ocs_label=constants.OPERATOR_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        assert rook_operator_pod, "No rook operator pod found"
        log.info(
            f"Found rook-operator pod {rook_operator_pod.name}. Deleting it."
        )
        operator_deleted = rook_operator_pod.delete(wait=False)
        assert operator_deleted, f"Failed to delete pod {rook_operator_pod.name}"
        try:
            for pod_list in TimeoutSampler(
                30,
                1,
                pod.get_pods_having_label,
                constants.ROOK_CEPH_DETECT_VERSION_LABEL,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            ):
                if len(pod_list) > 0:
                    self.rook_detect_pod_name = (
                        pod_list[0].get("metadata").get("name")
                    )
                    self.rook_detect_pod_obj = pod.get_pod_obj(
                        self.rook_detect_pod_name,
                        constants.OPENSHIFT_STORAGE_NAMESPACE,
                    )
                    break
        except TimeoutExpiredError:
            # Timing out means the detect-version pod never appeared, so fail
            # (the original `assert True` could never fail)
            assert False, "rook-ceph-detect-version pod not found"
        log.info(
            f"Found rook-ceph-detect-version pod {self.rook_detect_pod_name}. "
            f"Deleting it"
        )
        rook_detect_deleted = self.rook_detect_pod_obj.delete(wait=True)
        assert (
            rook_detect_deleted
        ), f"Failed to delete pod {self.rook_detect_pod_name}"
        self.rook_detect_pod_obj.ocp.wait_for_delete(self.rook_detect_pod_name)

        # Make sure there's no detect-version pod leftover
        try:
            for pod_list in TimeoutSampler(
                30,
                1,
                pod.get_pods_having_label,
                constants.ROOK_CEPH_DETECT_VERSION_LABEL,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            ):
                if len(pod_list) == 0:
                    break
                else:
                    log.info(
                        f"Pod {pod_list[0].get('metadata').get('name')} found. "
                        f"Waiting for it to be deleted"
                    )
        except TimeoutExpiredError:
            # Timing out here means the leftover pod was never deleted
            assert False, "rook-ceph-detect-version pod still exists"

def get_pgbench_pods(self):
    """
    Get all pgbench pods

    Returns:
        list: pgbench pod objects

    """
    return [
        get_pod_obj(pod, RIPSAW_NAMESPACE)
        for pod in get_pod_name_by_pattern("pgbench", RIPSAW_NAMESPACE)
    ]

def validate_pvc_are_mounted_on_monitoring_pods(pod_list):
    """
    Validate that the created PVCs are mounted on the monitoring pods

    Args:
        pod_list (list): List of the pods on which the PVCs are mounted

    """
    for pod in pod_list:
        pod_obj = get_pod_obj(
            name=pod, namespace='openshift-monitoring'
        )
        mount_point = pod_obj.exec_cmd_on_pod(command="df -kh")
        assert "/dev/rbd" in mount_point, f"pvc is not mounted on pod {pod}"
    logger.info("Verified all pvc are mounted on monitoring pods")

def validate_pvc_are_mounted_on_monitoring_pods(pod_list):
    """
    Validate that the created PVCs are mounted on the monitoring pods

    Args:
        pod_list (list): List of the pods on which the PVCs are mounted

    """
    for pod in pod_list:
        pod_obj = get_pod_obj(
            name=pod.name, namespace=defaults.OCS_MONITORING_NAMESPACE
        )
        mount_point = pod_obj.exec_cmd_on_pod(command="df -kh")
        assert "/dev/rbd" in mount_point, (
            f"pvc is not mounted on pod {pod.name}"
        )
    logger.info("Verified all pvc are mounted on monitoring pods")

def get_jenkins_deploy_pods(self, namespace):
    """
    Get all jenkins deploy pods

    Args:
        namespace (str): Namespace to get the pods from

    Returns:
        list: jenkins deploy pod objects

    """
    return [
        get_pod_obj(pod, namespace=namespace)
        for pod in get_pod_name_by_pattern('deploy', namespace=namespace)
    ]

def get_pgbench_status(self, pgbench_pod_name):
    """
    Get pgbench status

    Args:
        pgbench_pod_name (str): Name of the pgbench pod

    Returns:
        str: state of the pgbench pod ('running', or the terminated
            reason such as 'Completed')

    """
    pod_obj = get_pod_obj(pgbench_pod_name, namespace=RIPSAW_NAMESPACE)
    status = (
        pod_obj.get().get('status').get('containerStatuses')[0].get('state')
    )
    return (
        'running' if list(status.keys())[0] == 'running'
        else status['terminated']['reason']
    )

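# A hedged sketch combining the two pgbench helpers above: list the pgbench
# pods, then poll each until its container reports the terminated reason
# 'Completed'. `benchmark` stands in for the instance defining the helpers;
# the timeout/interval values are illustrative assumptions.
import time

def wait_for_pgbench_completion(benchmark, timeout=900, interval=15):
    deadline = time.time() + timeout
    for pgbench_pod in benchmark.get_pgbench_pods():
        while benchmark.get_pgbench_status(pgbench_pod.name) != 'Completed':
            if time.time() > deadline:
                raise TimeoutError(
                    f"pgbench pod {pgbench_pod.name} did not complete"
                )
            time.sleep(interval)
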
def respin_couchbase_app_pod(self):
    """
    Respin a randomly chosen couchbase app pod and wait for it to
    reach Running state
    """
    app_pod_list = get_pod_name_by_pattern(
        'cb-example', constants.COUCHBASE_OPERATOR
    )
    app_pod = random.choice(app_pod_list)
    logging.info(f"respin pod {app_pod}")
    app_pod_obj = get_pod_obj(
        app_pod, namespace=constants.COUCHBASE_OPERATOR
    )
    app_pod_obj.delete(wait=True, force=False)
    wait_for_resource_state(
        resource=app_pod_obj, state=constants.STATUS_RUNNING, timeout=300
    )

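# Sketch tying the couchbase helpers together: respin one app pod and confirm
# couchbase pods are still scheduled on cluster nodes afterwards. `cb` is a
# hypothetical instance of the class assumed to define both methods above.
def respin_and_verify_couchbase_placement(cb):
    nodes_before = set(cb.get_couchbase_nodes())
    cb.respin_couchbase_app_pod()  # deletes a random pod, waits for Running
    nodes_after = set(cb.get_couchbase_nodes())
    assert nodes_after, "No couchbase app pods found after the respin"
    return nodes_before, nodes_after
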
def validate_project_exists(self, pvc_obj):
    """
    This function checks whether the new project exists in the EFK stack
    """
    pod_list = get_all_pods(namespace='openshift-logging')
    elasticsearch_pod = [
        pod.name for pod in pod_list if pod.name.startswith('elasticsearch')
    ]
    elasticsearch_pod_obj = get_pod_obj(
        name=elasticsearch_pod[1], namespace='openshift-logging'
    )
    projects = elasticsearch_pod_obj.exec_cmd_on_pod(
        command='indices | grep project', out_yaml_format=True
    )
    logger.info(projects)
    if pvc_obj.project.namespace in projects:
        logger.info("The new project exists in the EFK stack")
    else:
        raise ModuleNotFoundError(
            "The new project does not exist in the EFK stack"
        )

def run_amq_workload(self, command, benchmark_pod_name, tiller_namespace, timeout):
    """
    Runs the amq workload in the background

    Args:
        command (str): Command to run on the pod
        benchmark_pod_name (str): Pod name
        tiller_namespace (str): Namespace of the pod
        timeout (int): Time to complete the run

    Returns:
        result (str): Benchmark run information

    """
    pod_obj = get_pod_obj(
        name=f"{benchmark_pod_name}-driver", namespace=tiller_namespace
    )
    return pod_obj.exec_cmd_on_pod(
        command=command, out_yaml_format=False, timeout=timeout
    )

def _cosbench_cli(self, workload):
    """
    Runs the Cosbench CLI to initiate the workload

    Args:
        workload (str): Workload file

    """
    submit_key = "Accepted with ID"
    cosbench_pod_obj = get_pod_obj(
        name=self.cosbench_pod.name, namespace=self.namespace
    )
    submit = cosbench_pod_obj.exec_cmd_on_pod(
        command=f"/cos/cli.sh submit /cos/{workload}",
        out_yaml_format=True,
        timeout=180,
    )
    if submit_key in submit.keys():
        self.workload_id = submit[submit_key]
    else:
        # A bare `assert <non-empty string>` always passes; fail explicitly
        raise AssertionError(
            f"Failed to submit the workload, ID not found. stdout: {submit}"
        )

def test_delete_local_volume_sym_link(self):
    """
    Delete the sym link on an LSO cluster
    """
    # Get rook-ceph-crashcollector pod objects
    crashcollector_pods = get_pod_name_by_pattern(
        pattern="rook-ceph-crashcollector", namespace=ROOK_CLUSTER_NAMESPACE
    )
    crashcollector_pods_objs = []
    for crashcollector_pod in crashcollector_pods:
        crashcollector_pods_objs.append(
            get_pod_obj(
                name=crashcollector_pod, namespace=ROOK_CLUSTER_NAMESPACE
            )
        )

    # Get node object
    node_obj = get_pod_node(pod_obj=crashcollector_pods_objs[0])

    # Get sym link
    osd_pvcs = get_deviceset_pvcs()
    pv_name = osd_pvcs[0].data["spec"]["volumeName"]
    ocp_obj = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE, kind=constants.PV)
    pv_obj = ocp_obj.get(resource_name=pv_name)
    path = pv_obj["spec"]["local"]["path"]

    log.info("Delete sym link")
    oc_cmd = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE)
    cmd = f"rm -rfv {path}"
    oc_cmd.exec_oc_debug_cmd(node=node_obj.name, cmd_list=[cmd])

    log.info(
        "Waiting for rook-ceph-crashcollector pods to reach Running state"
    )
    for crashcollector_pods_obj in crashcollector_pods_objs:
        wait_for_resource_state(
            resource=crashcollector_pods_obj, state=constants.STATUS_RUNNING
        )

    # Check all OCS pods status; they should be in Running or Completed state
    wait_for_storage_pods()

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])

def workloads_dir_setup(self, request):
    """
    Setting up the environment for the test
    """
    if config.DEPLOYMENT.get("local_storage"):
        self.worker_node = node.get_worker_nodes()[0]
        self.oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        mon_pod_name = self.oc_cmd.exec_oc_debug_cmd(
            node=self.worker_node,
            cmd_list=["ls /var/lib/rook/ | grep mon"],
        )
        mon_pod_id = mon_pod_name.split("-")[1].replace("\n", "")
        mon_pods_info = pod.get_pods_having_label(
            label=f"ceph_daemon_id={mon_pod_id}",
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
        self.mon_pod = pod.get_pod_obj(
            name=mon_pods_info[0]["metadata"]["name"],
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
    else:
        self.mon_pod = random.choice(pod.get_mon_pods())
    self.mon_suffix = (
        self.mon_pod.get().get("metadata").get("labels").get("mon")
    )

    self.workloads_dir = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
    log.info(f"Selected mon '{self.mon_pod.name}'")
    self.mon_pod.exec_cmd_on_pod(f"mkdir {self.workloads_dir}")
    self.mon_pod.exec_cmd_on_pod(f"touch {self.workloads_dir}/{TEMP_FILE}")

    def finalizer():
        self.mon_pod.exec_cmd_on_pod(f"rm -rf {self.workloads_dir}")
        time.sleep(SLEEP_TIMEOUT)
        utils.ceph_health_check()

    request.addfinalizer(finalizer)

def get_helper_pods_output():
    """
    Get the describe output and logs of the mg-helper pods

    Returns:
        str: the output of "oc describe pod" and "oc logs" for the
            mg-helper pods

    """
    from ocs_ci.ocs.resources.pod import get_pod_obj, get_pod_logs

    output_describe_mg_helper = ""
    helper_pods = get_pod_name_by_pattern(pattern="helper")
    for helper_pod in helper_pods:
        try:
            helper_pod_obj = get_pod_obj(
                name=helper_pod,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            output_describe_mg_helper += (
                f"****helper pod {helper_pod} describe****\n{helper_pod_obj.describe()}\n"
                f"****helper pod {helper_pod} logs***\n{get_pod_logs(pod_name=helper_pod)}"
            )
        except Exception as e:
            log.error(e)
    return output_describe_mg_helper

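# A hedged example of when get_helper_pods_output() is useful: dump the
# mg-helper pod diagnostics when a must-gather step fails. The callable
# passed in stands for the failing step and is a hypothetical name.
def run_must_gather_with_diagnostics(collect_must_gather):
    try:
        return collect_must_gather()
    except Exception:
        log.error(get_helper_pods_output())
        raise
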
def get_spun_dc_pods(pod_list):
    """
    Fetches info about the re-spun dc pods

    Args:
        pod_list (list): list of previous pod objects

    Returns:
        list: list of respun pod objects

    """
    new_pods = []
    for pod_obj in pod_list:
        pod_label = pod_obj.labels.get("deploymentconfig")
        label_selector = f"deploymentconfig={pod_label}"
        pods_data = pod.get_pods_having_label(label_selector, pod_obj.namespace)
        for pod_data in pods_data:
            pod_name = pod_data.get("metadata").get("name")
            if "-deploy" not in pod_name and pod_name not in pod_obj.name:
                new_pods.append(pod.get_pod_obj(pod_name, pod_obj.namespace))
    logger.info(f"Previous pods: {[pod_obj.name for pod_obj in pod_list]}")
    logger.info(f"Re-spun pods: {[pod_obj.name for pod_obj in new_pods]}")
    return new_pods

def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
    """
    Test cluster recovery from disk deletion from the platform side.
    Based on the documented procedure detailed in
    https://bugzilla.redhat.com/show_bug.cgi?id=1823183
    """
    logger.info("Picking a PV to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name

    # Get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs
        if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod for osd_pod in osd_pods
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

    # Get the node that has the OSD pod running on it
    logger.info(
        f"Getting the node that has the OSD pod {osd_pod.name} running on"
    )
    osd_node = get_pod_node(osd_pod)
    osd_prepare_pods = get_osd_prepare_pods()
    osd_prepare_pod = [
        pod for pod in osd_prepare_pods
        if pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    osd_prepare_job_name = (
        osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
    )
    osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        osd_deployment for osd_deployment in get_osd_deployments()
        if osd_deployment.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp.OCP().exec_oc_cmd(
        f"scale --replicas=0 deployment/{osd_deployment_name}"
    )

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    ocp_version = float(get_ocp_version())
    if ocp_version >= 4.6:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
    else:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"
    logger.info(f"Executing OSD removal job on OSD-{osd_id}")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
    osd_removal_job = OCS(**osd_removal_job_yaml)
    osd_removal_job.create(do_reload=False)

    # Get the ocs-osd-removal pod name
    logger.info("Getting the ocs-osd-removal pod name")
    osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
    osd_removal_pod_obj = get_pod_obj(
        osd_removal_pod_name, namespace="openshift-storage"
    )
    osd_removal_pod_obj.ocp.wait_for_resource(
        condition=constants.STATUS_COMPLETED,
        resource_name=osd_removal_pod_name,
    )

    # Verify OSD removal from the ocs-osd-removal pod logs
    logger.info(
        f"Verifying removal of OSD from {osd_removal_pod_name} pod logs"
    )
    logs = get_pod_logs(osd_removal_pod_name)
    pattern = f"purged osd.{osd_id}"
    assert re.search(pattern, logs)

    osd_pvc_name = osd_pvc.name

    if ocp_version < 4.6:
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )
    else:
        # On OCP 4.6 and above the OSD removal job should delete the OSD
        # prepare job, OSD PVC and OSD deployment
        logger.info(
            f"Verifying deletion of OSD prepare job {osd_prepare_job_name}"
        )
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=30
        )
        logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
        logger.info(
            f"Verifying deletion of OSD deployment {osd_deployment_name}"
        )
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=30
        )

    # Delete the PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    if ocp_version < 4.6:
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}"
        )
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
    osd_removal_job.delete()
    osd_removal_job.ocp.wait_for_delete(
        resource_name=f"ocs-osd-removal-{osd_id}"
    )

    timeout = 600
    # Wait for OSD PVC to get created and reach Bound state
    logger.info(
        "Waiting for a new OSD PVC to get created and reach Bound state"
    )
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. "
        f"Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )

    # Wait for OSD pod to get created and reach Running state
    logger.info(
        "Waiting for a new OSD pod to get created and reach Running state"
    )
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. "
        f"Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # We need to silence the old osd crash warning due to BZ
    # https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking:
    # https://github.com/red-hat-storage/ocs-ci/issues/3438
    if ocp_version >= 4.6:
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name
        )
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")

    # Validate cluster is still functional
    self.sanity_helpers.health_check(tries=100)
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)

def test_rgw_kafka_notifications(self, bucket_factory):
    """
    Test to verify rgw kafka notifications
    """
    # Get sc
    sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

    # Deploy amq cluster
    self.amq.setup_amq_cluster(sc.name)

    # Create topic
    self.kafka_topic = self.amq.create_kafka_topic()

    # Create Kafkadrop pod
    (
        self.kafkadrop_pod,
        self.kafkadrop_svc,  # was unpacked into kafkadrop_pod twice
        self.kafkadrop_route,
    ) = self.amq.create_kafkadrop()

    # Get the kafkadrop route
    kafkadrop_host = self.kafkadrop_route.get().get("spec").get("host")

    # Create bucket
    bucketname = bucket_factory(amount=1, interface="RGW-OC")[0].name

    # Get RGW credentials
    rgw_obj = RGW()
    rgw_endpoint, access_key, secret_key = rgw_obj.get_credentials()

    # Clone notify repo
    notify_path = clone_notify()

    # Initialise to put objects
    data = "A random string data to write on created rgw bucket"
    obc_obj = OBC(bucketname)
    s3_resource = boto3.resource(
        "s3",
        verify=retrieve_verification_mode(),
        endpoint_url=rgw_endpoint,
        aws_access_key_id=obc_obj.access_key_id,
        aws_secret_access_key=obc_obj.access_key,
    )
    s3_client = s3_resource.meta.client

    # Initialize notify command to run
    notify_cmd = (
        f"python {notify_path} -e {rgw_endpoint} -a {obc_obj.access_key_id} "
        f"-s {obc_obj.access_key} -b {bucketname} "
        f"-ke {constants.KAFKA_ENDPOINT} -t {self.kafka_topic.name}"
    )
    log.info(f"Running cmd {notify_cmd}")

    # Put objects to bucket
    assert s3_client.put_object(
        Bucket=bucketname, Key="key-1", Body=data
    ), "Failed: Put object: key-1"
    exec_cmd(notify_cmd)

    # Validate in the rgw logs that notifications are sent and
    # no errors are seen
    pattern = "ERROR: failed to create push endpoint"
    rgw_pod_obj = get_rgw_pods()
    rgw_log = get_pod_logs(pod_name=rgw_pod_obj[0].name, container="rgw")
    assert re.search(pattern=pattern, string=rgw_log) is None, (
        f"Error: {pattern} msg found in the rgw logs. "
        f"Validate {pattern} found on rgw logs and also "
        f"rgw bucket notification is working correctly"
    )
    assert s3_client.put_object(
        Bucket=bucketname, Key="key-2", Body=data
    ), "Failed: Put object: key-2"
    exec_cmd(notify_cmd)

    # Validate messages are received on the Kafka side using a curl command.
    # A temporary way to check from the Kafka side; need to check from the UI
    curl_command = (
        f"curl -X GET {kafkadrop_host}/topic/{self.kafka_topic.name} "
        "-H 'content-type: application/vnd.kafka.json.v2+json'"
    )
    json_output = run_cmd(cmd=curl_command)
    new_string = json_output.split()
    messages = new_string[new_string.index("messages</td>") + 1]
    if messages.find("1") == -1:
        raise Exception(
            "Error: Messages are not received on the Kafka side. "
            "RGW bucket notification is not working as expected."
        )

    # Validate the timestamp of the events
    ocs_version = config.ENV_DATA["ocs_version"]
    if Version.coerce(ocs_version) >= Version.coerce("4.8"):
        cmd = (
            f"bin/kafka-console-consumer.sh --bootstrap-server "
            f"{constants.KAFKA_ENDPOINT} --topic {self.kafka_topic.name} "
            f"--from-beginning --timeout-ms 20000"
        )
        pod_list = get_pod_name_by_pattern(
            pattern="my-cluster-zookeeper", namespace=constants.AMQ_NAMESPACE
        )
        zookeeper_obj = get_pod_obj(
            name=pod_list[0], namespace=constants.AMQ_NAMESPACE
        )
        event_obj = zookeeper_obj.exec_cmd_on_pod(command=cmd)
        log.info(f"Event obj: {event_obj}")
        event_time = event_obj.get("Records")[0].get("eventTime")
        format_string = "%Y-%m-%dT%H:%M:%S.%fZ"
        try:
            datetime.strptime(event_time, format_string)
        except ValueError as ef:
            log.error(
                f"Timestamp event {event_time} doesn't match the pattern "
                f"{format_string}"
            )
            raise ef
        log.info(
            f"Timestamp event {event_time} matches the pattern {format_string}"
        )

def run_amq_benchmark(
    self,
    benchmark_pod_name="benchmark",
    kafka_namespace=constants.AMQ_NAMESPACE,
    tiller_namespace=AMQ_BENCHMARK_NAMESPACE,
    num_of_clients=8,
    worker=None,
    timeout=1800,
    amq_workload_yaml=None,
    run_in_bg=False,
):
    """
    Run the benchmark pod and get the results

    Args:
        benchmark_pod_name (str): Name of the benchmark pod
        kafka_namespace (str): Namespace where the kafka cluster is created
        tiller_namespace (str): Namespace where the tiller pod needs to be created
        num_of_clients (int): Number of clients to be created
        worker (str): Workers to create the load on, separated with commas,
            e.g http://benchmark-worker-0.benchmark-worker:8080,
            http://benchmark-worker-1.benchmark-worker:8080
        timeout (int): Time to complete the run
        amq_workload_yaml (dict): Contains amq workload information, keys and values:
            name (str): Name of the workload
            topics (int): Number of topics created
            partitions_per_topic (int): Number of partitions per topic
            message_size (int): Message size
            payload_file (str): Load to run on the workload
            subscriptions_per_topic (int): Number of subscriptions per topic
            consumer_per_subscription (int): Number of consumers per subscription
            producers_per_topic (int): Number of producers per topic
            producer_rate (int): Producer rate
            consumer_backlog_sizegb (int): Size of the backlog in gb
            test_duration_minutes (int): Time to run the workloads
        run_in_bg (bool): If True, the workload will run in the background

    Returns:
        result (str/Thread obj): Benchmark run information if run_in_bg is
            False, otherwise a thread of the amq workload execution

    """
    # Namespace for helm/tiller
    try:
        self.create_namespace(tiller_namespace)
    except CommandFailed as ef:
        if (
            f'project.project.openshift.io "{tiller_namespace}" already exists'
            not in str(ef)
        ):
            raise ef

    # Create rbac file
    try:
        sa_tiller = list(
            templating.load_yaml(constants.AMQ_RBAC_YAML, multi_document=True)
        )
        sa_tiller[0]["metadata"]["namespace"] = tiller_namespace
        sa_tiller[1]["subjects"][0]["namespace"] = tiller_namespace
        self.sa_tiller = OCS(**sa_tiller[0])
        self.crb_tiller = OCS(**sa_tiller[1])
        self.sa_tiller.create()
        self.crb_tiller.create()
    except (CommandFailed, CalledProcessError) as cf:
        log.error("Failed during creation of service account tiller")
        raise cf

    # Install the helm cli (version v2.16.1, as we need the tiller component)
    # and create the tiller pods
    wget_cmd = f"wget -c --read-timeout=5 --tries=0 {URL}"
    untar_cmd = "tar -zxvf helm-v2.16.1-linux-amd64.tar.gz"
    tiller_cmd = (
        f"linux-amd64/helm init --tiller-namespace {tiller_namespace}"
        f" --service-account {tiller_namespace}"
    )
    exec_cmd(cmd=wget_cmd, cwd=self.dir)
    exec_cmd(cmd=untar_cmd, cwd=self.dir)
    exec_cmd(cmd=tiller_cmd, cwd=self.dir)

    # Validate tiller pod is running
    log.info("Waiting for 30s for tiller pod to come up")
    time.sleep(30)
    if self.is_amq_pod_running(
        pod_pattern="tiller", expected_pods=1, namespace=tiller_namespace
    ):
        log.info("Tiller pod is running")
    else:
        raise ResourceWrongStatusException("Tiller pod is not in running state")

    # Create benchmark pods
    log.info("Create benchmark pods")
    values = templating.load_yaml(constants.AMQ_BENCHMARK_VALUE_YAML)
    values["numWorkers"] = num_of_clients
    benchmark_cmd = (
        f"linux-amd64/helm install {constants.AMQ_BENCHMARK_POD_YAML}"
        f" --name {benchmark_pod_name} --tiller-namespace {tiller_namespace}"
    )
    exec_cmd(cmd=benchmark_cmd, cwd=self.dir)

    # Making sure the benchmark pod and clients are running
    if self.is_amq_pod_running(
        pod_pattern="benchmark",
        expected_pods=(1 + num_of_clients),
        namespace=tiller_namespace,
    ):
        log.info("All benchmark pods are up and running")
    else:
        raise ResourceWrongStatusException(
            "Benchmark pod is not getting to running state"
        )

    # Update commonConfig with kafka-bootstrap server details
    driver_kafka = templating.load_yaml(constants.AMQ_DRIVER_KAFKA_YAML)
    driver_kafka["commonConfig"] = (
        f"bootstrap.servers=my-cluster-kafka-bootstrap."
        f"{kafka_namespace}.svc.cluster.local:9092"
    )
    json_file = f"{self.dir}/driver_kafka"
    templating.dump_data_to_json(driver_kafka, json_file)
    cmd = f"cp {json_file} {benchmark_pod_name}-driver:/"
    self.pod_obj.exec_oc_cmd(cmd)

    # Update the workload yaml
    if not amq_workload_yaml:
        amq_workload_yaml = templating.load_yaml(constants.AMQ_WORKLOAD_YAML)
    yaml_file = f"{self.dir}/amq_workload.yaml"
    templating.dump_data_to_temp_yaml(amq_workload_yaml, yaml_file)
    cmd = f"cp {yaml_file} {benchmark_pod_name}-driver:/"
    self.pod_obj.exec_oc_cmd(cmd)

    self.benchmark = True

    # Run the benchmark
    if worker:
        cmd = f"bin/benchmark --drivers /driver_kafka --workers {worker} /amq_workload.yaml"
    else:
        cmd = "bin/benchmark --drivers /driver_kafka /amq_workload.yaml"
    log.info(f"Run benchmark and running command {cmd} inside the benchmark pod")

    if run_in_bg:
        executor = ThreadPoolExecutor(1)
        result = executor.submit(
            self.run_amq_workload,
            cmd,
            benchmark_pod_name,
            tiller_namespace,
            timeout,
        )
        return result

    pod_obj = get_pod_obj(
        name=f"{benchmark_pod_name}-driver", namespace=tiller_namespace
    )
    result = pod_obj.exec_cmd_on_pod(
        command=cmd, out_yaml_format=False, timeout=timeout
    )
    return result