def test_delete_rook_ceph_osd_deployment(self):
    osd_deployments = get_osd_deployments()
    deployment_obj = OCP(
        kind=constants.DEPLOYMENT, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    pod_obj = OCP(
        kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    for osd_deployment in osd_deployments:
        # Get rook-ceph-osd pod name associated with the deployment
        osd_deployment_name = osd_deployment.name
        old_osd_pod = get_pod_name_by_pattern(
            pattern=osd_deployment_name,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )[0]

        logger.info(f"Deleting OSD deployment: {osd_deployment_name}")
        try:
            deployment_obj.delete(resource_name=osd_deployment_name)
            deployment_obj.wait_for_resource(
                condition="0/1", resource_name=osd_deployment_name, column="READY"
            )
        except CommandFailed as err:
            if "NotFound" not in str(err):
                raise

        # Wait for new OSD deployment to be Ready
        deployment_obj.wait_for_resource(
            condition="1/1", resource_name=osd_deployment_name, column="READY"
        )

        # Check if a new OSD pod is created
        new_osd_pod = get_pod_name_by_pattern(
            pattern=osd_deployment_name,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )[0]
        assert old_osd_pod != new_osd_pod, "New OSD pod not created"

        # Check if new OSD pod is up and running
        logger.info(
            "Waiting for a new OSD pod to get created and reach Running state"
        )
        assert pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=new_osd_pod,
            column="STATUS",
        ), f"New OSD pod {new_osd_pod} is not in {constants.STATUS_RUNNING} state"

    # If clusterwide encryption is enabled, verify that the new OSDs are encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    assert ceph_health_check(delay=120, tries=50), "Ceph health check failed"
def validate_cluster_on_pvc():
    """
    Validate creation of PVCs for MON and OSD pods.
    Also validate that those PVCs are attached to the OCS pods

    Raises:
        AssertionError: If PVC is not mounted on one or more OCS pods

    """
    # Get the PVCs for selected label (MON/OSD)
    ns = config.ENV_DATA['cluster_namespace']
    ocs_pvc_obj = get_all_pvc_objs(namespace=ns)

    # Check all pvc's are in bound state
    pvc_names = []
    for pvc_obj in ocs_pvc_obj:
        if (pvc_obj.name.startswith(constants.DEFAULT_DEVICESET_PVC_NAME)
                or pvc_obj.name.startswith(constants.DEFAULT_MON_PVC_NAME)):
            assert pvc_obj.status == constants.STATUS_BOUND, (
                f"PVC {pvc_obj.name} is not Bound"
            )
            logger.info(f"PVC {pvc_obj.name} is in Bound state")
            pvc_names.append(pvc_obj.name)

    mon_pods = get_pod_name_by_pattern('rook-ceph-mon', ns)
    if not config.DEPLOYMENT.get('local_storage'):
        logger.info("Validating all mon pods have PVC")
        validate_ocs_pods_on_pvc(mon_pods, pvc_names)
    else:
        logger.debug(
            "Skipping validation if all mon pods have PVC because in LSO "
            "deployment we don't have mon pods backed by PVC"
        )

    logger.info("Validating all osd pods have PVC")
    osd_deviceset_pods = get_pod_name_by_pattern(
        'rook-ceph-osd-prepare-ocs-deviceset', ns
    )
    validate_ocs_pods_on_pvc(osd_deviceset_pods, pvc_names)

    osd_pods = get_pod_name_by_pattern('rook-ceph-osd', ns, filter='prepare')
    for ceph_pod in mon_pods + osd_pods:
        out = run_cmd(f'oc -n {ns} get pods {ceph_pod} -o yaml')
        out_yaml = yaml.safe_load(out)
        for vol in out_yaml['spec']['volumes']:
            if vol.get('persistentVolumeClaim'):
                claimName = vol.get('persistentVolumeClaim').get('claimName')
                logger.info(f"{ceph_pod} backed by pvc {claimName}")
                assert claimName in pvc_names, (
                    "Ceph Internal Volume not backed by PVC"
                )
def validate_messages_are_produced(
    self, namespace=constants.AMQ_NAMESPACE, value="10000", since_time=1800
):
    """
    Validates that all messages are sent by the producer pod

    Args:
        namespace (str): Namespace of the pod
        value (str): Number of messages to be sent
        since_time (int): Number of seconds within which the messages should be sent

    Raises exception on failures

    """
    # ToDo: Support multiple topics and users
    producer_pod_objs = [
        get_pod_obj(pod)
        for pod in get_pod_name_by_pattern("hello-world-produce", namespace)
    ]
    for pod in producer_pod_objs:
        for msg in TimeoutSampler(
            900, 30, self.validate_msg, pod.name, namespace, value, since_time
        ):
            if msg:
                break
    assert msg, "Producer did not send all messages"
    log.info("Producer sent all messages")
def validate_messages_are_consumed(
    self, namespace=constants.AMQ_NAMESPACE, value="10000", since_time=1800
):
    """
    Validates that all messages are received by the consumer pod

    Args:
        namespace (str): Namespace of the pod
        value (str): Number of messages to be received
        since_time (int): Number of seconds within which the messages should be received

    Raises exception on failures

    """
    # ToDo: Support multiple topics and users
    consumer_pod_objs = [
        get_pod_obj(pod)
        for pod in get_pod_name_by_pattern("hello-world-consumer", namespace)
    ]
    for pod in consumer_pod_objs:
        for msg in TimeoutSampler(
            900, 30, self.validate_msg, pod.name, namespace, value, since_time
        ):
            if msg:
                break
    assert msg, "Consumer didn't receive all messages"
    log.info("Consumer received all messages")
def get_couchbase_nodes(self):
    """
    Get nodes that contain a couchbase app pod

    Returns:
        list: List of nodes

    """
    app_pods_list = get_pod_name_by_pattern(
        "cb-example", constants.COUCHBASE_OPERATOR
    )
    app_pod_objs = list()
    for pod in app_pods_list:
        app_pod_objs.append(
            get_pod_obj(pod, namespace=constants.COUCHBASE_OPERATOR)
        )
    log.info("Create a list of nodes that contain a couchbase app pod")
    nodes_set = set()
    for pod in app_pod_objs:
        logging.info(
            f"pod {pod.name} located on "
            f"node {pod.get().get('spec').get('nodeName')}"
        )
        nodes_set.add(pod.get().get("spec").get("nodeName"))
    return list(nodes_set)
def validate_image_exists(namespace=None):
    """
    Validate image exists on registries path

    Args:
        namespace (str): Namespace where the images/builds are created

    Returns:
        image_list (str): Dir/Files/Images are listed in string format

    Raises:
        Exceptions if dir/folders not found

    """
    if not config.DEPLOYMENT.get('disconnected'):
        pod_list = get_pod_name_by_pattern(
            pattern="image-registry",
            namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE,
        )
        for pod_name in pod_list:
            if "cluster" not in pod_name:
                pod_obj = pod.get_pod_obj(
                    name=pod_name,
                    namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE,
                )
                return pod_obj.exec_cmd_on_pod(
                    command=f"find /registry/docker/registry/v2/repositories/{namespace}"
                )
def validate_image_exists(app="redis"):
    """
    Validate image exists on registries path

    Args:
        app (str): Label or application name

    Returns:
        image_list (str): Dir/Files/Images are listed in string format

    Raises:
        Exceptions if dir/folders not found

    """
    if not config.DEPLOYMENT.get("disconnected"):
        pod_list = get_pod_name_by_pattern(
            pattern="image-registry",
            namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE,
        )
        for pod_name in pod_list:
            if "cluster" not in pod_name:
                pod_obj = pod.get_pod_obj(
                    name=pod_name,
                    namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE,
                )
                return pod_obj.exec_cmd_on_pod(
                    command=f"find /registry/docker/registry/v2/repositories/openshift/{app}"
                )
def get_node_name_where_jenkins_pod_not_hosted(
    self, node_type=constants.WORKER_MACHINE, num_of_nodes=1
):
    """
    Get names of nodes that do not host a jenkins pod

    Args:
        node_type (str): The node type (e.g. worker, master)
        num_of_nodes (int): The number of nodes to be returned

    Returns:
        list: List of compute node names

    """
    if node_type == constants.MASTER_MACHINE:
        nodes_drain = [
            node.name for node in get_typed_nodes(
                node_type=node_type, num_of_nodes=num_of_nodes
            )
        ]
    elif node_type == constants.WORKER_MACHINE:
        pod_objs = []
        for project in self.projects:
            pod_names = get_pod_name_by_pattern(
                pattern='jenkins', namespace=project
            )
            pod_obj = [
                get_pod_obj(name=pod_name, namespace=project)
                for pod_name in pod_names
            ]
            pod_objs += pod_obj
        nodes_app_name = set(get_app_pod_running_nodes(pod_objs))
        nodes_worker_name = set(get_worker_nodes())
        nodes_drain = nodes_worker_name - nodes_app_name
    else:
        raise ValueError('The node type must be worker or master')
    return list(nodes_drain)[:num_of_nodes]
def test_monitoring_shutdown_mgr_pod(self, pods):
    """
    Monitoring backed by OCS: bring the mgr down (replicas: 0) for some time
    and check ceph related metrics
    """
    # Check ceph metrics available
    assert check_ceph_metrics_available(), (
        "failed to get results for some metrics before downscaling the mgr deployment to 0"
    )

    # Get the mgr pod name and mgr deployment
    oc_deployment = ocp.OCP(
        kind=constants.DEPLOYMENT, namespace=ROOK_CLUSTER_NAMESPACE
    )
    mgr_deployments = oc_deployment.get(selector=constants.MGR_APP_LABEL)["items"]
    mgr = mgr_deployments[0]["metadata"]["name"]
    pod_mgr_name = get_pod_name_by_pattern(
        pattern=mgr, namespace=ROOK_CLUSTER_NAMESPACE
    )

    log.info(f"Downscaling deployment {mgr} to 0")
    oc_deployment.exec_oc_cmd(f"scale --replicas=0 deployment/{mgr}")

    log.info(f"Wait for the mgr pod {pod_mgr_name[0]} to be deleted")
    oc_pod = ocp.OCP(kind=constants.POD, namespace=ROOK_CLUSTER_NAMESPACE)
    oc_pod.wait_for_delete(resource_name=pod_mgr_name[0])

    log.info(f"Upscaling deployment {mgr} back to 1")
    oc_deployment.exec_oc_cmd(f"scale --replicas=1 deployment/{mgr}")

    log.info("Waiting for the mgr pod to reach Running state")
    oc_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING, selector=constants.MGR_APP_LABEL
    )

    # Check ceph metrics available
    check_ceph_metrics_available_within_time()
def noobaa_running_node_restart(pod_name):
    """
    Restart the node on which the given noobaa pod is running

    Args:
        pod_name (str): Name of noobaa pod

    """
    nb_pod_obj = pod.get_pod_obj(
        get_pod_name_by_pattern(
            pattern=pod_name,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )[0],
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    nb_node_name = pod.get_pod_node(nb_pod_obj).name
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    nb_nodes = get_node_objs(node_names=nb_node_name)
    log.info(f"{pod_name} is running on {nb_node_name}")
    log.info(f"Restarting node: {nb_node_name}....")
    nodes.restart_nodes_by_stop_and_start(nodes=nb_nodes, force=True)

    # Validate nodes are up and running
    wait_for_nodes_status()
    ceph_health_check(tries=30, delay=60)
    helpers.wait_for_resource_state(
        nb_pod_obj, constants.STATUS_RUNNING, timeout=180
    )
def _deploy_es(self):
    """
    Deploying the Elasticsearch server

    """
    # Create a PVC for the elasticsearch server and wait until it is bound
    log.info("Creating 10 GiB PVC for the ElasticSearch cluster")
    try:
        self.pvc_obj = create_pvc(
            sc_name=self.args.get("sc") or constants.CEPHBLOCKPOOL_SC,
            namespace=self.namespace,
            pvc_name="elasticsearch-data-quickstart-es-default-0",
            access_mode=constants.ACCESS_MODE_RWO,
            size="10Gi",
        )

        # Make sure the PVC is bound, or delete it and raise an exception
        wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
    except ResourceWrongStatusException:
        log.error("The PVC could not be created")
        return False

    self.pvc_obj.reload()

    log.info("Deploy the ElasticSearch cluster")
    self.ocp.apply(self.crd)

    sample = TimeoutSampler(
        timeout=300,
        sleep=10,
        func=self._pod_is_found,
        pattern="quickstart-es-default",
    )
    if not sample.wait_for_func_status(True):
        log.error("The ElasticSearch pod deployment failed")
        return False

    self.espod = get_pod_name_by_pattern("quickstart-es-default", self.namespace)[0]
    log.info(f"The ElasticSearch pod {self.espod} started")

    es_pod = OCP(kind="pod", namespace=self.namespace)
    log.info("Waiting for ElasticSearch to run")
    if not es_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=self.espod,
        sleep=30,
        timeout=600,
    ):
        log.error("The ElasticSearch pod is not running!")
        return False
    else:
        log.info("ElasticSearch is ready!")
        return True
def image_pull_and_push(project_name, template, image="", pattern="", wait=True):
    """
    Pull and push images by running the oc new-app command

    Args:
        project_name (str): Name of the project
        template (str): Name of the template of the image
        image (str): Name of the image with tag
        pattern (str): Name of the build with the given pattern
        wait (bool): If true, wait until the image pull and push completes

    """
    ocp_obj = ocp.OCP(kind="template", namespace="openshift")
    try:
        ocp_obj.get(resource_name=template)
    except CommandFailed as cfe:
        if f'"{template}" not found' in str(cfe):
            logger.warning(f"Template {template} not found")
            template = "redis-ephemeral"
        else:
            raise
    if config.DEPLOYMENT.get("disconnected"):
        mirror_image(image=image)
    else:
        cmd = f"new-app --template={template} -n {project_name}"
        ocp_obj = ocp.OCP()
        ocp_obj.exec_oc_cmd(command=cmd, out_yaml_format=False)

    # Validate it completed
    if wait:
        if template == "redis-ephemeral":
            ocp_obj = ocp.OCP(kind=constants.POD, namespace=project_name)
            deploy_pod_name = get_pod_name_by_pattern(
                pattern="deploy", namespace=project_name
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_COMPLETED,
                resource_name=deploy_pod_name[0],
            )
        else:
            wait_time = 300
            logger.info(f"Wait for {wait_time} seconds for the build to come up")
            time.sleep(wait_time)
            build_list = get_build_name_by_pattern(
                pattern=pattern, namespace=project_name
            )
            if not build_list:
                raise Exception("Build is not created")
            build_obj = ocp.OCP(kind="Build", namespace=project_name)
            for build in build_list:
                build_obj.wait_for_resource(
                    condition="Complete", resource_name=build, timeout=900
                )
def _pod_is_found(self, pattern):
    """
    Boolean function which checks if a pod (by pattern) exists

    Args:
        pattern (str): the pattern of the pod to look for

    Returns:
        bool: True if the pod is found, otherwise False

    """
    return len(get_pod_name_by_pattern(pattern, self.namespace)) > 0
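# Minimal usage sketch (illustrative, not from the original sources): _pod_is_found
# returns a bool so it can be polled via TimeoutSampler until the pod shows up, as
# the _deploy_es and _deploy_data_dumper_client methods in this collection do. The
# pattern and timing values below are arbitrary examples.
sample = TimeoutSampler(
    timeout=300, sleep=10, func=self._pod_is_found, pattern="quickstart-es-default"
)
if not sample.wait_for_func_status(True):
    raise Exception("No pod matching the pattern appeared within the timeout")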
def get_pgbench_pods(self):
    """
    Get all pgbench pods

    Returns:
        list: pgbench pod objects list

    """
    return [
        get_pod_obj(pod, RIPSAW_NAMESPACE)
        for pod in get_pod_name_by_pattern("pgbench", RIPSAW_NAMESPACE)
    ]
def validate_cluster_on_pvc():
    """
    Validate creation of PVCs for MON and OSD pods.
    Also validate that those PVCs are attached to the OCS pods

    Raises:
        AssertionError: If PVC is not mounted on one or more OCS pods

    """
    # Get the PVCs for selected label (MON/OSD)
    ns = config.ENV_DATA['cluster_namespace']
    ocs_pvc_obj = get_all_pvc_objs(namespace=ns)

    # Check all pvc's are in bound state
    pvc_names = []
    for pvc_obj in ocs_pvc_obj:
        if (pvc_obj.name.startswith(constants.DEFAULT_DEVICESET_PVC_NAME)
                or pvc_obj.name.startswith(constants.DEFAULT_MON_PVC_NAME)):
            assert pvc_obj.status == constants.STATUS_BOUND, (
                f"PVC {pvc_obj.name} is not Bound"
            )
            logger.info(f"PVC {pvc_obj.name} is in Bound state")
            pvc_names.append(pvc_obj.name)

    mon_pods = get_pod_name_by_pattern('rook-ceph-mon', ns)
    osd_pods = get_pod_name_by_pattern('rook-ceph-osd', ns, filter='prepare')
    assert len(mon_pods) + len(osd_pods) == len(pvc_names), (
        "Not enough PVC's available for all Ceph Pods"
    )
    for ceph_pod in mon_pods + osd_pods:
        out = run_cmd(f'oc -n {ns} get pods {ceph_pod} -o yaml')
        out_yaml = yaml.safe_load(out)
        for vol in out_yaml['spec']['volumes']:
            if vol.get('persistentVolumeClaim'):
                claimName = vol.get('persistentVolumeClaim').get('claimName')
                logger.info(f"{ceph_pod} backed by pvc {claimName}")
                assert claimName in pvc_names, (
                    "Ceph Internal Volume not backed by PVC"
                )
def get_jenkins_deploy_pods(self, namespace):
    """
    Get all jenkins deploy pods

    Args:
        namespace (str): get pods in namespace

    Returns:
        pod_objs (list): jenkins deploy pod objects list

    """
    return [
        get_pod_obj(pod, namespace=namespace)
        for pod in get_pod_name_by_pattern('deploy', namespace=namespace)
    ]
def respin_couchbase_app_pod(self):
    """
    Respin the couchbase app pod

    Returns:
        pod status

    """
    app_pod_list = get_pod_name_by_pattern(
        'cb-example', constants.COUCHBASE_OPERATOR
    )
    app_pod = app_pod_list[random.randint(0, len(app_pod_list) - 1)]
    logging.info(f"respin pod {app_pod}")
    app_pod_obj = get_pod_obj(app_pod, namespace=constants.COUCHBASE_OPERATOR)
    app_pod_obj.delete(wait=True, force=False)
    wait_for_resource_state(
        resource=app_pod_obj, state=constants.STATUS_RUNNING, timeout=300
    )
def _deploy_data_dumper_client(self):
    """
    Deploying an elasticsearch client pod with a utility which dumps all the data
    from the server to a .tgz file

    """
    log.info("Deploying the es client for dumping all data")
    self.ocp.apply(self.dumper_file)

    sample = TimeoutSampler(
        timeout=300, sleep=10, func=self._pod_is_found, pattern="es-dumper"
    )
    if not sample.wait_for_func_status(True):
        raise Exception("Dumper pod deployment Failed")
    self.dump_pod = get_pod_name_by_pattern("es-dumper", self.namespace)[0]
    log.info(f"The dumper client pod {self.dump_pod} is ready!")
def _deploy_es(self):
    """
    Deploying the Elasticsearch server

    """
    # Create a PVC for the elasticsearch server and wait until it is bound
    log.info("Creating 10 GiB PVC for the ElasticSearch cluster")
    self.pvc_obj = create_pvc(
        sc_name=constants.CEPHBLOCKPOOL_SC,
        namespace=self.namespace,
        pvc_name="elasticsearch-data-quickstart-es-default-0",
        access_mode=constants.ACCESS_MODE_RWO,
        size="10Gi",
    )
    wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
    self.pvc_obj.reload()

    log.info("Deploy the ElasticSearch cluster")
    self.ocp.apply(self.crd)

    sample = TimeoutSampler(
        timeout=300,
        sleep=10,
        func=self._pod_is_found,
        pattern="quickstart-es-default",
    )
    if not sample.wait_for_func_status(True):
        self.cleanup()
        raise Exception("The ElasticSearch pod deployment failed")

    self.espod = get_pod_name_by_pattern("quickstart-es-default", self.namespace)[0]
    log.info(f"The ElasticSearch pod {self.espod} started")

    es_pod = OCP(kind="pod", namespace=self.namespace)
    log.info("Waiting for ElasticSearch to run")
    assert es_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=self.espod,
        sleep=30,
        timeout=600,
    )
    log.info("ElasticSearch is ready!")
def test_delete_local_volume_sym_link(self):
    """
    Delete sym link on LSO Cluster

    """
    # Get rook-ceph-crashcollector pod objects
    crashcollector_pods = get_pod_name_by_pattern(
        pattern="rook-ceph-crashcollector", namespace=ROOK_CLUSTER_NAMESPACE
    )
    crashcollector_pods_objs = []
    for crashcollector_pod in crashcollector_pods:
        crashcollector_pods_objs.append(
            get_pod_obj(name=crashcollector_pod, namespace=ROOK_CLUSTER_NAMESPACE)
        )

    # Get Node object
    node_obj = get_pod_node(pod_obj=crashcollector_pods_objs[0])

    # Get Sym link
    osd_pvcs = get_deviceset_pvcs()
    pv_name = osd_pvcs[0].data["spec"]["volumeName"]
    ocp_obj = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE, kind=constants.PV)
    pv_obj = ocp_obj.get(resource_name=pv_name)
    path = pv_obj["spec"]["local"]["path"]

    log.info("Delete sym link")
    oc_cmd = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE)
    cmd = f"rm -rfv {path}"
    oc_cmd.exec_oc_debug_cmd(node=node_obj.name, cmd_list=[cmd])

    log.info("Waiting for rook-ceph-crashcollector pods to reach Running state")
    for crashcollector_pods_obj in crashcollector_pods_objs:
        wait_for_resource_state(
            resource=crashcollector_pods_obj, state=constants.STATUS_RUNNING
        )

    # Check all OCS pods status, they should be in Running or Completed state
    wait_for_storage_pods()

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
def collect_benchmark_logs(self):
    """
    Collecting the test log from all benchmark pods

    """
    # Getting full list of benchmark clients
    self.full_client_list = get_pod_name_by_pattern(
        self.client_pod_name, benchmark_operator.BMO_NAME
    )

    # Collecting logs from each pod
    for clpod in self.full_client_list:
        test_logs = self.pod_obj.exec_oc_cmd(f"logs {clpod}", out_yaml_format=False)
        log_file_name = f"{self.full_log_path}/{clpod}-pod.log"
        try:
            with open(log_file_name, "w") as f:
                f.write(test_logs)
            log.info(f"The Test log can be found at : {log_file_name}")
        except Exception:
            log.warning(f"Cannot write the log to the file {log_file_name}")
    log.info("Logs were collected successfully from all client pods")
def measure_obc_deletion_time(obc_name_list, timeout=60):
    """
    Measure OBC deletion time

    Args:
        obc_name_list (list): List of obc names to measure deletion time
        timeout (int): Wait time in seconds before collecting log

    Returns:
        obc_dict (dict): Dictionary of obcs and deletion time in seconds

    """
    # Get obc deletion logs
    nb_pod_name = get_pod_name_by_pattern("noobaa-operator-")
    nb_pod_log = pod.get_pod_logs(
        pod_name=nb_pod_name[0], namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    nb_pod_log = nb_pod_log.split("\n")

    loop_cnt = 0
    while True:
        no_data = list()
        for obc_name in obc_name_list:
            start = [
                i for i in nb_pod_log
                if re.search(f"removing ObjectBucket.*{obc_name}", i)
            ]
            end = [
                i for i in nb_pod_log
                if re.search(f"ObjectBucket deleted.*{obc_name}", i)
            ]
            if not start or not end:
                no_data.append(obc_name)
        if no_data:
            time.sleep(timeout)
            nb_pod_name = get_pod_name_by_pattern("noobaa-operator-")
            nb_pod_log = pod.get_pod_logs(
                pod_name=nb_pod_name[0],
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            nb_pod_log = nb_pod_log.split("\n")
            loop_cnt += 1
            if loop_cnt >= 10:
                log.info("Waited for more than 10 mins but still no data")
                raise UnexpectedBehaviour(
                    f"There is no obc deletion data in noobaa-operator logs for {no_data}"
                )
            continue
        else:
            break

    obc_dict = dict()
    this_year = str(datetime.datetime.now().year)
    for obc_name in obc_name_list:
        # Extract obc deletion start time
        start_item = [
            i for i in nb_pod_log
            if re.search(f"removing ObjectBucket.*{obc_name}", i)
        ]
        mon_day = " ".join(start_item[0].split(" ")[0:2])
        start = f"{this_year} {mon_day}"
        dt_start = datetime.datetime.strptime(start, "%Y I%m%d %H:%M:%S.%f")

        # Extract obc deletion end time
        end_item = [
            i for i in nb_pod_log
            if re.search(f"ObjectBucket deleted.*{obc_name}", i)
        ]
        mon_day = " ".join(end_item[0].split(" ")[0:2])
        end = f"{this_year} {mon_day}"
        dt_end = datetime.datetime.strptime(end, "%Y I%m%d %H:%M:%S.%f")

        total = dt_end - dt_start
        log.info(f"{obc_name}: {total.total_seconds()} sec")
        obc_dict[obc_name] = total.total_seconds()

    return obc_dict
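# Illustrative sketch (not from the original sources): the strptime format
# "%Y I%m%d %H:%M:%S.%f" used above implies klog-style operator log lines whose
# first two fields look like "I0523 10:15:30.123456". The sample line below is
# made up purely to show how prepending the current year makes such a line
# parseable with that format string.
import datetime

sample_line = "I0523 10:15:30.123456 1 obc_controller.go] removing ObjectBucket obc-1"
mon_day = " ".join(sample_line.split(" ")[0:2])
parsed = datetime.datetime.strptime(f"2023 {mon_day}", "%Y I%m%d %H:%M:%S.%f")
print(parsed)  # 2023-05-23 10:15:30.123456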
def test_rgw_kafka_notifications(self, bucket_factory):
    """
    Test to verify rgw kafka notifications

    """
    # Get sc
    sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

    # Deploy amq cluster
    self.amq.setup_amq_cluster(sc.name)

    # Create topic
    self.kafka_topic = self.amq.create_kafka_topic()

    # Create Kafkadrop pod
    # Note: the original text assigned self.kafkadrop_pod twice; the middle return
    # value is assumed here to be the kafkadrop service handle.
    (
        self.kafkadrop_pod,
        self.kafkadrop_svc,
        self.kafkadrop_route,
    ) = self.amq.create_kafkadrop()

    # Get the kafkadrop route
    kafkadrop_host = self.kafkadrop_route.get().get("spec").get("host")

    # Create bucket
    bucketname = bucket_factory(amount=1, interface="RGW-OC")[0].name

    # Get RGW credentials
    rgw_obj = RGW()
    rgw_endpoint, access_key, secret_key = rgw_obj.get_credentials()

    # Clone notify repo
    notify_path = clone_notify()

    # Initialise to put objects
    data = "A random string data to write on created rgw bucket"
    obc_obj = OBC(bucketname)
    s3_resource = boto3.resource(
        "s3",
        verify=retrieve_verification_mode(),
        endpoint_url=rgw_endpoint,
        aws_access_key_id=obc_obj.access_key_id,
        aws_secret_access_key=obc_obj.access_key,
    )
    s3_client = s3_resource.meta.client

    # Initialize notify command to run
    notify_cmd = (
        f"python {notify_path} -e {rgw_endpoint} -a {obc_obj.access_key_id} "
        f"-s {obc_obj.access_key} -b {bucketname} "
        f"-ke {constants.KAFKA_ENDPOINT} -t {self.kafka_topic.name}"
    )
    log.info(f"Running cmd {notify_cmd}")

    # Put objects to bucket
    assert s3_client.put_object(
        Bucket=bucketname, Key="key-1", Body=data
    ), "Failed: Put object: key-1"
    exec_cmd(notify_cmd)

    # Validate rgw logs notifications are sent and no errors are seen
    pattern = "ERROR: failed to create push endpoint"
    rgw_pod_obj = get_rgw_pods()
    rgw_log = get_pod_logs(pod_name=rgw_pod_obj[0].name, container="rgw")
    assert re.search(pattern=pattern, string=rgw_log) is None, (
        f"Error: {pattern} msg found in the rgw logs. "
        f"Validate {pattern} found on rgw logs and also "
        f"rgw bucket notification is working correctly"
    )
    assert s3_client.put_object(
        Bucket=bucketname, Key="key-2", Body=data
    ), "Failed: Put object: key-2"
    exec_cmd(notify_cmd)

    # Validate messages are received on the Kafka side using a curl command
    # A temporary way to check from the Kafka side, need to check from the UI
    curl_command = (
        f"curl -X GET {kafkadrop_host}/topic/{self.kafka_topic.name} "
        "-H 'content-type: application/vnd.kafka.json.v2+json'"
    )
    json_output = run_cmd(cmd=curl_command)
    new_string = json_output.split()
    messages = new_string[new_string.index("messages</td>") + 1]
    if messages.find("1") == -1:
        raise Exception(
            "Error: Messages are not received from the Kafka side. "
            "RGW bucket notification is not working as expected."
        )

    # Validate the timestamp events
    ocs_version = config.ENV_DATA["ocs_version"]
    if Version.coerce(ocs_version) >= Version.coerce("4.8"):
        cmd = (
            f"bin/kafka-console-consumer.sh --bootstrap-server {constants.KAFKA_ENDPOINT} "
            f"--topic {self.kafka_topic.name} --from-beginning --timeout-ms 20000"
        )
        pod_list = get_pod_name_by_pattern(
            pattern="my-cluster-zookeeper", namespace=constants.AMQ_NAMESPACE
        )
        zookeeper_obj = get_pod_obj(
            name=pod_list[0], namespace=constants.AMQ_NAMESPACE
        )
        event_obj = zookeeper_obj.exec_cmd_on_pod(command=cmd)
        log.info(f"Event obj: {event_obj}")
        event_time = event_obj.get("Records")[0].get("eventTime")
        format_string = "%Y-%m-%dT%H:%M:%S.%fZ"
        try:
            datetime.strptime(event_time, format_string)
        except ValueError as ef:
            log.error(
                f"Timestamp event {event_time} doesn't match the pattern {format_string}"
            )
            raise ef
        log.info(
            f"Timestamp event {event_time} matches the pattern {format_string}"
        )
def create_cloned_pvc_and_verify_data(
    self,
    pgsql,
    postgres_pvcs_obj,
    postgres_pods_obj,
    pvc_clone_factory,
    sc_name=None,
):
    for i in range(3):
        # Create clone of pgsql pvc
        log.info("Creating clone of the Postgres PVCs")
        cloned_pvcs = [
            pvc_clone_factory(
                pvc_obj, volume_mode=VOLUME_MODE_FILESYSTEM, storageclass=sc_name
            )
            for pvc_obj in postgres_pvcs_obj
        ]
        log.info("Created clone of the PVCs and all cloned PVCs are in Bound state")

        # Attach to new postgres pod
        self.pgsql_obj_list = pgsql.attach_pgsql_pod_to_claim_pvc(
            pvc_objs=cloned_pvcs,
            postgres_name=f"postgres-cloned-{i}",
            run_benchmark=False,
        )
        self.sset_list.extend(self.pgsql_obj_list)

        # Get usage of pgsql pvc
        parent_pods_obj = pgsql.get_postgres_used_file_space(postgres_pods_obj)

        # Validate cloned pvcs file space matches with parent
        cloned_pods_list = get_pod_name_by_pattern(
            pattern=f"postgres-cloned-{i}", namespace=BMO_NAME
        )
        cloned_pods_obj = [
            get_pod_obj(name=pods, namespace=BMO_NAME)
            for pods in cloned_pods_list
        ]
        cloned_obj = pgsql.get_postgres_used_file_space(cloned_pods_obj)
        for pod_obj in parent_pods_obj:
            if pod_obj.filespace != cloned_obj[parent_pods_obj.index(pod_obj)].filespace:
                # ToDo: Before clone need to check data is synced
                if abs(
                    int(pod_obj.filespace.strip("M"))
                    - int(cloned_obj[parent_pods_obj.index(pod_obj)].filespace.strip("M"))
                ) >= 3:
                    raise Exception(
                        f"Parent pvc {pod_obj.name} used file space is {pod_obj.filespace}. "
                        f"And for cloned pvc {cloned_obj[parent_pods_obj.index(pod_obj)].name} "
                        f"used file space is {cloned_obj[parent_pods_obj.index(pod_obj)].filespace}"
                    )
                log.warning(
                    f"Parent pvc {pod_obj.name} used file space is {pod_obj.filespace}. "
                    f"And for cloned pvc {cloned_obj[parent_pods_obj.index(pod_obj)].name} "
                    f"used file space is {cloned_obj[parent_pods_obj.index(pod_obj)].filespace}"
                )
        log.info("All cloned PVC matches the parent PVC data")

        # Run benchmark on parent PVC
        pgsql.create_pgbench_benchmark(
            replicas=1, pgbench_name=f"pgbench-{i}", wait=False
        )

        # Wait till pgbench client pods are up
        wait_time = 180
        log.info(f"Waiting {wait_time} seconds for pgbench client pods to be up")
        time.sleep(wait_time)

        # Wait for pg_bench pod to initialize and complete
        log.info("Checking all pgbench benchmark reached Completed state")
        pgsql.wait_for_pgbench_status(status=STATUS_COMPLETED, timeout=1800)

    return cloned_pvcs
def test_replication_with_disruptions(
    self,
    awscli_pod_session,
    mcg_obj_session,
    cld_mgr,
    bucket_factory,
    source_bucketclass,
    target_bucketclass,
    test_directory_setup,
    nodes,
):
    # check uni bucket replication from multi (aws+azure) namespace bucket
    # to s3-compatible namespace bucket
    target_bucket_name = bucket_factory(bucketclass=target_bucketclass)[0].name
    replication_policy = ("basic-replication-rule", target_bucket_name, None)
    source_bucket_name = bucket_factory(
        bucketclass=source_bucketclass, replication_policy=replication_policy
    )[0].name
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        source_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=5,
        pattern="first-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Uni-directional bucket replication working as expected")

    # change from uni-directional to bi-directional replication policy
    logger.info("Changing the replication policy from uni to bi-directional!")
    bi_replication_policy_dict = {
        "spec": {
            "additionalConfig": {
                "replicationPolicy": json.dumps(
                    [
                        {
                            "rule_id": "basic-replication-rule-2",
                            "destination_bucket": source_bucket_name,
                        }
                    ]
                )
            }
        }
    }
    OCP(
        namespace=config.ENV_DATA["cluster_namespace"],
        kind="obc",
        resource_name=target_bucket_name,
    ).patch(params=json.dumps(bi_replication_policy_dict), format_type="merge")
    logger.info(
        "Patch ran successfully! Changed the replication policy from uni to bi-directional"
    )

    # write objects to the second bucket and see if they are replicated to the other
    logger.info("Checking if bi-directional replication works!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=3,
        pattern="second-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Bi-directional bucket replication working as expected")

    # delete all the s3-compatible namespace bucket's objects and then recover them
    # from the other namespace bucket on write
    logger.info("Checking replication when one of the bucket's objects are deleted!!")
    try:
        mcg_obj_session.s3_resource.Bucket(
            target_bucket_name
        ).objects.all().delete()
    except CommandFailed as e:
        logger.error(f"[Error] while deleting objects: {e}")
    if len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name)) != 0:
        assert False, (
            f"[Error] Unexpectedly objects were not deleted from {target_bucket_name}"
        )
    logger.info("All the objects in RGW namespace buckets are deleted!!!")

    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="third-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("All the objects retrieved back to s3-compatible bucket on new write!!")

    # restart RGW pods and then see if object sync still works
    logger.info("Checking if the replication works when there are RGW pod restarts!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fourth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    pod_names = get_pod_name_by_pattern(
        "rgw", namespace=config.ENV_DATA["cluster_namespace"]
    )
    pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
    delete_pods(pod_objs=pod_objs)
    wait_for_pods_to_be_running(
        pod_names=pod_names, namespace=config.ENV_DATA["cluster_namespace"]
    )
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Object sync works after the RGW pod restarted!!")

    # write some objects to any of the buckets, followed by an immediate cluster restart
    logger.info("Checking replication when there is a cluster reboot!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fifth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    node_list = get_worker_nodes()
    node_objs = get_node_objs(node_list)
    nodes.restart_nodes(node_objs, timeout=500)
    retry(
        (
            CommandFailed,
            TimeoutError,
            AssertionError,
            ResourceWrongStatusException,
        ),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity(tries=400))
    wait_for_pods_to_be_running(
        namespace=config.ENV_DATA["cluster_namespace"], timeout=800
    )
    logger.info("Nodes rebooted successfully!!")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Objects sync works even when the cluster is rebooted")
def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
    """
    Test cluster recovery from disk deletion from the platform side.
    Based on documented procedure detailed in
    https://bugzilla.redhat.com/show_bug.cgi?id=1823183

    """
    logger.info("Picking a PV to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name

    # Get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs
        if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod for osd_pod in osd_pods
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

    # Get the node that has the OSD pod running on it
    logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on it")
    osd_node = get_pod_node(osd_pod)
    osd_prepare_pods = get_osd_prepare_pods()
    osd_prepare_pod = [
        pod for pod in osd_prepare_pods
        if pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    osd_prepare_job_name = (
        osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
    )
    osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        deployment for deployment in get_osd_deployments()
        if deployment.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp.OCP().exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    logger.info(f"Executing OSD removal job on OSD-{osd_id}")
    osd_removal_job_yaml = ocp.OCP(
        namespace=config.ENV_DATA["cluster_namespace"]
    ).exec_oc_cmd(f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml")
    osd_removal_job = OCS(**osd_removal_job_yaml)
    osd_removal_job.create(do_reload=False)

    # Get ocs-osd-removal pod name
    logger.info("Getting the ocs-osd-removal pod name")
    osd_removal_pod_name = get_pod_name_by_pattern(f"ocs-osd-removal-{osd_id}")[0]
    osd_removal_pod_obj = get_pod_obj(
        osd_removal_pod_name, namespace="openshift-storage"
    )
    osd_removal_pod_obj.ocp.wait_for_resource(
        condition=constants.STATUS_COMPLETED, resource_name=osd_removal_pod_name
    )

    # Verify OSD removal from the ocs-osd-removal pod logs
    logger.info(f"Verifying removal of OSD from {osd_removal_pod_name} pod logs")
    logs = get_pod_logs(osd_removal_pod_name)
    pattern = f"purged osd.{osd_id}"
    assert re.search(pattern, logs)

    # Delete the OSD prepare job
    logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
    osd_prepare_job.delete()
    osd_prepare_job.ocp.wait_for_delete(
        resource_name=osd_prepare_job_name, timeout=120
    )

    # Delete the OSD PVC
    osd_pvc_name = osd_pvc.name
    logger.info(f"Deleting OSD PVC {osd_pvc_name}")
    osd_pvc.delete()
    osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

    # Delete the OSD deployment
    logger.info(f"Deleting OSD deployment {osd_deployment_name}")
    osd_deployment.delete()
    osd_deployment.ocp.wait_for_delete(
        resource_name=osd_deployment_name, timeout=120
    )

    # Delete PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    # Delete the rook ceph operator pod to trigger reconciliation
    rook_operator_pod = get_operator_pods()[0]
    logger.info(f"Deleting Rook Ceph operator pod {rook_operator_pod.name}")
    rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
    osd_removal_job.delete()
    osd_removal_job.ocp.wait_for_delete(resource_name=f"ocs-osd-removal-{osd_id}")

    timeout = 600
    # Wait for OSD PVC to get created and reach Bound state
    logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )

    # Wait for OSD pod to get created and reach Running state
    logger.info("Waiting for a new OSD pod to get created and reach Running state")
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # Validate cluster is still functional
    self.sanity_helpers.health_check(tries=80)
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
def test_pvc_clone(self, pgsql_factory_fixture, pvc_clone_factory):
    """
    1. Deploy PGSQL workload
    2. Create multiple clones of the same PVC when the PVC usage is different
    3. Attach a new pgsql pod to each clone
    4. Create pgbench benchmark against the new pgsql pod
    """
    # Deploy PGSQL workload
    log.info("Deploying pgsql workloads")
    pgsql = pgsql_factory_fixture(replicas=3, clients=3, transactions=600)

    # Get postgres pvcs obj list
    postgres_pvcs_obj = pgsql.get_postgres_pvc()

    # Get postgres pods obj list
    postgres_pods_obj = pgsql.get_postgres_pods()

    self.sset_list = []
    for i in range(3):
        # Create clone of pgsql pvc
        log.info("Creating clone of the Postgres PVCs")
        cloned_pvcs = [
            pvc_clone_factory(pvc_obj, volume_mode=VOLUME_MODE_FILESYSTEM)
            for pvc_obj in postgres_pvcs_obj
        ]
        log.info("Created clone of the PVCs and all cloned PVCs are in Bound state")

        # Attach to new postgres pod
        self.pgsql_obj_list = pgsql.attach_pgsql_pod_to_claim_pvc(
            pvc_objs=cloned_pvcs,
            postgres_name=f"postgres-cloned-{i}",
            run_benchmark=False,
        )
        self.sset_list.extend(self.pgsql_obj_list)

        # Get usage of pgsql pvc
        parent_pods_obj = pgsql.get_postgres_used_file_space(postgres_pods_obj)

        # Validate cloned pvcs file space matches with parent
        cloned_pods_list = get_pod_name_by_pattern(
            pattern=f"postgres-cloned-{i}", namespace=RIPSAW_NAMESPACE
        )
        cloned_pods_obj = [
            get_pod_obj(name=pods, namespace=RIPSAW_NAMESPACE)
            for pods in cloned_pods_list
        ]
        cloned_obj = pgsql.get_postgres_used_file_space(cloned_pods_obj)
        for pod_obj in parent_pods_obj:
            if pod_obj.filespace != cloned_obj[parent_pods_obj.index(pod_obj)].filespace:
                # ToDo: Before clone need to check data is synced
                if abs(
                    int(pod_obj.filespace.strip("M"))
                    - int(cloned_obj[parent_pods_obj.index(pod_obj)].filespace.strip("M"))
                ) >= 2:
                    raise Exception(
                        f"Parent pvc {pod_obj.name} used file space is {pod_obj.filespace}. "
                        f"And for cloned pvc {cloned_obj[parent_pods_obj.index(pod_obj)].name} "
                        f"used file space is {cloned_obj[parent_pods_obj.index(pod_obj)].filespace}"
                    )
                log.warning(
                    f"Parent pvc {pod_obj.name} used file space is {pod_obj.filespace}. "
                    f"And for cloned pvc {cloned_obj[parent_pods_obj.index(pod_obj)].name} "
                    f"used file space is {cloned_obj[parent_pods_obj.index(pod_obj)].filespace}"
                )
        log.info("All cloned PVC matches the parent PVC data")

        # Run benchmark on parent PVC
        pgsql.create_pgbench_benchmark(
            replicas=3, pgbench_name=f"pgbench-{i}", wait=False
        )

        # Wait till pgbench client pods are up
        wait_time = 180
        log.info(f"Waiting {wait_time} seconds for pgbench client pods to be up")
        time.sleep(wait_time)

        # Wait for pg_bench pod to initialize and complete
        log.info("Checking all pgbench benchmark reached Completed state")
        pgsql.wait_for_pgbench_status(status=STATUS_COMPLETED, timeout=1800)