def wait_for_cluster_connectivity(tries=200, delay=3):
    """
    Wait for the cluster to be reachable

    Args:
        tries (int): The number of retries
        delay (int): The delay in seconds between retries

    Returns:
        bool: True if cluster is reachable, False otherwise

    Raises:
        CommandFailed: In case the cluster is unreachable

    """
    service = OCP()
    log.info("Waiting for cluster connectivity")
    return retry(CommandFailed, tries=tries, delay=delay, backoff=1)(service.get)()
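# Note: `retry` here is a decorator factory: retry(exceptions, tries=..., delay=...)
# returns a decorator that is applied to the callable, and the call arguments go to
# the returned wrapper. A minimal sketch of that calling convention, assuming the
# ocs-ci layout for the imports (`flaky_get` is a hypothetical helper):
from ocs_ci.ocs.exceptions import CommandFailed
from ocs_ci.utility.retry import retry


def flaky_get(resource_name):
    """Hypothetical stand-in for a call that may raise CommandFailed transiently."""
    ...


# Correct: decorate the callable first, then invoke the returned wrapper.
retry(CommandFailed, tries=5, delay=10)(flaky_get)("my-resource")

# Incorrect: this calls flaky_get immediately and hands its *result* to the
# decorator, so nothing is ever retried.
# retry(CommandFailed, tries=5, delay=10)(flaky_get("my-resource"))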
def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node):
    """
    Test couchbase workload with node reboot
    """
    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)
    get_node_resource_utilization_from_adm_top(node_type="master", print_table=True)

    if pod_name_of_node == "couchbase":
        node_list = self.cb.get_couchbase_nodes()
    elif pod_name_of_node == "osd":
        node_list = get_osd_running_nodes()
    elif pod_name_of_node == "master":
        master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

    # Restart relevant node
    if pod_name_of_node == "master":
        nodes.restart_nodes(master_node, wait=False)
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
    else:
        restart_node = get_node_objs(
            node_list[random.randint(0, len(node_list) - 1)]
        )
        nodes.restart_nodes(restart_node)

    # Validate all nodes and services are in READY state and up
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(wait_for_nodes_status)(timeout=1800)
    bg_handler = flowtest.BackgroundOps()
    bg_ops = [self.cb.result]
    retry(CommandFailed, tries=60, delay=15)(bg_handler.wait_for_bg_operations)(
        bg_ops, timeout=3600
    )
    self.sanity_helpers.health_check(tries=40)
def measure_stop_worker_nodes(request, measurement_dir, nodes):
    """
    Stop worker nodes that don't run the MGR pod (so that alerts are
    triggered correctly), measure the time when they were stopped and
    monitor alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for
            stopping worker nodes

    """
    mgr_pod = pod.get_mgr_pods()[0]
    mgr_node = pod.get_pod_node(mgr_pod)
    test_nodes = [
        worker_node
        for worker_node in get_nodes(node_type=constants.WORKER_MACHINE)
        if worker_node.name != mgr_node.name
    ]

    def stop_nodes():
        """
        Turn off test nodes for 5 minutes.

        Returns:
            list: Names of nodes that were turned down

        """
        # run_time of operation
        run_time = 60 * 5
        nonlocal test_nodes
        node_names = [node.name for node in test_nodes]
        logger.info(f"Turning off nodes {node_names}")
        nodes.stop_nodes(nodes=test_nodes)
        # Validate node reached NotReady state
        wait_for_nodes_status(node_names=node_names, status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node_names

    def finalizer():
        nodes.restart_nodes_by_stop_and_start_teardown()
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    request.addfinalizer(finalizer)

    test_file = os.path.join(measurement_dir, "measure_stop_nodes.json")
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 3 extra minutes
        measured_op = measure_operation(stop_nodes, test_file, minimal_time=60 * 8)
    else:
        measured_op = measure_operation(stop_nodes, test_file)

    logger.info("Turning on nodes")
    try:
        nodes.start_nodes(nodes=test_nodes)
    except CommandFailed:
        logger.warning(
            "Nodes were not found: they were probably recreated. "
            "Check ceph health below"
        )

    # Validate all nodes are in READY state and up
    retry((CommandFailed, ResourceWrongStatusException), tries=60, delay=15)(
        wait_for_nodes_status
    )(timeout=900)

    # wait for ceph to return into HEALTH_OK state after mgr deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
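# For context, `measure_operation` runs the given callable, persists `start`/`stop`
# timestamps (and any alerts triggered meanwhile) into the given JSON file, and
# returns the measurement dict. A minimal sketch of reusing it for a different
# disruption, assuming only the signature visible above (`measure_restart_mgr` and
# its operation are hypothetical, not part of ocs-ci):
def measure_restart_mgr(measurement_dir):
    """Hypothetical measurement mirroring the pattern of measure_stop_worker_nodes."""

    def restart_mgr():
        # Hypothetical time-bounded operation; returns what was disrupted
        mgr_pod = pod.get_mgr_pods()[0]
        mgr_pod.delete()
        return [mgr_pod.name]

    test_file = os.path.join(measurement_dir, "measure_restart_mgr.json")
    # Signature assumed from the call in measure_stop_worker_nodes above
    return measure_operation(restart_mgr, test_file)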
def deploy_ocs(self):
    """
    Handle OCS deployment, since OCS deployment steps are common to any
    platform, implementing OCS deployment here in base class.
    """
    ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
    try:
        ceph_cluster.get().get('items')[0]
        logger.warning("OCS cluster already exists")
        return
    except (IndexError, CommandFailed):
        logger.info("Running OCS basic installation")
    self.deploy_ocs_via_operator()
    pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
    cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
    # Check for Ceph pods
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mon',
        resource_count=3, timeout=600
    )
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mgr', timeout=600
    )
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-osd',
        resource_count=3, timeout=600
    )

    # validate ceph mon/osd volumes are backed by pvc
    validate_cluster_on_pvc()

    # validate PDB creation of MON, MDS, OSD pods
    validate_pdb_creation()

    # Creating toolbox pod
    setup_ceph_toolbox()
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools',
        resource_count=1, timeout=600
    )

    # Check for CephFilesystem creation in ocp
    cfs_data = cfs.get()
    cfs_name = cfs_data['items'][0]['metadata']['name']
    if helpers.validate_cephfilesystem(cfs_name):
        logger.info("MDS deployment is successful!")
        defaults.CEPHFILESYSTEM_NAME = cfs_name
    else:
        logger.error("MDS deployment Failed! Please check logs!")

    # Change monitoring backend to OCS
    if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
        'persistent-monitoring'
    ):
        sc = helpers.default_storage_class(
            interface_type=constants.CEPHBLOCKPOOL
        )
        # Get the list of monitoring pods
        pods_list = get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus', 'alertmanager']
        )
        # Create configmap cluster-monitoring-config and reconfigure
        # storage class and telemeter server (if the url is specified in a
        # config file)
        create_configmap_cluster_monitoring_pod(
            sc_name=sc.name,
            telemeter_server_url=config.ENV_DATA.get("telemeter_server_url")
        )
        # Take some time to respin the pod
        waiting_time = 45
        logger.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
        # Validate the pods are respinned and in running state
        retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(
            validate_pods_are_respinned_and_running_state
        )(pods_list)
        # Validate the pvc is created on monitoring pods
        validate_pvc_created_and_bound_on_monitoring_pods()
        # Validate the pvc are mounted on pods
        retry((CommandFailed, AssertionError), tries=3, delay=15)(
            validate_pvc_are_mounted_on_monitoring_pods
        )(pods_list)
    elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
        "telemeter_server_url"
    ):
        # Create configmap cluster-monitoring-config to reconfigure
        # telemeter server url when 'persistent-monitoring' is False
        create_configmap_cluster_monitoring_pod(
            telemeter_server_url=config.ENV_DATA["telemeter_server_url"]
        )

    # Change registry backend to OCS CEPHFS RWX PVC
    registry.change_registry_backend_to_ocs()

    # Verify health of ceph cluster
    # TODO: move destroy cluster logic to new CLI usage pattern?
    logger.info("Done creating rook resources, waiting for HEALTH_OK")
    try:
        ceph_health_check(namespace=self.namespace, tries=30, delay=10)
    except CephHealthException as ex:
        err = str(ex)
        logger.warning(f"Ceph health check failed with {err}")
        if "clock skew detected" in err:
            logger.info(
                f"Changing NTP on compute nodes to {constants.RH_NTP_CLOCK}"
            )
            update_ntp_compute_nodes()
            assert ceph_health_check(namespace=self.namespace, tries=60, delay=10)

    # patch gp2/thin storage class as 'non-default'
    self.patch_default_sc_to_non_default()

    if check_nodes_specs(
        min_cpu=constants.MIN_NODE_CPU, min_memory=constants.MIN_NODE_MEMORY
    ):
        logger.info(
            "The cluster specs meet the minimum requirements and "
            "therefore, NooBaa auto scale will be enabled"
        )
        min_nb_eps = config.DEPLOYMENT.get('min_noobaa_endpoints')
        max_nb_eps = config.DEPLOYMENT.get('max_noobaa_endpoints')
        change_noobaa_endpoints_count(
            min_nb_eps=min_nb_eps, max_nb_eps=max_nb_eps
        )
    else:
        logger.warning(
            "The cluster specs do not meet the minimum requirements"
            " and therefore, NooBaa auto scale will remain disabled"
        )
        change_noobaa_endpoints_count(min_nb_eps=1, max_nb_eps=1)
def test_check_pod_status_after_two_nodes_shutdown_recovery(
    self, nodes, node_restart_teardown
):
    """
    Test case to check that MDS, rbd plugin provisioner and cephfs plugin
    provisioner pods are not running on the same node after node shutdown
    and recovery
    """
    # Get MDS, rbd, cephfs plugin provisioner pods running nodes
    # before shutdown
    log.info("Check pod nodes before nodes shutdown")
    list_of_nodes_running_pods(selector="rook-ceph-mds")
    list_of_nodes_running_pods(selector="csi-rbdplugin-provisioner")
    list_of_nodes_running_pods(selector="csi-cephfsplugin-provisioner")

    # Get the node list
    node = get_nodes(node_type="worker", num_of_nodes=2)

    # Shutdown 2 worker nodes for 10 mins
    nodes.stop_nodes(nodes=node)
    waiting_time = 600
    log.info(f"Waiting for {waiting_time} seconds")
    time.sleep(waiting_time)
    nodes.start_nodes(nodes=node)

    # Validate all nodes are in READY state and up
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=30,
        delay=15,
    )(wait_for_nodes_status)(timeout=1800)

    # Check the nodes are in Ready state and the cluster health is ok
    self.sanity_helpers.health_check()
    wait_for_storage_pods()

    # Get MDS, rbd & cephfs plugin provisioner pods running
    # nodes post-recovery
    mds_running_nodes_after_recovery = list_of_nodes_running_pods(
        selector="rook-ceph-mds"
    )
    rbd_provisioner_running_nodes_after_recovery = list_of_nodes_running_pods(
        selector="csi-rbdplugin-provisioner"
    )
    cephfs_provisioner_running_nodes_after_recovery = list_of_nodes_running_pods(
        selector="csi-cephfsplugin-provisioner"
    )

    assert len(set(mds_running_nodes_after_recovery)) == len(
        mds_running_nodes_after_recovery
    ), "MDS running on same node, Not expected!!!"
    log.info("MDS pods not running on same node")
    assert len(set(rbd_provisioner_running_nodes_after_recovery)) == len(
        rbd_provisioner_running_nodes_after_recovery
    ), "rbd plugin provisioner pods running on same node, Not expected"
    log.info("RBD plugin provisioner pods not running on same node")
    assert len(set(cephfs_provisioner_running_nodes_after_recovery)) == len(
        cephfs_provisioner_running_nodes_after_recovery
    ), "cephfs plugin provisioner pods running on same node, Not expected"
    log.info("CEPHFS plugin provisioner pods not running on same node")
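# The three assertions above repeat one idiom: a list of node names contains no
# duplicates exactly when converting it to a set preserves its length. A small
# hypothetical helper (not part of ocs-ci) that would collapse the repetition:
def assert_pods_on_distinct_nodes(node_names, pod_kind):
    """Hypothetical helper: fail if any two pods of `pod_kind` share a node."""
    assert len(set(node_names)) == len(node_names), (
        f"{pod_kind} pods running on same node, Not expected"
    )
    log.info(f"{pod_kind} pods not running on same node")


# Usage mirroring the assertions above:
# assert_pods_on_distinct_nodes(mds_running_nodes_after_recovery, "MDS")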
def deploy_ocs(self):
    """
    Handle OCS deployment, since OCS deployment steps are common to any
    platform, implementing OCS deployment here in base class.
    """
    ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
    try:
        ceph_cluster.get().get('items')[0]
        logger.warning("OCS cluster already exists")
        return
    except (IndexError, CommandFailed):
        logger.info("Running OCS basic installation")
    self.deploy_ocs_via_operator()
    pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
    cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
    # Check for Ceph pods
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mon',
        resource_count=3, timeout=600
    )
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mgr', timeout=600
    )
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-osd',
        resource_count=3, timeout=600
    )

    # validate ceph mon/osd volumes are backed by pvc
    validate_cluster_on_pvc()

    # validate PDB creation of MON, MDS, OSD pods
    validate_pdb_creation()

    # Creating toolbox pod
    setup_ceph_toolbox()
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools',
        resource_count=1, timeout=600
    )

    # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1847098
    if config.DEPLOYMENT.get('local_storage'):
        tools_pod = run_cmd(
            f"oc -n {self.namespace} get pod -l 'app=rook-ceph-tools' "
            f"-o jsonpath='{{.items[0].metadata.name}}'"
        )
        pgs_to_autoscale = [
            'ocs-storagecluster-cephblockpool',
            'ocs-storagecluster-cephfilesystem-data0'
        ]
        for pg in pgs_to_autoscale:
            run_cmd(
                f"oc -n {self.namespace} exec {tools_pod} -- "
                f"ceph osd pool set {pg} pg_autoscale_mode on"
            )

    # Check for CephFilesystem creation in ocp
    cfs_data = cfs.get()
    cfs_name = cfs_data['items'][0]['metadata']['name']
    if helpers.validate_cephfilesystem(cfs_name):
        logger.info("MDS deployment is successful!")
        defaults.CEPHFILESYSTEM_NAME = cfs_name
    else:
        logger.error("MDS deployment Failed! Please check logs!")

    # Change monitoring backend to OCS
    if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
        'persistent-monitoring'
    ):
        sc = helpers.default_storage_class(
            interface_type=constants.CEPHBLOCKPOOL
        )
        # Get the list of monitoring pods
        pods_list = get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus', 'alertmanager']
        )
        # Create configmap cluster-monitoring-config and reconfigure
        # storage class and telemeter server (if the url is specified in a
        # config file)
        create_configmap_cluster_monitoring_pod(
            sc_name=sc.name,
            telemeter_server_url=config.ENV_DATA.get("telemeter_server_url")
        )
        # Take some time to respin the pod
        waiting_time = 45
        logger.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
        # Validate the pods are respinned and in running state
        retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(
            validate_pods_are_respinned_and_running_state
        )(pods_list)
        # Validate the pvc is created on monitoring pods
        validate_pvc_created_and_bound_on_monitoring_pods()
        # Validate the pvc are mounted on pods
        retry((CommandFailed, AssertionError), tries=3, delay=15)(
            validate_pvc_are_mounted_on_monitoring_pods
        )(pods_list)
    elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
        "telemeter_server_url"
    ):
        # Create configmap cluster-monitoring-config to reconfigure
        # telemeter server url when 'persistent-monitoring' is False
        create_configmap_cluster_monitoring_pod(
            telemeter_server_url=config.ENV_DATA["telemeter_server_url"]
        )

    # Change registry backend to OCS CEPHFS RWX PVC
    registry.change_registry_backend_to_ocs()

    # Verify health of ceph cluster
    # TODO: move destroy cluster logic to new CLI usage pattern?
    logger.info("Done creating rook resources, waiting for HEALTH_OK")
    assert ceph_health_check(namespace=self.namespace)

    # patch gp2/thin storage class as 'non-default'
    self.patch_default_sc_to_non_default()
def deploy_ocs(self):
    """
    Handle OCS deployment, since OCS deployment steps are common to any
    platform, implementing OCS deployment here in base class.
    """
    ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace)
    try:
        ceph_cluster.get().get("items")[0]
        logger.warning("OCS cluster already exists")
        return
    except (IndexError, CommandFailed):
        logger.info("Running OCS basic installation")

    if config.DEPLOYMENT["external_mode"]:
        logger.info("Deploying OCS on external mode RHCS")
        return self.deploy_with_external_mode()

    self.deploy_ocs_via_operator()
    pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
    cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
    # Check for Ceph pods
    assert pod.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-mon",
        resource_count=3,
        timeout=600,
    )
    assert pod.wait_for_resource(
        condition="Running", selector="app=rook-ceph-mgr", timeout=600
    )
    assert pod.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-osd",
        resource_count=3,
        timeout=600,
    )

    # validate ceph mon/osd volumes are backed by pvc
    validate_cluster_on_pvc()

    # validate PDB creation of MON, MDS, OSD pods
    validate_pdb_creation()

    # Creating toolbox pod
    setup_ceph_toolbox()
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-tools",
        resource_count=1,
        timeout=600,
    )

    # Check for CephFilesystem creation in ocp
    cfs_data = cfs.get()
    cfs_name = cfs_data["items"][0]["metadata"]["name"]
    if helpers.validate_cephfilesystem(cfs_name):
        logger.info("MDS deployment is successful!")
        defaults.CEPHFILESYSTEM_NAME = cfs_name
    else:
        logger.error("MDS deployment Failed! Please check logs!")

    # Change monitoring backend to OCS
    if config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get(
        "persistent-monitoring"
    ):
        sc = helpers.default_storage_class(interface_type=constants.CEPHBLOCKPOOL)
        # Get the list of monitoring pods
        pods_list = get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus", "alertmanager"],
        )
        # Create configmap cluster-monitoring-config and reconfigure
        # storage class and telemeter server (if the url is specified in a
        # config file)
        create_configmap_cluster_monitoring_pod(
            sc_name=sc.name,
            telemeter_server_url=config.ENV_DATA.get("telemeter_server_url"),
        )
        # Take some time to respin the pod
        waiting_time = 45
        logger.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
        # Validate the pods are respinned and in running state
        retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(
            validate_pods_are_respinned_and_running_state
        )(pods_list)
        # Validate the pvc is created on monitoring pods
        validate_pvc_created_and_bound_on_monitoring_pods()
        # Validate the pvc are mounted on pods
        retry((CommandFailed, AssertionError), tries=3, delay=15)(
            validate_pvc_are_mounted_on_monitoring_pods
        )(pods_list)
    elif config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get(
        "telemeter_server_url"
    ):
        # Create configmap cluster-monitoring-config to reconfigure
        # telemeter server url when 'persistent-monitoring' is False
        create_configmap_cluster_monitoring_pod(
            telemeter_server_url=config.ENV_DATA["telemeter_server_url"]
        )

    # Change registry backend to OCS CEPHFS RWX PVC
    registry.change_registry_backend_to_ocs()

    # Verify health of ceph cluster
    # TODO: move destroy cluster logic to new CLI usage pattern?
    logger.info("Done creating rook resources, waiting for HEALTH_OK")
    try:
        ceph_health_check(namespace=self.namespace, tries=30, delay=10)
    except CephHealthException as ex:
        err = str(ex)
        logger.warning(f"Ceph health check failed with {err}")
        if "clock skew detected" in err:
            logger.info(f"Changing NTP on compute nodes to {constants.RH_NTP_CLOCK}")
            if self.platform == constants.VSPHERE_PLATFORM:
                update_ntp_compute_nodes()
            assert ceph_health_check(namespace=self.namespace, tries=60, delay=10)

    # patch gp2/thin storage class as 'non-default'
    self.patch_default_sc_to_non_default()
def test_replication_with_disruptions(
    self,
    awscli_pod_session,
    mcg_obj_session,
    cld_mgr,
    bucket_factory,
    source_bucketclass,
    target_bucketclass,
    test_directory_setup,
    nodes,
):
    # check uni-directional bucket replication from multi (aws+azure)
    # namespace bucket to s3-compatible namespace bucket
    target_bucket_name = bucket_factory(bucketclass=target_bucketclass)[0].name
    replication_policy = ("basic-replication-rule", target_bucket_name, None)
    source_bucket_name = bucket_factory(
        bucketclass=source_bucketclass, replication_policy=replication_policy
    )[0].name
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        source_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=5,
        pattern="first-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")

    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Uni-directional bucket replication working as expected")

    # change from uni-directional to bi-directional replication policy
    logger.info("Changing the replication policy from uni to bi-directional!")
    bi_replication_policy_dict = {
        "spec": {
            "additionalConfig": {
                "replicationPolicy": json.dumps(
                    [
                        {
                            "rule_id": "basic-replication-rule-2",
                            "destination_bucket": source_bucket_name,
                        }
                    ]
                )
            }
        }
    }
    OCP(
        namespace=config.ENV_DATA["cluster_namespace"],
        kind="obc",
        resource_name=target_bucket_name,
    ).patch(params=json.dumps(bi_replication_policy_dict), format_type="merge")
    logger.info(
        "Patch ran successfully! Changed the replication policy from uni to "
        "bi-directional"
    )

    # write objects to the second bucket and see if they are replicated to
    # the other one
    logger.info("Checking if bi-directional replication works!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=3,
        pattern="second-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")
    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Bi-directional bucket replication working as expected")

    # delete all objects from the s3-compatible namespace bucket and then
    # recover them from the other namespace bucket on the next write
    logger.info("Checking replication when one of the bucket's objects are deleted!!")
    try:
        mcg_obj_session.s3_resource.Bucket(
            target_bucket_name
        ).objects.all().delete()
    except CommandFailed as e:
        logger.error(f"[Error] while deleting objects: {e}")
    assert (
        len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name)) == 0
    ), f"[Error] Unexpectedly objects were not deleted from {target_bucket_name}"
    logger.info("All the objects in RGW namespace buckets are deleted!!!")

    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="third-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")

    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info(
        "All the objects retrieved back to s3-compatible bucket on new write!!"
    )

    # restart RGW pods and then see if object sync still works
    logger.info("Checking if the replication works when the RGW pods restart!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fourth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")

    pod_names = get_pod_name_by_pattern(
        "rgw", namespace=config.ENV_DATA["cluster_namespace"]
    )
    pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
    delete_pods(pod_objs=pod_objs)
    wait_for_pods_to_be_running(
        pod_names=pod_names, namespace=config.ENV_DATA["cluster_namespace"]
    )

    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Object sync works after the RGW pod restarted!!")

    # write some objects to any of the buckets, followed by immediate
    # cluster restart
    logger.info("Checking replication when there is a cluster reboot!!")
    written_random_objects = write_random_test_objects_to_bucket(
        awscli_pod_session,
        target_bucket_name,
        test_directory_setup.origin_dir,
        mcg_obj=mcg_obj_session,
        amount=1,
        pattern="fifth-write-",
    )
    logger.info(f"Written objects: {written_random_objects}")

    node_list = get_worker_nodes()
    node_objs = get_node_objs(node_list)
    nodes.restart_nodes(node_objs, timeout=500)
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    wait_for_pods_to_be_running(
        namespace=config.ENV_DATA["cluster_namespace"], timeout=800
    )
    logger.info("Nodes rebooted successfully!!")

    compare_bucket_object_list(
        mcg_obj_session, source_bucket_name, target_bucket_name
    )
    logger.info("Objects sync works even when the cluster is rebooted")
def test_multiregion_mirror(
    self,
    cld_mgr,
    mcg_obj,
    awscli_pod_session,
    multiregion_mirror_setup,
    test_directory_setup,
):
    """
    Test multi-region bucket creation using the S3 SDK
    """
    bucket, backingstores = multiregion_mirror_setup
    backingstore1 = backingstores[0]
    backingstore2 = backingstores[1]
    bucket_name = bucket.name
    aws_client = cld_mgr.aws_client

    local_testobjs_dir_path = AWSCLI_TEST_OBJ_DIR
    downloaded_objs = awscli_pod_session.exec_cmd_on_pod(
        f"ls -A1 {local_testobjs_dir_path}"
    ).split(" ")

    logger.info("Uploading all pod objects to MCG bucket")
    local_temp_path = test_directory_setup.result_dir
    mcg_bucket_path = f"s3://{bucket_name}"

    # Upload test objects to the NooBucket
    retry(CommandFailed, tries=3, delay=10)(sync_object_directory)(
        awscli_pod_session, local_testobjs_dir_path, mcg_bucket_path, mcg_obj
    )

    mcg_obj.check_if_mirroring_is_done(bucket_name)

    # Bring bucket A down
    aws_client.toggle_aws_bucket_readwrite(backingstore1.uls_name)
    mcg_obj.check_backingstore_state(
        "backing-store-" + backingstore1.name, BS_AUTH_FAILED
    )

    # Verify integrity of B
    # Retrieve all objects from MCG bucket to result dir in Pod
    retry(CommandFailed, tries=3, delay=10)(sync_object_directory)(
        awscli_pod_session, mcg_bucket_path, local_temp_path, mcg_obj
    )

    # Checksum is compared between original and result object
    for obj in downloaded_objs:
        assert verify_s3_object_integrity(
            original_object_path=f"{local_testobjs_dir_path}/{obj}",
            result_object_path=f"{local_temp_path}/{obj}",
            awscli_pod=awscli_pod_session,
        ), "Checksum comparison between original and result object failed"

    # Clean up the temp dir
    awscli_pod_session.exec_cmd_on_pod(
        command=f'sh -c "rm -rf {local_temp_path}/*"'
    )

    # Bring B down, bring A up
    logger.info("Blocking bucket B")
    aws_client.toggle_aws_bucket_readwrite(backingstore2.uls_name)
    logger.info("Freeing bucket A")
    aws_client.toggle_aws_bucket_readwrite(backingstore1.uls_name, block=False)
    mcg_obj.check_backingstore_state(
        "backing-store-" + backingstore1.name, BS_OPTIMAL
    )
    mcg_obj.check_backingstore_state(
        "backing-store-" + backingstore2.name, BS_AUTH_FAILED
    )

    # Verify integrity of A
    # Retrieve all objects from MCG bucket to result dir in Pod
    retry(CommandFailed, tries=3, delay=10)(sync_object_directory)(
        awscli_pod_session, mcg_bucket_path, local_temp_path, mcg_obj
    )

    # Checksum is compared between original and result object
    for obj in downloaded_objs:
        assert verify_s3_object_integrity(
            original_object_path=f"{local_testobjs_dir_path}/{obj}",
            result_object_path=f"{local_temp_path}/{obj}",
            awscli_pod=awscli_pod_session,
        ), "Checksum comparison between original and result object failed"

    # Bring B up
    aws_client.toggle_aws_bucket_readwrite(backingstore2.uls_name, block=False)
    mcg_obj.check_backingstore_state(
        "backing-store-" + backingstore2.name, BS_OPTIMAL
    )
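# `verify_s3_object_integrity` compares checksums of the original object and the
# object retrieved back from the bucket. A minimal local sketch of the same check
# (hypothetical; the real helper takes an awscli_pod and presumably computes the
# checksums inside that pod):
import hashlib


def files_have_same_checksum(original_path, result_path):
    """Hypothetical stand-in: compare MD5 digests of two files, read in chunks."""
    digests = []
    for path in (original_path, result_path):
        md5 = hashlib.md5()
        with open(path, "rb") as f:
            # Read in 1 MiB chunks to bound memory use on large objects
            for chunk in iter(lambda: f.read(1 << 20), b""):
                md5.update(chunk)
        digests.append(md5.hexdigest())
    return digests[0] == digests[1]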
def test_registry_rolling_reboot_node(self, node_type, nodes):
    """
    Test registry workload when backed by OCS and reboot nodes one by one
    """
    # Get the node list
    node_list = get_nodes(node_type)

    # Pull and push images to registries
    log.info("Pull and push images to registries")
    image_pull_and_push(
        project_name=self.project_name,
        template="eap-cd-basic-s2i",
        image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
        pattern="eap-app",
    )

    # Validate image exists in registries path
    validate_image_exists(namespace=self.project_name)

    for node in node_list:
        # Reboot node
        log.info(node.name)
        nodes.restart_nodes([node], wait=False)

        # Wait some time after rebooting node
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate all nodes and services are in READY state and up
        retry(
            (
                CommandFailed,
                TimeoutError,
                AssertionError,
                ResourceWrongStatusException,
            ),
            tries=60,
            delay=15,
        )(wait_for_cluster_connectivity)(tries=400)
        retry(
            (
                CommandFailed,
                TimeoutError,
                AssertionError,
                ResourceWrongStatusException,
            ),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=900)

    # Validate cluster health ok and all pods are running
    self.sanity_helpers.health_check(tries=40)

    # Validate storage pods are running
    wait_for_storage_pods()

    # Validate image registry pods
    validate_registry_pod_status()

    # Validate image exists in registries path
    validate_image_exists(namespace=self.project_name)