def validate_cluster_import(cluster_name):
    """
    Validate ACM status of managed cluster

    Args:
        cluster_name (str): cluster name to validate

    Assert:
        All conditions of the selected managed cluster should be "True",
        fail otherwise

    Returns:
        bool: True, if no AssertionError was raised

    """
    config.switch_ctx(0)
    oc_obj = OCP(kind=ACM_MANAGED_CLUSTERS)
    conditions = oc_obj.exec_oc_cmd(
        f"get managedclusters {cluster_name} -ojsonpath='{{.status.conditions}}'"
    )
    log.debug(conditions)

    for dict_status in conditions:
        log.info(f"Message: {dict_status.get('message')}")
        log.info(f"Status: {dict_status.get('status')}")
        assert (
            dict_status.get("status") == "True"
        ), f"Status is not True, but: {dict_status.get('status')}"

    # Return True if no AssertionError was raised
    return True
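# Hedged usage sketch (not part of the original module): validate the ACM
# import of every non-ACM managed cluster. Assumes the ocs-ci helpers used
# elsewhere in this file (config, get_non_acm_cluster_config) are in scope;
# cluster names are read from each cluster's ENV_DATA.
def validate_all_cluster_imports():
    for cluster in get_non_acm_cluster_config():
        assert validate_cluster_import(cluster.ENV_DATA["cluster_name"])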
def relocate(preferred_cluster, drpc_name, namespace):
    """
    Initiates Relocate action to the specified cluster

    Args:
        preferred_cluster (str): Cluster name to which the workload should be relocated
        drpc_name (str): Name of the DRPC resource to apply the patch
        namespace (str): Name of the namespace to use

    """
    prev_index = config.cur_index
    config.switch_acm_ctx()
    relocate_params = (
        f'{{"spec":{{"action":"Relocate","preferredCluster":"{preferred_cluster}"}}}}'
    )
    drpc_obj = ocp.OCP(
        kind=constants.DRPC, namespace=namespace, resource_name=drpc_name
    )
    drpc_obj._has_phase = True
    logger.info(f"Initiating relocate action to {preferred_cluster}")
    assert drpc_obj.patch(
        params=relocate_params, format_type="merge"
    ), f"Failed to patch {constants.DRPC}: {drpc_name}"

    logger.info(
        f"Wait for {constants.DRPC}: {drpc_name} to reach {constants.STATUS_RELOCATED} phase"
    )
    drpc_obj.wait_for_phase(constants.STATUS_RELOCATED)

    config.switch_ctx(prev_index)
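# For reference (hypothetical resource names, an assumption for illustration):
# relocate("cluster2", "busybox-drpc", "busybox-workloads") issues the
# equivalent of
#
#   oc patch drpc busybox-drpc -n busybox-workloads --type=merge \
#     -p '{"spec":{"action":"Relocate","preferredCluster":"cluster2"}}'
#
# failover() below follows the same merge-patch pattern with
# {"action": "Failover", "failoverCluster": ...}.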
def finalizer():
    # Switching to provider cluster context will be done during the test case
    # in certain cases. Switch back to consumer cluster context after the
    # test case.
    if self.provider_index:
        config.switch_ctx(self.initial_cluster_index)
    assert ceph_health_check(), "Ceph cluster health is not OK"
    log.info("Ceph cluster health is OK")
def pytest_configure(config):
    """
    Load config files, and initialize ocs-ci library.

    Args:
        config (pytest.config): Pytest config object

    """
    set_log_level(config)
    # Somewhat hacky but this lets us differentiate between run-ci executions
    # and plain pytest unit test executions
    ocscilib_module = "ocs_ci.framework.pytest_customization.ocscilib"
    if ocscilib_module not in config.getoption("-p"):
        return
    for i in range(ocsci_config.nclusters):
        log.debug(f"Pytest configure switching to: cluster={i}")
        ocsci_config.switch_ctx(i)

        if not (config.getoption("--help") or config.getoption("collectonly")):
            process_cluster_cli_params(config)
            config_file = os.path.expanduser(
                os.path.join(
                    ocsci_config.RUN["log_dir"],
                    f"run-{ocsci_config.RUN['run_id']}-cl{i}-config.yaml",
                )
            )
            dump_config_to_file(config_file)
            log.info(
                f"Dump of the consolidated config file is located here: {config_file}"
            )

            # Add OCS related versions to the html report and remove
            # extraneous metadata
            markers_arg = config.getoption("-m")

            # Add logs url
            logs_url = ocsci_config.RUN.get("logs_url")
            if logs_url:
                config._metadata["Logs URL"] = logs_url

            if ocsci_config.RUN["cli_params"].get("teardown") or (
                "deployment" in markers_arg
                and ocsci_config.RUN["cli_params"].get("deploy")
            ):
                log.info(
                    "Skipping version collection because deploy or destroy of "
                    "cluster is performed."
                )
                return
            elif ocsci_config.ENV_DATA["skip_ocs_deployment"]:
                log.info(
                    "Skipping version collection because we skipped "
                    "the OCS deployment"
                )
                return
            elif ocsci_config.RUN["cli_params"].get("dev_mode"):
                log.info("Running in development mode")
                return

            log.info("Collecting cluster versions")
            # Remove extraneous metadata
            for extra_meta in ["Python", "Packages", "Plugins", "Platform"]:
                if config._metadata.get(extra_meta):
                    del config._metadata[extra_meta]

            config._metadata["Test Run Name"] = get_testrun_name()
            gather_version_info_for_report(config)
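# For reference (hypothetical run id, an assumption for illustration): with
# run_id=1692000000 and two clusters, the per-cluster dumps written above are
#   run-1692000000-cl0-config.yaml and run-1692000000-cl1-config.yaml
# under ocsci_config.RUN["log_dir"].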
def failover(failover_cluster, drpc_name, namespace):
    """
    Initiates Failover action to the specified cluster

    Args:
        failover_cluster (str): Cluster name to which the workload should be failed over
        drpc_name (str): Name of the DRPC resource to apply the patch
        namespace (str): Name of the namespace to use

    """
    prev_index = config.cur_index
    config.switch_acm_ctx()
    failover_params = (
        f'{{"spec":{{"action":"Failover","failoverCluster":"{failover_cluster}"}}}}'
    )
    drpc_obj = ocp.OCP(
        kind=constants.DRPC, namespace=namespace, resource_name=drpc_name
    )
    drpc_obj._has_phase = True
    logger.info(f"Initiating failover action to {failover_cluster}")
    assert drpc_obj.patch(
        params=failover_params, format_type="merge"
    ), f"Failed to patch {constants.DRPC}: {drpc_name}"

    logger.info(
        f"Wait for {constants.DRPC}: {drpc_name} to reach {constants.STATUS_FAILEDOVER} phase"
    )
    drpc_obj.wait_for_phase(constants.STATUS_FAILEDOVER)

    config.switch_ctx(prev_index)
def check_scale_pods_and_pvcs_created_on_consumers(self):
    for consumer_i, fio_scale in self.consumer_i_per_fio_scale.items():
        config.switch_ctx(consumer_i)
        c_name = config.ENV_DATA.get("cluster_name")

        ocp_pvc = OCP(kind=constants.PVC, namespace=fio_scale.namespace)
        ocp_pvc.wait_for_resource(
            timeout=30,
            condition=constants.STATUS_BOUND,
            resource_count=self.scale_count,
        )
        log.info(f"All the PVCs were created successfully on the consumer {c_name}")

        ocp_pod = OCP(kind=constants.POD, namespace=fio_scale.namespace)
        ocp_pod.wait_for_resource(
            timeout=30,
            condition=constants.STATUS_COMPLETED,
            resource_count=self.expected_pod_num,
        )
        log.info(f"All the pods were created successfully on the consumer {c_name}")

    log.info("All the pods and PVCs were created successfully on the consumers")
def wait_for_mirroring_status_ok(replaying_images=None, timeout=300):
    """
    Wait for mirroring status to reach health OK and expected number of
    replaying images for each of the ODF clusters

    Args:
        replaying_images (int): Expected number of images in replaying state
        timeout (int): Time in seconds to wait for mirroring status to reach OK

    Returns:
        bool: True if status contains expected health and states values

    Raises:
        AssertionError: In case of unexpected mirroring status

    """
    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        logger.info(
            f"Validating mirroring status on cluster {cluster.ENV_DATA['cluster_name']}"
        )
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_mirroring_status_ok,
            replaying_images=replaying_images,
        )
        assert sample.wait_for_func_status(result=True), (
            "The mirroring status does not have expected values within the time"
            f" limit on cluster {cluster.ENV_DATA['cluster_name']}"
        )

    # Return True as promised by the docstring, since no assertion failed
    return True
def test_create_scale_pods_and_pvcs_with_ms_consumers(
    self, create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers
):
    """
    Test create scale pods and PVCs using a kube job with MS consumers
    """
    self.orig_index = config.cur_index

    self.consumer_i_per_fio_scale = (
        create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers(
            scale_count=self.scale_count,
            pvc_per_pod_count=self.pvc_per_pod_count,
        )
    )
    assert config.cur_index == self.orig_index, "The current index has changed"

    config.switch_to_provider()
    time_to_wait_for_io_running = 120
    log.info(
        f"Wait {time_to_wait_for_io_running} seconds to check "
        f"that the IO is running as expected"
    )
    sleep(time_to_wait_for_io_running)
    ceph_health_check()

    log.info("Checking the Ceph health on the consumers")
    consumer_indexes = config.get_consumer_indexes_list()
    for i in consumer_indexes:
        config.switch_ctx(i)
        ceph_health_check()

    self.check_scale_pods_and_pvcs_created_on_consumers()
    log.info(
        "The scale pods and PVCs using a kube job with MS consumers "
        "were created successfully"
    )
def test_automated_recovery_from_failed_nodes_reactive_ms(
    self,
    nodes,
    failure,
):
    """
    We have 3 test cases to check when running IO in the background:
        A) Automated recovery from a stopped worker node
        B) Automated recovery from termination of a worker node
        C) Automated recovery from unscheduling and rescheduling a worker node
    """
    self.create_resources()

    config.switch_to_provider()
    log.info("Start executing the node test function on the provider...")
    FAILURE_TYPE_FUNC_CALL_DICT[failure](nodes)

    # Verification steps after the automated recovery
    assert check_pods_after_node_replacement(), "Not all the pods are running"
    assert (
        verify_worker_nodes_security_groups()
    ), "Not all the worker node security groups are set correctly"

    log.info("Checking that the Ceph health is OK on the provider")
    ceph_health_check()

    log.info("Checking that the Ceph health is OK on the consumers")
    consumer_indexes = config.get_consumer_indexes_list()
    for i in consumer_indexes:
        config.switch_ctx(i)
        ceph_health_check()
def finalizer():
    ocp_nodes = get_node_objs()
    for n in ocp_nodes:
        recover_node_to_ready_state(n)

    logger.info("Switch to the original cluster index")
    config.switch_ctx(self.orig_index)
    ceph_health_check()
def teardown():
    # ocs-operator pod deletion on a consumer cluster will trigger a
    # rook-ceph-tools pod respin. Patching of the rook-ceph-tools pod is done
    # in the test case after the ocs-operator pod respin. But if the automatic
    # respin of the rook-ceph-tools pod is delayed by a few seconds, the
    # patching step in the test case will not run. So do the patch at the end
    # of the test to ensure that the rook-ceph-tools pod on consumers can run
    # ceph commands.
    for consumer_index in self.consumer_indexes:
        config.switch_ctx(consumer_index)
        patch_consumer_toolbox()

    # Switching cluster context will be done during the test case.
    # Switch back to the current cluster context after the test case.
    config.switch_ctx(initial_cluster_index)
def test_deployment(pvc_factory, pod_factory):
    deploy = config.RUN["cli_params"].get("deploy")
    teardown = config.RUN["cli_params"].get("teardown")
    if not teardown or deploy:
        log.info("Verifying OCP cluster is running")
        assert is_cluster_running(config.ENV_DATA["cluster_path"])
        if not config.ENV_DATA["skip_ocs_deployment"]:
            if config.multicluster:
                restore_ctx_index = config.cur_index
                for cluster in get_non_acm_cluster_config():
                    config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
                    log.info(
                        f"Sanity check for cluster: {cluster.ENV_DATA['cluster_name']}"
                    )
                    sanity_helpers = Sanity()
                    sanity_helpers.health_check()
                    sanity_helpers.delete_resources()
                config.switch_ctx(restore_ctx_index)
            else:
                ocs_registry_image = config.DEPLOYMENT.get("ocs_registry_image")
                if config.ENV_DATA["mcg_only_deployment"]:
                    mcg_only_install_verification(
                        ocs_registry_image=ocs_registry_image
                    )
                    return
                else:
                    ocs_install_verification(ocs_registry_image=ocs_registry_image)

                # Check basic cluster functionality by creating resources
                # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
                # run IO and delete the resources
                if config.DEPLOYMENT["external_mode"]:
                    sanity_helpers = SanityExternalCluster()
                else:
                    sanity_helpers = Sanity()
                if (
                    config.ENV_DATA["platform"].lower()
                    in constants.MANAGED_SERVICE_PLATFORMS
                ):
                    try:
                        sanity_helpers.health_check()
                    except exceptions.ResourceWrongStatusException as err_msg:
                        log.warning(err_msg)
                else:
                    sanity_helpers.health_check()
                sanity_helpers.delete_resources()
                # Verify ceph health
                log.info("Verifying ceph health after deployment")
                assert ceph_health_check(tries=10, delay=30)

    if teardown:
        log.info("Cluster will be destroyed during teardown part of this test.")
def finalizer():
    config.switch_to_provider()
    log.info(
        "Verify that all the worker nodes are in a Ready state on the provider"
    )
    wnodes = get_nodes(node_type=constants.WORKER_MACHINE)
    for wnode in wnodes:
        is_recovered = recover_node_to_ready_state(wnode)
        if not is_recovered:
            log.warning(f"The node {wnode.name} has failed to recover")

    log.info("Verify again that the ceph health is OK")
    ceph_health_check()

    config.switch_ctx(self.orig_index)
def get_scheduling_interval(namespace):
    """
    Get scheduling interval for the workload in the given namespace

    Args:
        namespace (str): Name of the namespace

    Returns:
        int: scheduling interval value from DRPolicy

    """
    restore_index = config.cur_index
    drpolicy_obj = DRPC(namespace=namespace).drpolicy_obj
    # Drop the trailing unit suffix (e.g. the "m" in "5m") before converting
    interval_value = int(drpolicy_obj.get()["spec"]["schedulingInterval"][:-1])
    config.switch_ctx(restore_index)
    return interval_value
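# Minimal illustration of the slice above, with a hypothetical
# schedulingInterval of "5m": the unit character is dropped and only the
# integer part is kept.
assert int("5m"[:-1]) == 5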
def check_pods_and_pvcs_deleted_on_consumers(self):
    for consumer_i, fio_scale in self.consumer_i_per_fio_scale.items():
        config.switch_ctx(consumer_i)
        c_name = config.ENV_DATA.get("cluster_name")

        pvc_objs = get_all_pvcs(fio_scale.namespace)["items"]
        assert not pvc_objs, "There are still remaining PVCs"
        log.info(f"All the PVCs deleted successfully on the consumer {c_name}")

        pod_objs = get_all_pods(fio_scale.namespace)
        assert not pod_objs, "There are still remaining pods"
        log.info(f"All the pods deleted successfully on the consumer {c_name}")

    log.info("All the pods and PVCs were deleted successfully on the consumers")
def post_deploy_ops(self):
    """
    1. Install ingress certificates on OCP clusters deployed through ACM
    2. Run post_ocp_deploy on OCP clusters

    """
    prev = config.cur_index
    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        ssl_key = config.DEPLOYMENT.get("ingress_ssl_key")
        ssl_cert = config.DEPLOYMENT.get("ingress_ssl_cert")
        for key in [ssl_key, ssl_cert]:
            if os.path.exists(key):
                os.unlink(key)
        logger.info("Running post ocp deploy ops")
        self.post_ocp_deploy()
    config.switch_ctx(prev)
def post_destroy_ops(self):
    """
    Post destroy ops includes
    1. Deleting DNS entries
    2. Freeing the IPs assigned

    """
    prev_ctx = config.cur_index
    config.switch_ctx(self.cluster_conf.MULTICLUSTER["multicluster_index"])
    vmware.delete_dns_records()
    ipam = IPAM(appiapp="address")
    hosts = [
        f"{config.ENV_DATA.get('cluster_name')}-{i}" for i in range(self.nvips)
    ]
    ipam.release_ips(hosts)
    config.switch_ctx(prev_ctx)
def setup(self, request, create_pvcs_and_pods):
    """
    Prepare pods for the test and add finalizer.

    """
    self.provider_cluster_index = config.get_provider_index()
    self.consumer_indexes = config.get_consumer_indexes_list()
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # Get the index of the current cluster
        initial_cluster_index = config.cur_index

        def teardown():
            # ocs-operator pod deletion on a consumer cluster will trigger a
            # rook-ceph-tools pod respin. Patching of the rook-ceph-tools pod
            # is done in the test case after the ocs-operator pod respin. But
            # if the automatic respin of the rook-ceph-tools pod is delayed by
            # a few seconds, the patching step in the test case will not run.
            # So do the patch at the end of the test to ensure that the
            # rook-ceph-tools pod on consumers can run ceph commands.
            for consumer_index in self.consumer_indexes:
                config.switch_ctx(consumer_index)
                patch_consumer_toolbox()
            # Switching cluster context will be done during the test case.
            # Switch back to the initial cluster context after the test case.
            config.switch_ctx(initial_cluster_index)

        request.addfinalizer(teardown)

    self.io_pods = list()
    for cluster_index in self.consumer_indexes:
        config.switch_ctx(cluster_index)
        consumer_cluster_kubeconfig = os.path.join(
            config.clusters[cluster_index].ENV_DATA["cluster_path"],
            config.clusters[cluster_index].RUN.get("kubeconfig_location"),
        )
        pvcs, io_pods = create_pvcs_and_pods(
            pvc_size=self.pvc_size,
            replica_count=1,
            pod_dict_path=constants.PERF_POD_YAML,
        )
        for pvc_obj in pvcs:
            pvc_obj.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
        for io_pod in io_pods:
            io_pod.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
        pvcs[0].project.cluster_kubeconfig = consumer_cluster_kubeconfig
        self.io_pods.extend(io_pods)
def get_clusters_env():
    """
    Store each cluster's kubeconfig location and cluster name, in case of a
    multicluster setup. Switches back to cluster index zero as the default
    context before returning.

    Returns:
        dict: cluster names and cluster kubeconfig locations

    """
    clusters_env = {}
    for index in range(config.nclusters):
        config.switch_ctx(index=index)
        clusters_env[f"kubeconfig_location_c{index}"] = os.path.join(
            config.ENV_DATA["cluster_path"], config.RUN["kubeconfig_location"]
        )
        clusters_env[f"cluster_name_{index}"] = config.ENV_DATA["cluster_name"]

    config.switch_ctx(index=0)
    return clusters_env
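# Hedged usage sketch (hypothetical two-cluster layout): the returned dict is
# keyed by cluster index, using both key patterns built above.
#
#   env = get_clusters_env()
#   kubeconfig_c0 = env["kubeconfig_location_c0"]
#   cluster_name_c1 = env["cluster_name_1"]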
def get_current_secondary_cluster_name(namespace):
    """
    Get current secondary cluster name based on workload namespace

    Args:
        namespace (str): Name of the namespace

    Returns:
        str: Current secondary cluster name

    """
    restore_index = config.cur_index
    primary_cluster_name = get_current_primary_cluster_name(namespace)
    drpolicy_data = DRPC(namespace=namespace).drpolicy_obj.get()
    config.switch_ctx(restore_index)
    for cluster_name in drpolicy_data["spec"]["drClusters"]:
        if not cluster_name == primary_cluster_name:
            return cluster_name
def get_current_primary_cluster_name(namespace):
    """
    Get current primary cluster name based on workload namespace

    Args:
        namespace (str): Name of the namespace

    Returns:
        str: Current primary cluster name

    """
    restore_index = config.cur_index
    drpc_data = DRPC(namespace=namespace).get()
    if drpc_data.get("spec").get("action") == constants.ACTION_FAILOVER:
        cluster_name = drpc_data["spec"]["failoverCluster"]
    else:
        cluster_name = drpc_data["spec"]["preferredCluster"]
    config.switch_ctx(restore_index)
    return cluster_name
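# Hedged usage sketch (hypothetical workload namespace): together, the two
# helpers above identify the DR pair for a protected workload.
#
#   primary = get_current_primary_cluster_name("busybox-workloads")
#   secondary = get_current_secondary_cluster_name("busybox-workloads")
#   assert primary != secondary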
def submariner_configure_upstream(self):
    """
    Deploy and configure upstream submariner

    Raises:
        DRPrimaryNotFoundException: If there is no designated primary cluster found

    """
    if self.designated_broker_cluster_index < 0:
        raise DRPrimaryNotFoundException("Designated primary cluster not found")

    # Deploy broker on designated cluster
    # follow this config switch statement carefully to be mindful
    # about the context with which we are performing the operations
    config.switch_ctx(self.designated_broker_cluster_index)
    logger.info(f"Switched context: {config.cluster_ctx.ENV_DATA['cluster_name']}")

    deploy_broker_cmd = "deploy-broker"
    try:
        run_subctl_cmd(deploy_broker_cmd)
    except CommandFailed:
        logger.exception("Failed to deploy submariner broker")
        raise

    # Label the gateway nodes on all non-ACM clusters
    restore_index = config.cur_index
    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        gateway_node = self.get_default_gateway_node()
        label_nodes([gateway_node], constants.SUBMARINER_GATEWAY_NODE_LABEL)
    config.switch_ctx(restore_index)

    # Join all the clusters (except the ACM cluster in case of hub deployment)
    for cluster in config.clusters:
        cluster_index = cluster.MULTICLUSTER["multicluster_index"]
        if cluster_index != config.get_acm_index():
            join_cmd = (
                f"join --kubeconfig {cluster.RUN['kubeconfig']} "
                f"{config.ENV_DATA['submariner_info_file']} "
                f"--clusterid c{self.cluster_seq} --natt=false"
            )
            try:
                run_subctl_cmd(join_cmd)
                logger.info(
                    f"Subctl join succeeded for {cluster.ENV_DATA['cluster_name']}"
                )
            except CommandFailed:
                logger.exception("Cluster failed to join")
                raise

            self.cluster_seq = self.cluster_seq + 1
            self.dr_only_list.append(cluster_index)

    # Verify submariner connectivity between clusters (excluding ACM)
    kubeconf_list = []
    for i in self.dr_only_list:
        kubeconf_list.append(config.clusters[i].RUN["kubeconfig"])
    connct_check = f"verify {' '.join(kubeconf_list)} --only connectivity"
    run_subctl_cmd(connct_check)
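# For reference (hypothetical paths, an assumption for illustration): for the
# first non-ACM cluster the rendered join command looks like
#
#   join --kubeconfig /clusters/c1/auth/kubeconfig broker-info.subm \
#       --clusterid c1 --natt=false
#
# where broker-info.subm stands in for config.ENV_DATA["submariner_info_file"].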
def failover(failover_cluster, namespace):
    """
    Initiates Failover action to the specified cluster

    Args:
        failover_cluster (str): Cluster name to which the workload should be failed over
        namespace (str): Namespace where workload is running

    """
    restore_index = config.cur_index
    config.switch_acm_ctx()
    failover_params = f'{{"spec":{{"action":"{constants.ACTION_FAILOVER}","failoverCluster":"{failover_cluster}"}}}}'
    drpc_obj = DRPC(namespace=namespace)
    drpc_obj.wait_for_peer_ready_status()
    logger.info(f"Initiating Failover action with failoverCluster:{failover_cluster}")
    assert drpc_obj.patch(
        params=failover_params, format_type="merge"
    ), f"Failed to patch {constants.DRPC}: {drpc_obj.resource_name}"

    logger.info(
        f"Wait for {constants.DRPC}: {drpc_obj.resource_name} to reach {constants.STATUS_FAILEDOVER} phase"
    )
    drpc_obj.wait_for_phase(constants.STATUS_FAILEDOVER)

    config.switch_ctx(restore_index)
def get_admin_key_from_provider():
    """
    Get admin key from rook-ceph-tools pod on provider

    Returns:
        str: The admin key obtained from the rook-ceph-tools pod on the
            provider. Returns an empty string if the admin key is not obtained.

    """
    initial_cluster_index = config.cur_index
    config.switch_to_provider()
    admin_key = ""
    try:
        # Get the key from the provider cluster rook-ceph-tools pod
        provider_tools_pod = get_ceph_tools_pod()
        admin_key = (
            provider_tools_pod.exec_cmd_on_pod("grep key /etc/ceph/keyring")
            .strip()
            .split()[-1]
        )
    except Exception as exc:
        logger.error(
            f"Couldn't find admin key from provider due to the error:\n{str(exc)}"
        )
    finally:
        config.switch_ctx(initial_cluster_index)
    return admin_key
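# Hedged usage sketch: callers should treat an empty string as "key not
# available", since the helper logs and swallows exceptions instead of
# raising them.
#
#   admin_key = get_admin_key_from_provider()
#   if not admin_key:
#       raise RuntimeError("Could not fetch the admin key from the provider")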
def relocate(preferred_cluster, namespace):
    """
    Initiates Relocate action to the specified cluster

    Args:
        preferred_cluster (str): Cluster name to which the workload should be relocated
        namespace (str): Namespace where workload is running

    """
    restore_index = config.cur_index
    config.switch_acm_ctx()
    relocate_params = f'{{"spec":{{"action":"{constants.ACTION_RELOCATE}","preferredCluster":"{preferred_cluster}"}}}}'
    drpc_obj = DRPC(namespace=namespace)
    drpc_obj.wait_for_peer_ready_status()
    logger.info(f"Initiating Relocate action with preferredCluster:{preferred_cluster}")
    assert drpc_obj.patch(
        params=relocate_params, format_type="merge"
    ), f"Failed to patch {constants.DRPC}: {drpc_obj.resource_name}"

    logger.info(
        f"Wait for {constants.DRPC}: {drpc_obj.resource_name} to reach {constants.STATUS_RELOCATED} phase"
    )
    drpc_obj.wait_for_phase(constants.STATUS_RELOCATED)

    config.switch_ctx(restore_index)
def wait_for_mirroring_status_ok(replaying_images=None, timeout=300):
    """
    Wait for mirroring status to reach health OK and expected number of
    replaying images for each of the ODF clusters

    Args:
        replaying_images (int): Expected number of images in replaying state
        timeout (int): Time in seconds to wait for mirroring status to reach OK

    Returns:
        bool: True if status contains expected health and states values

    Raises:
        TimeoutExpiredError: In case of unexpected mirroring status

    """
    restore_index = config.cur_index
    if not replaying_images:
        replaying_images = 0
        for cluster in get_non_acm_cluster_config():
            config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
            replaying_images += len(
                get_all_pvcs_in_storageclass(constants.CEPHBLOCKPOOL_SC)
            )
        replaying_images -= 2  # Ignore db-noobaa-db-pg-0 PVCs

    for cluster in get_non_acm_cluster_config():
        config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
        logger.info(
            f"Validating mirroring status on cluster {cluster.ENV_DATA['cluster_name']}"
        )
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=5,
            func=check_mirroring_status_ok,
            replaying_images=replaying_images,
        )
        if not sample.wait_for_func_status(result=True):
            error_msg = (
                "The mirroring status does not have expected values within the time"
                f" limit on cluster {cluster.ENV_DATA['cluster_name']}"
            )
            logger.error(error_msg)
            raise TimeoutExpiredError(error_msg)

    config.switch_ctx(restore_index)
    return True
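# Hedged usage sketch: a plausible call site runs this after a failover or
# relocate to confirm RBD mirroring has converged on both managed clusters,
# letting the default derive the replaying-image count from the
# CephBlockPool PVCs as above.
#
#   wait_for_mirroring_status_ok(timeout=600)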
def test_pod_disruptions(self, create_pvcs_and_pods):
    """
    Test to perform pod disruption in consumer and provider cluster

    """
    # List of pods to be disrupted. Using different lists for consumer and
    # provider for easy implementation
    pods_on_consumer = [
        "alertmanager_managed_ocs_alertmanager",
        "ocs_osd_controller_manager",
        "prometheus_managed_ocs_prometheus",
        "prometheus_operator",
        "ocs_operator",
    ]
    pods_on_provider = [
        "alertmanager_managed_ocs_alertmanager",
        "ocs_osd_controller_manager",
        "prometheus_managed_ocs_prometheus",
        "prometheus_operator",
        "ocs_provider_server",
        "ocs_operator",
    ]
    disruption_on_consumer = []
    disruption_on_provider = []

    # Start I/O
    log.info("Starting fio on all pods")
    for pod_obj in self.io_pods:
        if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
            storage_type = "block"
            direct = 1
        else:
            storage_type = "fs"
            direct = 0
        pod_obj.run_io(
            storage_type=storage_type,
            size="10G",
            fio_filename=f"{pod_obj.name}",
            runtime=320,
            end_fsync=1,
            direct=direct,
            invalidate=0,
            fio_installed=True,
        )

    consumer_index_iter = cycle(self.consumer_indexes)

    # Create Disruptions instance for each pod to be disrupted on consumer
    for pod_type in pods_on_consumer:
        consumer_index = next(consumer_index_iter)
        config.switch_ctx(consumer_index)
        disruption_obj = disruption_helpers.Disruptions()
        # Select each pod to be disrupted from different consumers
        disruption_obj.set_resource(resource=pod_type, cluster_index=consumer_index)
        disruption_obj.index_of_consumer = consumer_index
        disruption_on_consumer.append(disruption_obj)

    # Create Disruptions instance for each pod to be disrupted on provider
    config.switch_to_provider()
    for pod_type in pods_on_provider:
        disruption_obj = disruption_helpers.Disruptions()
        disruption_obj.set_resource(
            resource=pod_type, cluster_index=self.provider_cluster_index
        )
        disruption_on_provider.append(disruption_obj)

    # Delete pods on consumer one at a time
    log.info("Starting pod disruptions on consumer clusters")
    for disruptions_obj in disruption_on_consumer:
        disruptions_obj.delete_resource()
        # ocs-operator respin will trigger rook-ceph-tools pod respin.
        # Patch rook-ceph-tools pod to run ceph commands.
        if disruptions_obj.resource == "ocs_operator":
            config.switch_ctx(disruptions_obj.index_of_consumer)
            patch_consumer_toolbox()

    # Delete pods on provider one at a time
    log.info("Starting pod disruptions on provider cluster")
    for disruptions_obj in disruption_on_provider:
        disruptions_obj.delete_resource()

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.io_pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Perform different checks on the clusters
    for cluster_index in [self.provider_cluster_index] + self.consumer_indexes:
        config.switch_ctx(cluster_index)

        # Verify managedocs components are Ready
        log.info("Verifying managedocs components state")
        managedocs_obj = OCP(
            kind="managedocs",
            resource_name="managedocs",
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        for component in {"alertmanager", "prometheus", "storageCluster"}:
            assert (
                managedocs_obj.get()["status"]["components"][component]["state"]
                == "Ready"
            ), (
                f"{component} status is "
                f"{managedocs_obj.get()['status']['components'][component]['state']}"
            )

        # Verify storagecluster status
        log.info("Verifying storagecluster status")
        verify_storage_cluster()

        # Verify CSV status
        for managed_csv in {
            constants.OCS_CSV_PREFIX,
            constants.OSD_DEPLOYER,
            constants.OSE_PROMETHEUS_OPERATOR,
        }:
            csvs = csv.get_csvs_start_with_prefix(
                managed_csv, constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            assert (
                len(csvs) == 1
            ), f"Unexpected number of CSVs with {managed_csv} prefix: {len(csvs)}"
            csv_name = csvs[0]["metadata"]["name"]
            csv_obj = csv.CSV(
                resource_name=csv_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            log.info(f"Check if {csv_name} is in Succeeded phase.")
            csv_obj.wait_for_phase(phase="Succeeded", timeout=600)

        # Verify the phase of ceph cluster
        log.info("Verify the phase of ceph cluster")
        cephcluster = OCP(
            kind="CephCluster", namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        cephcluster_yaml = cephcluster.get().get("items")[0]
        expected_phase = "Connected"
        if cluster_index == self.provider_cluster_index:
            expected_phase = "Ready"
        assert cephcluster_yaml["status"]["phase"] == expected_phase, (
            f"Status of cephcluster {cephcluster_yaml['metadata']['name']} is "
            f"{cephcluster_yaml['status']['phase']}"
        )

    # Create PVCs and pods on all consumer clusters
    log.info("Creating new PVCs and pods")
    pods = list()
    for cluster_index in self.consumer_indexes:
        config.switch_ctx(cluster_index)
        consumer_cluster_kubeconfig = os.path.join(
            config.clusters[cluster_index].ENV_DATA["cluster_path"],
            config.clusters[cluster_index].RUN.get("kubeconfig_location"),
        )
        pvcs, io_pods = create_pvcs_and_pods(
            pvc_size=self.pvc_size,
            replica_count=1,
            pod_dict_path=constants.PERF_POD_YAML,
        )
        for pvc_obj in pvcs:
            pvc_obj.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
        for io_pod in io_pods:
            io_pod.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
        pvcs[0].project.cluster_kubeconfig = consumer_cluster_kubeconfig
        pods.extend(io_pods)

    # Run I/O on new pods
    log.info("Running I/O on new pods")
    for pod_obj in pods:
        if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
            storage_type = "block"
            direct = 1
        else:
            storage_type = "fs"
            direct = 0
        pod_obj.run_io(
            storage_type=storage_type,
            size="10G",
            fio_filename=f"{pod_obj.name}",
            runtime=320,
            end_fsync=1,
            direct=direct,
            invalidate=0,
            fio_installed=True,
        )

    log.info("Wait for I/O to complete on new pods")
    for pod_obj in pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on the new pod {pod_obj.name}")
    log.info("IO is successful on new pods")
def test_resource_deletion_during_pvc_clone(self, pvc_clone_factory, pod_factory):
    """
    Verify PVC clone will succeed if rook-ceph and csi pods are re-spun
    while creating the clone

    """
    pods_to_delete = [
        "rbdplugin_provisioner",
        "cephfsplugin_provisioner",
        "cephfsplugin",
        "rbdplugin",
        "osd",
        "mgr",
    ]
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs) + len(pods_to_delete))
    disruption_ops = [disruption_helpers.Disruptions() for _ in pods_to_delete]
    file_name = "file_clone"

    # Run IO
    log.info("Running fio on all pods to create a file")
    for pod_obj in self.pods:
        storage_type = (
            "block"
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=30,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
        # Calculate md5sum
        file_name_pod = (
            file_name
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM
            else pod_obj.get_storage_path(storage_type="block")
        )
        pod_obj.pvc.md5sum = cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(f"md5sum obtained from pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Select the pods to be deleted
    for disruption, pod_type in zip(disruption_ops, pods_to_delete):
        cluster_index = None
        # 'provider_index' will not be None if the platform is Managed Services
        if self.provider_index is not None:
            if pod_type in ["osd", "mgr"]:
                cluster_index = self.provider_index
                config.switch_to_provider()
            else:
                cluster_index = self.consumer_index
                config.switch_ctx(cluster_index)
        disruption.set_resource(resource=pod_type, cluster_index=cluster_index)

    # Switch cluster context if the platform is MS.
    # 'provider_index' will not be None if the platform is MS.
    if self.provider_index is not None:
        config.switch_ctx(self.consumer_index)

    # Clone PVCs
    log.info("Start creating clone of PVCs")
    for pvc_obj in self.pvcs:
        log.info(f"Creating clone of PVC {pvc_obj.name}")
        pvc_obj.clone_proc = executor.submit(
            pvc_clone_factory,
            pvc_obj=pvc_obj,
            status="",
            access_mode=pvc_obj.get_pvc_access_mode,
            volume_mode=pvc_obj.volume_mode,
        )
    log.info("Started creating clones")

    # Delete the pods 'pods_to_delete'
    log.info(f"Deleting pods {pods_to_delete}")
    for disruption in disruption_ops:
        disruption.delete_proc = executor.submit(disruption.delete_resource)

    # Wait for delete and recovery
    [disruption.delete_proc.result() for disruption in disruption_ops]

    # Get cloned PVCs
    clone_pvc_objs = []
    for pvc_obj in self.pvcs:
        clone_obj = pvc_obj.clone_proc.result()
        clone_pvc_objs.append(clone_obj)
        log.info(f"Created clone {clone_obj.name} of PVC {pvc_obj.name}")
    log.info("Created clone of all PVCs")

    # Confirm that the cloned PVCs are Bound
    log.info("Verifying the cloned PVCs are Bound")
    for pvc_obj in clone_pvc_objs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=300
        )
        pvc_obj.reload()
        pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
    log.info("Verified: Cloned PVCs are Bound.")

    clone_pod_objs = []

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    for pvc_obj in clone_pvc_objs:
        if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        restore_pod_obj = pod_factory(
            interface=pvc_obj.interface,
            pvc=pvc_obj,
            status="",
            pod_dict_path=pod_dict_path,
            raw_block_pv=pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        clone_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum
    log.info("Verify md5sum")
    for pod_obj in clone_pod_objs:
        file_name_pod = (
            file_name
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM
            else pod_obj.get_storage_path(storage_type="block")
        )
        verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.parent.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches the original md5sum"
        )
    log.info("Data integrity check passed on all pods")

    # Run IO
    log.info("Running IO on new pods")
    for pod_obj in clone_pod_objs:
        storage_type = (
            "block"
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on new pods")
    for pod_obj in clone_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on new pod {pod_obj.name}")
    log.info("IO completed on new pods")
def finalizer():
    # Switching to provider cluster context will be done during the test case.
    # Switch back to consumer cluster context after the test case.
    config.switch_ctx(initial_cluster_index)
def create_cluster_prereq(self, timeout=600):
    """
    Perform all prereqs before vsphere cluster creation from ACM

    Args:
        timeout (int): Timeout for any UI operations

    """
    # Create vsphere credentials
    # Click on 'Add credential' in 'Infrastructure provider' page
    self.navigate_create_clusters_page()
    self.refresh_page()
    hard_timeout = config.ENV_DATA.get("acm_ui_hard_deadline", 1200)
    remaining = hard_timeout
    while True:
        ret = self.check_element_presence(
            (By.XPATH, self.acm_page_nav[PLATFORM_XPATH_MAP[self.platform]][0]),
            timeout=300,
        )
        if ret:
            log.info("Found platform icon")
            break
        else:
            if remaining < 0:
                raise TimeoutException("Timed out while waiting for platform icon")
            else:
                remaining -= timeout
                self.navigate_create_clusters_page()
                self.refresh_page()

    self.do_click(
        locator=self.acm_page_nav[PLATFORM_XPATH_MAP[self.platform]], timeout=100
    )

    # "Basic vsphere credential info"
    # 1. Credential name
    # 2. Namespace
    # 3. Base DNS domain
    self.do_click(locator=self.acm_page_nav["cc_provider_credentials"], timeout=100)
    parent_tab = self.driver.current_window_handle
    tabs = self.driver.window_handles
    self.driver.switch_to.window(tabs[1])
    self.do_click(locator=self.acm_page_nav["cc_provider_creds_vsphere"])

    basic_cred_dict = {
        self.acm_page_nav[
            "cc_provider_creds_vsphere_cred_name"
        ]: self.platform_credential_name,
        self.acm_page_nav[
            "cc_provider_creds_vsphere_base_dns"
        ]: f"{self.cluster_conf.ENV_DATA['base_domain']}",
    }
    self.fill_multiple_textbox(basic_cred_dict)
    # Credential Namespace is not a text box but a dropdown
    self.do_click(self.acm_page_nav["cc_provider_creds_vsphere_cred_namespace"])
    self.do_click(self.acm_page_nav["cc_provider_creds_default_namespace"])

    # Click on 'Next' button at the bottom
    self.click_next_button()

    # Detailed VMware credentials section
    # 1. vCenter server
    # 2. vCenter username
    # 3. vCenter password
    # 4. vCenter root CA certificate
    # 5. vSphere cluster name
    # 6. vSphere datacenter
    # 7. vSphere default datastore
    with open(VSPHERE_CA_FILE_PATH, "r") as fp:
        vsphere_ca = fp.read()
    vsphere_creds_dict = {
        self.acm_page_nav[
            "cc_provider_creds_vsphere_vcenter_server"
        ]: f"{self.cluster_conf.ENV_DATA['vsphere_server']}",
        self.acm_page_nav[
            "cc_provider_creds_vsphere_username"
        ]: f"{self.cluster_conf.ENV_DATA['vsphere_user']}",
        self.acm_page_nav[
            "cc_provider_creds_vsphere_password"
        ]: f"{self.cluster_conf.ENV_DATA['vsphere_password']}",
        self.acm_page_nav["cc_provider_creds_vsphere_rootca"]: f"{vsphere_ca}",
        self.acm_page_nav[
            "cc_provider_creds_vsphere_clustername"
        ]: f"{self.cluster_conf.ENV_DATA['vsphere_cluster']}",
        self.acm_page_nav[
            "cc_provider_creds_vsphere_dc"
        ]: f"{self.cluster_conf.ENV_DATA['vsphere_datacenter']}",
        self.acm_page_nav[
            "cc_provider_creds_vsphere_datastore"
        ]: f"{self.cluster_conf.ENV_DATA['vsphere_datastore']}",
    }
    self.fill_multiple_textbox(vsphere_creds_dict)
    self.click_next_button()

    # Pull secret and SSH
    # 1. Pull secret
    # 2. SSH private key
    # 3. SSH public key
    with open(os.path.join(DATA_DIR, "pull-secret"), "r") as fp:
        pull_secret = fp.read()
    ssh_pub_key_path = os.path.expanduser(self.cluster_conf.DEPLOYMENT["ssh_key"])
    ssh_priv_key_path = os.path.expanduser(
        self.cluster_conf.DEPLOYMENT["ssh_key_private"]
    )
    with open(ssh_pub_key_path, "r") as fp:
        ssh_pub_key = fp.read()
    with open(ssh_priv_key_path, "r") as fp:
        ssh_priv_key = fp.read()
    pull_secret_and_ssh = {
        self.acm_page_nav["cc_provider_creds_vsphere_pullsecret"]: f"{pull_secret}",
        self.acm_page_nav[
            "cc_provider_creds_vsphere_ssh_privkey"
        ]: f"{ssh_priv_key}",
        self.acm_page_nav["cc_provider_creds_vsphere_ssh_pubkey"]: f"{ssh_pub_key}",
    }
    self.fill_multiple_textbox(pull_secret_and_ssh)
    self.click_next_button()
    self.do_click(locator=self.acm_page_nav["cc_provider_creds_vsphere_add_button"])

    # Go to credentials tab
    self.do_click(locator=self.acm_page_nav["Credentials"])
    credential_table_entry = format_locator(
        self.acm_page_nav["cc_table_entry"], self.platform_credential_name
    )
    if not self.check_element_presence(
        (By.XPATH, credential_table_entry[0]), timeout=20
    ):
        raise ACMClusterDeployException("Could not create credentials for vsphere")
    else:
        log.info(
            f"vsphere credential successfully created {self.platform_credential_name}"
        )

    # Get the IPs in prereq itself
    from ocs_ci.deployment import vmware

    # Switch context to the cluster which we are about to create
    prev_ctx = config.cur_index
    config.switch_ctx(self.cluster_conf.MULTICLUSTER["multicluster_index"])
    self.ips = vmware.assign_ips(self.nvips)
    vmware.create_dns_records(self.ips)
    config.switch_ctx(prev_ctx)
    self.driver.close()
    self.driver.switch_to.window(parent_tab)
    self.driver.switch_to.default_content()