def delete_and_create_osd_node(osd_node_name):
    """
    Delete an osd node, and create a new one to replace it

    Args:
        osd_node_name (str): The osd node name to delete

    """
    new_node_name = None
    old_osd_ids = node.get_node_osd_ids(osd_node_name)

    old_osd_node_names = node.get_osd_running_nodes()

    # error message for invalid deployment configuration
    msg_invalid = (
        "ocs-ci config 'deployment_type' value "
        f"'{config.ENV_DATA['deployment_type']}' is not valid, "
        f"results of this test run are all invalid."
    )

    if config.ENV_DATA["deployment_type"] == "ipi":
        if is_lso_cluster():
            # TODO: Implement functionality for Internal-Attached devices mode
            # once ocs-ci issue #4545 is resolved
            # https://github.com/red-hat-storage/ocs-ci/issues/4545
            pytest.skip("Functionality not implemented for this deployment mode")
        else:
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)

    elif config.ENV_DATA["deployment_type"] == "upi":
        if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
            new_node_name = node.delete_and_create_osd_node_aws_upi(osd_node_name)
        elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            if is_lso_cluster():
                new_node_name = node.delete_and_create_osd_node_vsphere_upi_lso(
                    osd_node_name, use_existing_node=False
                )
            else:
                new_node_name = node.delete_and_create_osd_node_vsphere_upi(
                    osd_node_name, use_existing_node=False
                )
    else:
        log.error(msg_invalid)
        pytest.fail(msg_invalid)

    log.info("Start node replacement verification steps...")
    check_node_replacement_verification_steps(
        osd_node_name, new_node_name, old_osd_node_names, old_osd_ids
    )
def delete_and_create_osd_node(osd_node_name):
    """
    Delete an osd node, and create a new one to replace it

    Args:
        osd_node_name (str): The osd node name to delete

    """
    new_node_name = None
    osd_pod = node.get_node_pods(osd_node_name, pods_to_search=pod.get_osd_pods())[0]
    old_osd_id = pod.get_osd_pod_id(osd_pod)

    old_osd_node_names = node.get_osd_running_nodes()

    # error message for invalid deployment configuration
    msg_invalid = (
        "ocs-ci config 'deployment_type' value "
        f"'{config.ENV_DATA['deployment_type']}' is not valid, "
        f"results of this test run are all invalid."
    )

    # TODO: refactor this so that AWS is not a "special" platform
    if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
        if config.ENV_DATA["deployment_type"] == "ipi":
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)
        elif config.ENV_DATA["deployment_type"] == "upi":
            new_node_name = node.delete_and_create_osd_node_aws_upi(osd_node_name)
        else:
            log.error(msg_invalid)
            pytest.fail(msg_invalid)
    elif config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
        if config.ENV_DATA["deployment_type"] == "ipi":
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)
        else:
            log.error(msg_invalid)
            pytest.fail(msg_invalid)
    elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
        if is_lso_cluster():
            new_node_name = node.delete_and_create_osd_node_vsphere_upi_lso(
                osd_node_name, use_existing_node=False
            )
        else:
            new_node_name = node.delete_and_create_osd_node_vsphere_upi(
                osd_node_name, use_existing_node=False
            )

    log.info("Start node replacement verification steps...")
    check_node_replacement_verification_steps(
        osd_node_name, new_node_name, old_osd_node_names, old_osd_id
    )
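# Illustrative sketch (not part of the original module): minimal usage of the
# helper above, plus the ocs-ci config keys its branching relies on. Only the
# keys and the "ipi"/"upi" deployment types come from the code above; the
# platform values shown in the comments are placeholders.
def example_replace_first_osd_node():
    # config.ENV_DATA drives the platform/deployment branching in both variants
    # of delete_and_create_osd_node, e.g.:
    #   config.ENV_DATA["platform"] -> "vsphere", "aws", or another cloud platform
    #   config.ENV_DATA["deployment_type"] -> "ipi" or "upi"
    osd_node_names = node.get_osd_running_nodes()
    delete_and_create_osd_node(osd_node_names[0])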
def test_multiple_mon_pod_stays_on_same_node(self):
    """
    A testcase to verify that multiple mon pods stay on the same node

    1. Edit the rook-ceph-mon-endpoints configmap; say, assign mon-a to another node
       so that it lands on the same node as another mon (compute-1 instead of compute-0)
    2. Delete the mon-a deployment
    3. Edit the mon-b deployment to remove the required mon anti-affinity
    4. Restart the operator
    5. Edit the mon-a deployment to remove the required mon anti-affinity
    6. See mon-a start on compute-1 with mon-b
    7. Soon after, see the operator failover one of these mons onto the node that
       doesn't currently have a mon (compute-0) and start mon-d

    """
    ocs_version = config.ENV_DATA["ocs_version"]
    # Check that we have an LSO cluster and the OCS version is 4.8 or below
    # This is a workaround due to issue https://github.com/red-hat-storage/ocs-ci/issues/4937
    if not (
        is_lso_cluster() and Version.coerce(ocs_version) <= Version.coerce("4.8")
    ):
        pytest.skip(
            "Skip the test because mons are not assigned to nodes by Rook when the "
            "cluster is not LSO based. Also, currently we want to run the test only "
            "with OCS 4.8 and below. This is a workaround due to issue "
            "https://github.com/red-hat-storage/ocs-ci/issues/4937"
        )
    # Initialize
    rook_ceph_mon = "rook-ceph-mon"

    # Get the mon pods
    mon_pods = get_mon_pods()
    mon_name_to_del = mon_pods[0].get().get("metadata").get("labels").get("mon")
    mon_name_to_edit = mon_pods[1].get().get("metadata").get("labels").get("mon")
    mon_node = get_pod_node(mon_pods[1])

    # Edit the rook-ceph-mon-endpoints configmap
    log.info(f"Edit the configmap {ROOK_CEPH_MON_ENDPOINTS}")
    configmap_obj = OCP(kind=CONFIGMAP, namespace=OPENSHIFT_STORAGE_NAMESPACE)
    rook_ceph_mon_configmap = configmap_obj.get(resource_name=ROOK_CEPH_MON_ENDPOINTS)
    json_val = json.loads(rook_ceph_mon_configmap["data"]["mapping"])
    json_val["node"][mon_name_to_del].update(json_val["node"][mon_name_to_edit])
    rook_ceph_mon_configmap["data"]["mapping"] = json.dumps(json_val)
    new_data = rook_ceph_mon_configmap["data"]
    params = f'{{"data": {json.dumps(new_data)}}}'
    configmap_obj.patch(
        resource_name=ROOK_CEPH_MON_ENDPOINTS,
        params=params,
        format_type="strategic",
    )
    log.info(f"Configmap {ROOK_CEPH_MON_ENDPOINTS} edited successfully")
    log.info(
        f"Rook-ceph-mon-endpoints updated configmap: {rook_ceph_mon_configmap}"
    )

    # Delete the mon deployment whose configmap entry was edited
    dep_obj = OCP(kind=DEPLOYMENT, namespace=OPENSHIFT_STORAGE_NAMESPACE)
    mon_deployment_name_to_del = f"{rook_ceph_mon}-{mon_name_to_del}"
    log.info(f"Deleting mon {mon_deployment_name_to_del} deployment")
    dep_obj.delete(resource_name=mon_deployment_name_to_del)

    # Edit the other mon deployment to remove mon anti-affinity
    mon_deployment_name_to_edit = f"{rook_ceph_mon}-{mon_name_to_edit}"
    log.info(
        f"Edit mon {mon_deployment_name_to_edit} deployment "
        "to remove the required mon anti-affinity"
    )
    params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
    dep_obj.patch(
        resource_name=mon_deployment_name_to_edit, params=params, format_type="json"
    )
    log.info(
        f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_edit}"
    )

    # Restart the operator
    operator_pod_obj = get_operator_pods()
    delete_pods(pod_objs=operator_pod_obj)
    POD_OBJ.wait_for_resource(condition=STATUS_RUNNING, selector=OPERATOR_LABEL)

    # Validate that the deleted mon deployment came up again and is in Pending state.
    # Initially the mon is stuck in Pending until the defined anti-affinity is removed
    POD_OBJ.wait_for_resource(
        condition=STATUS_PENDING,
        resource_count=1,
        selector=MON_APP_LABEL,
        timeout=1200,
    )

    # Edit the mon deployment to remove mon anti-affinity
    log.info(
        f"Edit mon {mon_deployment_name_to_del} deployment "
        "to remove the required mon anti-affinity"
    )
    params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
    dep_obj.patch(
        resource_name=mon_deployment_name_to_del, params=params, format_type="json"
    )
    log.info(
        f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_del}"
    )

    # Validate that the mon pod moved to another node such that 2 mons are running on the same node
    log.info("Waiting for 5 seconds for mon recovery")
    time.sleep(5)
    new_mon_pods = get_mon_pods()
    new_node = [
        get_pod_node(mon)
        for mon in new_mon_pods
        if mon.get().get("metadata").get("labels").get("mon") == mon_name_to_del
    ]
    assert (
        new_node[0].name == mon_node.name
    ), f"Mon {mon_name_to_del} did not move to node {mon_node.name}, so 2 mons are not running on the same node"

    # Verify rook deletes one of the mons and moves it to another node
    timeout = 60
    log.info(f"Waiting for {timeout} seconds for mon recovery")
    time.sleep(timeout)

    POD_OBJ.wait_for_resource(
        condition=STATUS_RUNNING,
        resource_count=len(mon_pods),
        selector=MON_APP_LABEL,
        timeout=3600,
        sleep=5,
    )
    log.info(
        "Mons are up and in Running state; validate that they are running on different nodes"
    )
    mon_pods_running_on_same_node()
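# Worked example (sketch, not part of the original test) of the configmap edit
# performed above, assuming mon "a" is the mon being deleted and mon "b" the mon
# whose node assignment it inherits. The inner Name/Hostname/Address fields are
# illustrative placeholders; only the {"node": {<mon>: ...}} shape and the
# .update() call are taken from the test code. Assumes the module-level `json`
# import already used by the test.
def example_reassign_mon_mapping():
    mapping = {
        "node": {
            "a": {"Name": "compute-0", "Hostname": "compute-0", "Address": "10.0.0.1"},
            "b": {"Name": "compute-1", "Hostname": "compute-1", "Address": "10.0.0.2"},
        }
    }
    # Same edit as the test: mon "a" takes over mon "b"'s node entry, so after
    # the patch both mons are mapped to compute-1.
    mapping["node"]["a"].update(mapping["node"]["b"])
    # In the real configmap this dict is stored as a JSON string under
    # data["mapping"], which is what the strategic-merge patch above updates.
    return json.dumps(mapping)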
def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
    """
    1. Delete one mon service
    2. Edit the configmap rook-ceph-mon-endpoints to remove all entries of the deleted mon service
    3. Delete the deployment and PVC of the deleted mon service
    4. Restart rook-ceph-operator
    5. Make sure all mon pods are running
    6. Make sure ceph health is OK and storage pods are running
    7. Sleep for 300 seconds before deleting another mon
    8. Repeat the above steps for all mons; at the end each mon should have a different endpoint
    9. Create a PVC; it should succeed.

    """
    pod_obj = pod_factory(interface=interface)
    run_io_in_bg(pod_obj)

    # Get all mon services
    mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )

    # Get all mon pods
    mon_pods = get_mon_pods()
    mon_count = len(mon_pods)

    list_old_svc = []
    for svc in mon_svc:

        # Get the rook-ceph-operator pod obj
        operator_pod_obj = get_operator_pods()
        operator_name = operator_pod_obj[0].name

        # Scale down rook-ceph-operator
        log.info("Scale down rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=0
        ), "Failed to scale down rook-ceph-operator to 0"
        log.info("Successfully scaled down rook-ceph-operator to 0")

        # Validate the rook-ceph-operator pod is not running
        POD_OBJ.wait_for_delete(resource_name=operator_name)

        svc_name = svc["metadata"]["name"]
        cluster_ip = svc["spec"]["clusterIP"]
        port = svc["spec"]["ports"][0]["port"]
        mon_endpoint = f"{cluster_ip}:{port}"
        mon_id = svc["spec"]["selector"]["mon"]
        list_old_svc.append(cluster_ip)

        # Delete the mon deployment
        log.info("Delete mon deployment")
        del_obj = OCP(
            kind=constants.DEPLOYMENT,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_info = del_obj.get(resource_name=svc_name)
        del_obj.delete(resource_name=svc_name)

        # Delete the mon PVC (or the mon data directory on LSO)
        if is_lso_cluster():
            mon_data_path = f"/var/lib/rook/mon-{mon_id}"
            mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                "kubernetes.io/hostname"
            ]
            log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
            cmd = f"rm -rf {mon_data_path}"
            ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
        else:
            log.info("Delete mon PVC")
            pvc_name = svc["metadata"]["labels"]["pvc_name"]
            pvc_obj = OCP(
                kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            pvc_obj.delete(resource_name=pvc_name)

        # Delete the mon service
        log.info("Delete mon service")
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        svc_obj.delete(resource_name=svc_name)

        # Edit the configmap
        log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        output_get = configmap_obj.get(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
        )
        new_data = output_get["data"]
        new_data["csi-cluster-config-json"] = (
            new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
            if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != -1
            else new_data["csi-cluster-config-json"].replace(
                f',"{mon_endpoint}"', ""
            )
        )
        new_data["data"] = ",".join(
            [
                value
                for value in new_data["data"].split(",")
                if f"{mon_id}=" not in value
            ]
        )
        new_data["mapping"] = (
            new_data["mapping"].replace(f'"{mon_id}":null,', "")
            if new_data["mapping"].find(f'"{mon_id}":null,') != -1
            else new_data["mapping"].replace(f',"{mon_id}":null', "")
        )
        params = f'{{"data": {json.dumps(new_data)}}}'
        log.info(f"Removing {mon_id} entries from configmap")
        configmap_obj.patch(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(
            f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
        )

        # Scale up rook-ceph-operator
        log.info("Scale up rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"
        log.info("Successfully scaled up rook-ceph-operator to 1")
        log.info("Validate rook-ceph-operator pod is running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.OPERATOR_LABEL,
            resource_count=1,
            timeout=600,
            sleep=5,
        )

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=mon_count,
            timeout=1200,
            sleep=5,
        )
        log.info("All mons are up and running")

        # Check that ceph health is OK
        ceph_health_check(tries=90, delay=15)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Sleep for some seconds before deleting another mon
        sleep_time = 300
        log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
        time.sleep(sleep_time)

    # Check that the endpoints have changed
    log.info("Validate the mon endpoints are changed")
    new_mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    list_new_svc = []
    for new_svc in new_mon_svc:
        cluster_ip = new_svc["spec"]["clusterIP"]
        list_new_svc.append(cluster_ip)
    diff = set(list_new_svc) ^ set(list_old_svc)
    assert len(diff) == len(list_old_svc + list_new_svc), (
        f"Not all endpoints are changed. Set of old "
        f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
    )
    log.info(f"All new mon endpoints are created {list_new_svc}")

    # Create a PVC and pod
    log.info(f"Create {interface} PVC")
    pod_obj = pod_factory(interface=interface)
    pod_obj.run_io(storage_type="fs", size="500M")
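# Illustrative payload (sketch, not from a real cluster) for the three
# rook-ceph-mon-endpoints fields the loop above rewrites. Only the substrings
# the test searches for ("<mon_id>=", '"<mon_id>":null', '"<endpoint>",') are
# taken from the code; the exact surrounding layout is an assumption based on
# Rook's usual format.
EXAMPLE_MON_ENDPOINTS_DATA = {
    "data": "a=172.30.0.10:6789,b=172.30.0.11:6789,c=172.30.0.12:6789",
    "mapping": '{"node":{"a":null,"b":null,"c":null}}',
    "csi-cluster-config-json": (
        '[{"clusterID":"openshift-storage",'
        '"monitors":["172.30.0.10:6789","172.30.0.11:6789","172.30.0.12:6789"]}]'
    ),
}
# Deleting mon "a" removes "a=172.30.0.10:6789" from "data", '"a":null' from
# "mapping" and '"172.30.0.10:6789"' from "csi-cluster-config-json" before the
# operator is scaled back up.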
def osd_device_replacement(nodes):
    """
    Replace a randomly picked OSD device

    Args:
        nodes (OCS): The OCS object representing the nodes

    """
    logger.info("Picking a PV which will be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name
    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod
        for osd_pod in osd_pods
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )
        == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = get_osd_pod_id(osd_pod)

    # Get the node that the OSD pod is running on
    logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on")
    osd_node = get_pod_node(osd_pod)
    ocp_version = get_ocp_version()
    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod
            for pod in osd_prepare_pods
            if pod.get().get("metadata").get("labels").get(
                constants.CEPH_ROOK_IO_PVC_LABEL
            )
            == claim_name
        ][0]
        osd_prepare_job_name = (
            osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
        )
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        osd_pod
        for osd_pod in get_osd_deployments()
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )
        == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down the OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete the OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run the ocs-osd-removal job
    osd_removal_job = run_osd_removal_job([osd_id])
    assert osd_removal_job, "ocs-osd-removal failed to create"
    is_completed = verify_osd_removal_job_completed_successfully(osd_id)
    assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
    logger.info("ocs-osd-removal-job completed successfully")

    osd_pvc_name = osd_pvc.name

    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )
    else:
        # On OCP 4.6 and above the OSD removal job should delete the OSD prepare
        # job, OSD PVC and OSD deployment.
        # We just need to verify the old PV is in the expected status
        logger.info(
            f"Verify that the old PV '{osd_pv_name}' is in the expected status"
        )
        if cluster.is_lso_cluster():
            expected_old_pv_statuses = [constants.STATUS_RELEASED]
        else:
            expected_old_pv_statuses = [
                constants.STATUS_RELEASED,
                constants.STATUS_FAILED,
            ]

        assert (
            osd_pv.ocp.get_resource_status(osd_pv_name) in expected_old_pv_statuses
        ), (
            f"The old PV '{osd_pv_name}' is not in "
            f"the expected statuses: {expected_old_pv_statuses}"
        )

    # Delete the PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    # If we use LSO, we need to create and attach a new disk manually
    if cluster.is_lso_cluster():
        node.add_disk_to_node(osd_node)

    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        # Delete the rook-ceph-operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(f"Deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    is_deleted = delete_osd_removal_job(osd_id)
    assert is_deleted, "Failed to delete ocs-osd-removal-job"
    logger.info("ocs-osd-removal-job deleted successfully")

    timeout = 600
    # Wait for the OSD PVC to get created and reach Bound state
    logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )
    # Wait for the OSD pod to get created and reach Running state
    logger.info("Waiting for a new OSD pod to get created and reach Running state")
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # We need to silence the old OSD crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if Version.coerce(ocp_version) >= Version.coerce("4.6"):
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name
        )
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")
    sanity_helpers = Sanity()
    sanity_helpers.health_check(tries=120)
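# The "wait, then force-delete and wait again" fallback used above for both the
# OSD pod and the old PV, shown in isolation as a sketch. `resource` stands for
# any ocs-ci resource object exposing .name, .delete() and .ocp.wait_for_delete();
# the helper name is hypothetical (note the PV path above uses a plain .delete()
# rather than force=True).
def wait_for_delete_or_force(resource):
    try:
        resource.ocp.wait_for_delete(resource_name=resource.name)
    except TimeoutError:
        resource.delete(force=True)
        resource.ocp.wait_for_delete(resource_name=resource.name)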
def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
    """
    Test cluster recovery from disk deletion from the platform side.
    Based on the documented procedure detailed in
    https://bugzilla.redhat.com/show_bug.cgi?id=1823183

    """
    logger.info("Picking a PV which will be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name
    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod
        for osd_pod in osd_pods
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )
        == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

    # Get the node that the OSD pod is running on
    logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on")
    osd_node = get_pod_node(osd_pod)
    osd_prepare_pods = get_osd_prepare_pods()
    osd_prepare_pod = [
        pod
        for pod in osd_prepare_pods
        if pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )
        == claim_name
    ][0]
    osd_prepare_job_name = (
        osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
    )
    osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        osd_pod
        for osd_pod in get_osd_deployments()
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )
        == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down the OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp.OCP().exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete the OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run the ocs-osd-removal job
    # Note: comparing versions via Version.coerce instead of float() so that
    # e.g. 4.10 is not treated as older than 4.6
    ocp_version = get_ocp_version()
    if Version.coerce(ocp_version) >= Version.coerce("4.6"):
        cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
    else:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"

    logger.info(f"Executing OSD removal job on OSD-{osd_id}")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
    osd_removal_job = OCS(**osd_removal_job_yaml)
    osd_removal_job.create(do_reload=False)

    # Get the ocs-osd-removal pod name
    logger.info("Getting the ocs-osd-removal pod name")
    osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
    osd_removal_pod_obj = get_pod_obj(
        osd_removal_pod_name, namespace="openshift-storage"
    )
    osd_removal_pod_obj.ocp.wait_for_resource(
        condition=constants.STATUS_COMPLETED, resource_name=osd_removal_pod_name
    )

    # Verify OSD removal from the ocs-osd-removal pod logs
    logger.info(f"Verifying removal of OSD from {osd_removal_pod_name} pod logs")
    logs = get_pod_logs(osd_removal_pod_name)
    pattern = f"purged osd.{osd_id}"
    assert re.search(pattern, logs)

    osd_pvc_name = osd_pvc.name

    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )
    else:
        # On OCP 4.6 and above the OSD removal job should delete the OSD prepare
        # job, OSD PVC and OSD deployment
        logger.info(f"Verifying deletion of OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=30
        )
        logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
        logger.info(f"Verifying deletion of OSD deployment {osd_deployment_name}")
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=30
        )

    # Delete the PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    # If we use LSO, we need to create and attach a new disk manually
    if cluster.is_lso_cluster():
        osd_size = get_osd_size()
        logger.info(f"Create a new disk with size {osd_size}")
        nodes.create_and_attach_volume(node=osd_node, size=osd_size)

    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        # Delete the rook-ceph-operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(f"Deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
    osd_removal_job.delete()
    osd_removal_job.ocp.wait_for_delete(resource_name=f"ocs-osd-removal-{osd_id}")

    timeout = 600
    # Wait for the OSD PVC to get created and reach Bound state
    logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )
    # Wait for the OSD pod to get created and reach Running state
    logger.info("Waiting for a new OSD pod to get created and reach Running state")
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # We need to silence the old OSD crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if Version.coerce(ocp_version) >= Version.coerce("4.6"):
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name
        )
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")

    # Validate the cluster is still functional
    self.sanity_helpers.health_check(tries=100)
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
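# Helper sketch (hypothetical name) extracted from the flow above: the
# ocs-osd-removal template parameter changed to FAILED_OSD_IDS for OCP 4.6 and
# above, which is the only difference between the two commands built in the
# test. Assumes the Version import already used elsewhere in this module.
def build_osd_removal_cmd(osd_id, ocp_version):
    if Version.coerce(ocp_version) >= Version.coerce("4.6"):
        return f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
    return f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"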