def osd_node_reboot():
    """
    Reboot the worker node that is running an OSD pod.

    Picks the first node name from the list of nodes currently hosting
    OSDs, restarts the matching node object via the platform-specific
    nodes helper, waits for the cluster to settle, then verifies the
    ceph-tools pod recovered on another node.

    Raises:
        AssertionError: in case the ceph-tools pod was not recovered
    """
    nodes = PlatformNodesFactory().get_nodes_platform()
    osd_nodes_names = get_osd_running_nodes()
    # Collect the node object(s) whose name matches the first OSD-hosting node
    osd_node_to_reboot = [
        node for node in get_nodes() if get_node_name(node) == osd_nodes_names[0]
    ]
    log.info(f"Rebooting OSD node: {get_node_name(osd_node_to_reboot[0])}")
    nodes.restart_nodes(osd_node_to_reboot)
    # Fixed misleading log: the original said "Sleeping 5 minutes" while the
    # actual sleep is 320 seconds (5 min 20 s). The duration itself is kept
    # unchanged to preserve the test's timing behavior.
    log.info("Sleeping 320 seconds")
    time.sleep(320)
    assert (
        wait_for_ct_pod_recovery()
    ), "Ceph tools pod failed to come up on another node"
def test_worker_node_restart_during_pvc_clone(
    self, nodes, pvc_clone_factory, pod_factory
):
    """
    Verify PVC cloning will succeed if a worker node is restarted
    while cloning is in progress.

    Flow: run fio on the existing pods, record md5sums, then submit a
    worker-node restart and the PVC clone requests concurrently via a
    thread pool. After the node is Ready again, verify the clones are
    Bound, attach them to new pods, check data integrity against the
    recorded md5sums, and run IO on the new pods.

    Args:
        nodes: platform nodes helper used to restart the selected node
        pvc_clone_factory: fixture that creates a clone of a given PVC
        pod_factory: fixture that creates a pod attached to a given PVC
    """
    file_name = "fio_test"
    # +1 worker so the node-restart thread can run alongside one clone
    # request per PVC
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs) + 1)
    selected_node = node.get_nodes(
        node_type=constants.WORKER_MACHINE, num_of_nodes=1
    )

    # Run IO
    log.info("Starting IO on all pods")
    for pod_obj in self.pods:
        # Raw-block PVCs need fio's block mode; filesystem PVCs use "fs"
        storage_type = (
            "block"
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,
        )
        log.info(f"IO started on pod {pod_obj.name}")
    log.info("Started IO on all pods")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
        # Calculate md5sum of the written data; stored on the PVC object
        # so it survives pod respins and can be compared on the clone later
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        pod_obj.pvc.md5sum = pod.cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )

    # Restart node in the background while clones are being created —
    # this is the actual disruption under test
    log.info(f"Restart node {selected_node[0].name}")
    restart_thread = executor.submit(nodes.restart_nodes, nodes=selected_node)

    log.info("Creating clone of all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Creating clone of {pvc_obj.name}")
        # status="" skips the factory's built-in wait; Bound state is
        # verified explicitly below after the node is back
        pvc_obj.clone_proc = executor.submit(
            pvc_clone_factory, pvc_obj=pvc_obj, status=""
        )

    # Check result of 'restart_nodes' (re-raises any exception from the thread)
    restart_thread.result()

    log.info("Verify status of node.")
    node.wait_for_nodes_status(
        node_names=[node.get_node_name(selected_node[0])],
        status=constants.NODE_READY,
        timeout=300,
    )

    # Get cloned PVCs (blocks until each clone request completes)
    cloned_pvcs = [pvc_obj.clone_proc.result() for pvc_obj in self.pvcs]

    log.info("Verifying cloned PVCs are Bound")
    for pvc_obj in cloned_pvcs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=540
        )
        pvc_obj.reload()
    log.info("Verified: Cloned PVCs are Bound")

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    clone_pod_objs = []
    for pvc_obj in cloned_pvcs:
        # Block-mode PVCs need the raw-block pod template
        if pvc_obj.volume_mode == "Block":
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        clone_pod_obj = pod_factory(
            interface=pvc_obj.parent.interface,
            pvc=pvc_obj,
            status="",
            pod_dict_path=pod_dict_path,
            raw_block_pv=pvc_obj.volume_mode == "Block",
        )
        log.info(f"Attaching the PVC {pvc_obj.name} to pod {clone_pod_obj.name}")
        clone_pod_objs.append(clone_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum: data on the clone must match what was written to the
    # parent PVC before the restart
    for pod_obj in clone_pod_objs:
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        pod.verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.parent.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches with the original md5sum"
        )
    log.info("Data integrity check passed on all pods")

    # Run IO on the cloned volumes with a new filename to confirm they
    # are writable after the disruption
    log.info("Starting IO on the new pods")
    for pod_obj in clone_pod_objs:
        storage_type = (
            "block"
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=f"{file_name}_1",
            end_fsync=1,
        )
        log.info(f"IO started on pod {pod_obj.name}")
    log.info("Started IO on the new pods")

    # Wait for IO to finish
    log.info("Wait for IO to finish on the new pods")
    for pod_obj in clone_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
    log.info("IO finished on the new pods")
def test_worker_node_restart_during_pvc_expansion(self, nodes):
    """
    Verify PVC expansion will succeed if a worker node is restarted
    during expansion.

    Flow: submit a worker-node restart and one resize_pvc request per
    PVC concurrently via a thread pool. After the node is Ready again,
    find the respun pods, verify each PVC reports the expanded capacity,
    and run IO to confirm the volumes are usable at the new size.

    Args:
        nodes: platform nodes helper used to restart the selected node
    """
    pvc_size_expanded = 30
    executor = ThreadPoolExecutor(max_workers=len(self.pods))
    selected_node = node.get_nodes(
        node_type=constants.WORKER_MACHINE, num_of_nodes=1
    )

    # Restart node in the background while PVCs are being expanded —
    # this is the actual disruption under test
    log.info(f"Restart node {selected_node[0].name}")
    restart_thread = executor.submit(nodes.restart_nodes, nodes=selected_node)

    log.info("Expanding all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(
            f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G"
        )
        # resize_pvc(new_size, verify) — verify=False here; capacity is
        # checked explicitly below after the node is back
        pvc_obj.expand_proc = executor.submit(
            pvc_obj.resize_pvc, pvc_size_expanded, False
        )

    # Check result of node 'restart_nodes' (re-raises any thread exception)
    restart_thread.result()

    log.info("Verify status of node.")
    node.wait_for_nodes_status(
        node_names=[node.get_node_name(selected_node[0])],
        status=constants.NODE_READY,
        timeout=300,
    )

    # Find respun pods: the restart may have rescheduled app pods, so
    # re-fetch them by their deploymentconfig label
    new_pods_list = []
    wait_to_stabilize = True
    for pod_obj in self.pods:
        new_pods = get_all_pods(
            namespace=pod_obj.namespace,
            selector=[pod_obj.labels.get("deploymentconfig")],
            selector_label="deploymentconfig",
            wait=wait_to_stabilize,
        )
        # Carry the PVC reference over to the freshly-fetched pod objects
        for pod_ob in new_pods:
            pod_ob.pvc = pod_obj.pvc
        new_pods_list.extend(new_pods)
        # The first iteration already gave enough time for pods to respin,
        # so the wait is not needed for further iterations
        wait_to_stabilize = False
    assert len(new_pods_list) == len(
        self.pods
    ), "Couldn't find all pods after node reboot"

    # Verify PVC expansion status
    for pvc_obj in self.pvcs:
        assert (
            pvc_obj.expand_proc.result()
        ), f"Expansion failed for PVC {pvc_obj.name}"
        capacity = pvc_obj.get().get("status").get("capacity").get("storage")
        assert capacity == f"{pvc_size_expanded}Gi", (
            f"Capacity of PVC {pvc_obj.name} is not {pvc_size_expanded}Gi as "
            f"expected, but {capacity}."
        )
    log.info("PVC expansion was successful on all PVCs")

    # Run IO to confirm the expanded volumes are usable
    log.info("Run IO after PVC expansion.")
    for pod_obj in new_pods_list:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.io_proc = executor.submit(
            pod_obj.run_io,
            storage_type=storage_type,
            size="6G",
            runtime=30,
            fio_filename=f"{pod_obj.name}_file",
            end_fsync=1,
        )

    log.info("Wait for IO to complete on all pods")
    for pod_obj in new_pods_list:
        pod_obj.io_proc.result()
        fio_result = pod_obj.get_fio_results()
        # fio JSON output: per-job "error" field is 0 on success
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. "
            f"FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods after PVC expansion.")
def test_worker_node_restart_during_pvc_expansion(self, nodes):
    """
    Verify PVC expansion will succeed if a worker node is restarted
    during expansion.

    Flow: submit a worker-node restart and one resize_pvc request per
    PVC concurrently via a thread pool. After the node is Ready again,
    find the respun pods, verify each PVC reports the expanded capacity,
    verify the new size is visible on the pods' mounts, and run IO to
    confirm the volumes are usable at the new size.

    NOTE(review): a method with this exact name also appears earlier in
    this file's chunked view — presumably they belong to different test
    classes; confirm against the full file.

    Args:
        nodes: platform nodes helper used to restart the selected node
    """
    pvc_size_expanded = 30
    executor = ThreadPoolExecutor(max_workers=len(self.pods))
    selected_node = node.get_nodes(
        node_type=constants.WORKER_MACHINE, num_of_nodes=1
    )

    # Restart node in the background while PVCs are being expanded —
    # this is the actual disruption under test
    log.info(f"Restart node {selected_node[0].name}")
    restart_thread = executor.submit(nodes.restart_nodes, nodes=selected_node)

    log.info("Expanding all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(
            f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G"
        )
        # resize_pvc(new_size, verify) — verify=False here; capacity is
        # checked explicitly below after the node is back
        pvc_obj.expand_proc = executor.submit(
            pvc_obj.resize_pvc, pvc_size_expanded, False
        )

    # Check result of node 'restart_nodes' (re-raises any thread exception)
    restart_thread.result()

    log.info("Verify status of node.")
    node.wait_for_nodes_status(
        node_names=[node.get_node_name(selected_node[0])],
        status=constants.NODE_READY,
        timeout=300,
    )

    # Find respun pods: the restart may have rescheduled app pods, so
    # re-fetch them by their deploymentconfig label
    new_pods_list = []
    wait_to_stabilize = True
    for pod_obj in self.pods:
        new_pods = get_all_pods(
            namespace=pod_obj.namespace,
            selector=[pod_obj.labels.get("deploymentconfig")],
            selector_label="deploymentconfig",
            wait=wait_to_stabilize,
        )
        # Carry the PVC reference over to the freshly-fetched pod objects
        for pod_ob in new_pods:
            pod_ob.pvc = pod_obj.pvc
        new_pods_list.extend(new_pods)
        # The first iteration already gave enough time for pods to respin,
        # so the wait is not needed for further iterations
        wait_to_stabilize = False
    assert len(new_pods_list) == len(
        self.pods
    ), "Couldn't find all pods after node reboot"

    # Verify PVC expansion status; on failure include describe output of
    # the PVC and its backing PV for debugging
    for pvc_obj in self.pvcs:
        assert pvc_obj.expand_proc.result(), (
            f"Expansion failed for PVC {pvc_obj.name}\nDescribe output "
            f"of PVC and PV:\n{pvc_obj.describe()}\n"
            f"{pvc_obj.backed_pv_obj.describe()}"
        )
        capacity = pvc_obj.get().get("status").get("capacity").get("storage")
        assert capacity == f"{pvc_size_expanded}Gi", (
            f"Capacity of PVC {pvc_obj.name} is not {pvc_size_expanded}Gi as "
            f"expected, but {capacity}."
        )
    log.info("PVC expansion was successful on all PVCs")

    log.info("Verifying new size on pods.")
    for pod_obj in new_pods_list:
        # Block-mode volumes have no filesystem mount, so df cannot show them
        if pod_obj.pvc.volume_mode == "Block":
            log.info(
                f"Skipping check on pod {pod_obj.name} as volume mode is Block."
            )
            continue
        # Wait for up to 240 seconds (polling every 3s) for the change to
        # be reflected on the pod's mount
        log.info(f"Checking pod {pod_obj.name} to verify the change.")
        for df_out in TimeoutSampler(
            240, 3, pod_obj.exec_cmd_on_pod, command="df -kh"
        ):
            df_out = df_out.split()
            # In the whitespace-split df output, the Size column is assumed
            # to sit 4 tokens before the mount-point token — TODO confirm
            # against actual `df -kh` column layout
            new_size_mount = df_out[
                df_out.index(pod_obj.get_storage_path()) - 4
            ]
            # Accept the rounding variants df may print for the new size
            if new_size_mount in [
                f"{pvc_size_expanded - 0.1}G",
                f"{float(pvc_size_expanded)}G",
                f"{pvc_size_expanded}G",
            ]:
                log.info(
                    f"Verified: Expanded size of PVC {pod_obj.pvc.name} "
                    f"is reflected on pod {pod_obj.name}"
                )
                break
            log.info(
                f"Expanded size of PVC {pod_obj.pvc.name} is not reflected"
                f" on pod {pod_obj.name}. New size on mount is not "
                f"{pvc_size_expanded}G as expected, but {new_size_mount}. "
                f"Checking again."
            )
    log.info(
        f"Verified: Expanded size {pvc_size_expanded}G is reflected "
        f"on all pods."
    )

    # Run IO to confirm the expanded volumes are usable
    log.info("Run IO after PVC expansion.")
    for pod_obj in new_pods_list:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.io_proc = executor.submit(
            pod_obj.run_io,
            storage_type=storage_type,
            size="6G",
            runtime=30,
            fio_filename=f"{pod_obj.name}_file",
            end_fsync=1,
        )

    log.info("Wait for IO to complete on all pods")
    for pod_obj in new_pods_list:
        pod_obj.io_proc.result()
        fio_result = pod_obj.get_fio_results()
        # fio JSON output: per-job "error" field is 0 on success
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. "
            f"FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods after PVC expansion.")