    def test_dynamic_provisioning_glusterfile_glusterpod_failure(self):
        """Create glusterfile PVC when gluster pod is down."""

        # Check that we work with containerized Gluster
        if not self.is_containerized_gluster():
            self.skipTest("Only containerized Gluster clusters are supported.")

        mount_path = "/mnt"
        datafile_path = '%s/fake_file_for_%s' % (mount_path, self.id())

        # Create secret and storage class
        self.create_storage_class()

        # Create PVC
        pvc_name = self.create_and_wait_for_pvc()

        # Create app POD with attached volume
        pod_name = oc_create_tiny_pod_with_volume(
            self.node, pvc_name, "test-pvc-mount-on-app-pod",
            mount_path=mount_path)
        self.addCleanup(wait_for_resource_absence, self.node, 'pod', pod_name)
        self.addCleanup(oc_delete, self.node, 'pod', pod_name)

        # Wait for app POD to be up and running
        wait_for_pod_be_ready(self.node, pod_name, timeout=60, wait_step=2)

        # Run IO in background
        io_cmd = "oc rsh %s dd if=/dev/urandom of=%s bs=1000K count=900" % (
            pod_name, datafile_path)
        async_io = g.run_async(self.node, io_cmd, "root")

        # Pick up one of the hosts which stores PV brick (4+ nodes case)
        gluster_pod_data = get_gluster_pod_names_by_pvc_name(
            self.node, pvc_name)[0]

        # Delete glusterfs POD from chosen host and wait for spawn of new one
        oc_delete(self.node, 'pod', gluster_pod_data["pod_name"])
        cmd = ("oc get pods -o wide | grep glusterfs | grep %s | "
               "grep -v Terminating | awk '{print $1}'") % (
                   gluster_pod_data["host_name"])
        for w in Waiter(600, 15):
            out = self.cmd_run(cmd)
            new_gluster_pod_name = out.strip().split("\n")[0].strip()
            if new_gluster_pod_name:
                break
        if w.expired:
            error_msg = "exceeded timeout, new gluster pod not created"
            g.log.error(error_msg)
            raise ExecutionError(error_msg)
        g.log.info("new gluster pod name is %s" % new_gluster_pod_name)
        wait_for_pod_be_ready(self.node, new_gluster_pod_name)

        # Check that async IO was not interrupted
        ret, out, err = async_io.async_communicate()
        self.assertEqual(ret, 0, "IO %s failed on %s" % (io_cmd, self.node))
def wait_for_claim(ocp_node, pvc_name, timeout=60, interval=2):
    """Wait for a claim to be created & bound up to the given timeout."""
    for w in Waiter(timeout, interval):
        sts = oc_get_pvc(ocp_node, pvc_name)
        if sts and sts.get('status', {}).get('phase') == 'Bound':
            return sts
    raise AssertionError('wait_for_claim on pvc %s timed out' % (pvc_name,))
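# Illustrative usage sketch for wait_for_claim() above, not taken from the
# test suite itself. It assumes `ocp_node` is a reachable OCP client node and
# that `oc_create_pvc` (used elsewhere in these tests) is in scope; the
# storage class name 'example-sc' is a hypothetical placeholder.
def _example_wait_for_claim_usage(ocp_node):
    # Create a 1 GiB claim against a pre-existing storage class ...
    pvc_name = oc_create_pvc(ocp_node, 'example-sc', pvc_size=1)
    # ... then block until Kubernetes reports it as 'Bound' (or time out).
    pvc = wait_for_claim(ocp_node, pvc_name, timeout=120, interval=2)
    return pvc['spec']['volumeName']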
def wait_for_sc_unused(ocp_node, sc_name, timeout=60, interval=1):
    """Wait until no PV in the cluster references the given storage class."""
    for w in Waiter(timeout, interval):
        sts = oc_get_all_pvs(ocp_node)
        items = (sts and sts.get('items')) or []
        if not any(i.get('spec', {}).get('storageClassName') == sc_name
                   for i in items):
            return
    raise AssertionError('wait_for_sc_unused on %s timed out' % (sc_name,))
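# Illustrative cleanup sketch, not part of the original tests: after deleting
# the PVCs backed by a storage class, wait_for_sc_unused() above can confirm
# that no PV still references it before the class itself is removed.
# `ocp_node`, `sc_name` and the `oc_delete` helper are assumed to be in scope.
def _example_delete_sc_when_unused(ocp_node, sc_name):
    # Ensure every PV bound to this storage class has been reclaimed first.
    wait_for_sc_unused(ocp_node, sc_name, timeout=120, interval=1)
    # Only then remove the storage class object itself.
    oc_delete(ocp_node, 'sc', sc_name)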
def wait_to_settle(self, timeout=120, interval=1):
    # This was originally going to be a tearDown, but oddly enough
    # tearDown is called *before* the cleanup functions, so it
    # could never succeed. This needs to be added as a cleanup
    # function first so that we run after our test's other cleanup
    # functions but before we go on to the next test in order
    # to prevent the async cleanups in kubernetes from stepping
    # on the next test's "toes".
    for w in Waiter(timeout):
        nvols = self._count_vols()
        if nvols == self.volcount:
            return
    raise AssertionError('wait for volume count to settle timed out')
    def _node_reboot(self):
        storage_hostname = (g.config["gluster_servers"]
                            [self.gluster_servers[0]]["storage"])

        cmd = "sleep 3; /sbin/shutdown -r now 'Reboot triggered by Glusto'"
        ret, out, err = g.run(storage_hostname, cmd)

        self.addCleanup(self._wait_for_gluster_pod_to_be_ready)

        if ret != 255:
            err_msg = "failed to reboot host %s error: %s" % (
                storage_hostname, err)
            g.log.error(err_msg)
            raise AssertionError(err_msg)

        try:
            g.ssh_close_connection(storage_hostname)
        except Exception as e:
            g.log.error("failed to close connection with host %s"
                        " with error: %s" % (storage_hostname, e))
            raise

        # added sleep as node will restart after 3 sec
        time.sleep(3)

        for w in Waiter(timeout=600, interval=10):
            try:
                if g.rpyc_get_connection(storage_hostname, user="******"):
                    g.rpyc_close_connection(storage_hostname, user="******")
                    break
            except Exception as err:
                g.log.info("exception while getting connection: '%s'" % err)

        if w.expired:
            error_msg = ("exceeded timeout 600 sec, node '%s' is "
                         "not reachable" % storage_hostname)
            g.log.error(error_msg)
            raise ExecutionError(error_msg)

        # wait for the gluster pod to be in 'Running' state
        self._wait_for_gluster_pod_to_be_ready()

        # glusterd and gluster-blockd service should be up and running
        service_names = ("glusterd", "gluster-blockd", "tcmu-runner")
        for gluster_pod in self.gluster_pod_list:
            for service in service_names:
                g.log.info("gluster_pod - '%s' : gluster_service '%s'" % (
                    gluster_pod, service))
                check_service_status_on_pod(
                    self.oc_node, gluster_pod, service, "running")
        def background_ops():
            subname = make_unique_label(short_tc_name)
            for i, w in enumerate(Waiter(60 * 60)):
                time.sleep(random.randint(1, 10) * 0.1)
                c = ClaimInfo(name='{}-{}'.format(subname, i),
                              storageclass=tname, size=2)
                c.create_pvc(ocp_node)
                time.sleep(1)
                c.update_pvc_info(ocp_node, timeout=300)
                c.update_pv_info(ocp_node)
                time.sleep(random.randint(1, 10) * 0.1)
                c.delete_pvc(ocp_node)
                if done.is_set():
                    break
    def verify_all_paths_are_up_in_multipath(
            self, mpath_name, hacount, node, timeout=30, interval=5):
        for w in Waiter(timeout, interval):
            out = command.cmd_run('multipath -ll %s' % mpath_name, node)
            count = 0
            for line in out.split('\n'):
                if 'active ready running' in line:
                    count += 1
            if hacount == count:
                break
        msg = "Paths are not up equal to hacount %s in mpath %s on Node %s" % (
            hacount, out, node)
        self.assertEqual(hacount, count, msg)
        for state in ['failed', 'faulty', 'undef']:
            msg = "All paths are not up in mpath %s on Node %s" % (out, node)
            self.assertNotIn(state, out, msg)
    def _wait_for_gluster_pod_to_be_ready(self):
        for gluster_pod in self.gluster_pod_list:
            for w in Waiter(timeout=600, interval=10):
                try:
                    success = wait_for_pod_be_ready(
                        self.oc_node, gluster_pod, timeout=1, wait_step=1)
                    if success:
                        break
                except ExecutionError as e:
                    g.log.info("exception %s while validating gluster "
                               "pod %s" % (e, gluster_pod))

            if w.expired:
                error_msg = ("exceeded timeout 600 sec, pod '%s' is "
                             "not in 'running' state" % gluster_pod)
                g.log.error(error_msg)
                raise ExecutionError(error_msg)
    def create_heketi_volume_with_name_and_wait(
            self, name, size, raise_on_cleanup_error=True,
            timeout=600, wait_step=10, **kwargs):
        json = kwargs.get("json", False)

        try:
            h_volume_info = heketi_volume_create(
                self.heketi_client_node, self.heketi_server_url,
                size, name=name, **kwargs)
        except Exception as e:
            if ('more required' in six.text_type(e)
                    or ('Failed to allocate new volume' in six.text_type(e))):
                raise

            for w in Waiter(timeout, wait_step):
                h_volumes = heketi_volume_list(
                    self.heketi_client_node, self.heketi_server_url)
                h_volume_match = re.search(
                    HEKETI_VOLUME_REGEX % name, h_volumes)
                if h_volume_match:
                    h_volume_info = heketi_volume_info(
                        self.heketi_client_node, self.heketi_server_url,
                        h_volume_match.group(1), json=json)
                    break

            if w.expired:
                g.log.info(
                    "Heketi volume with name %s not created in %s sec"
                    % (name, timeout))
                raise

        self.addCleanup(
            heketi_volume_delete, self.heketi_client_node,
            self.heketi_server_url, h_volume_info["id"],
            raise_on_error=raise_on_cleanup_error)

        return h_volume_info
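# Illustrative call pattern for the helper above (a sketch, not from the
# suite): `test` is assumed to be an instance of the test base class that
# defines create_heketi_volume_with_name_and_wait(), and the volume name is a
# hypothetical placeholder. Deletion is handled by the helper's addCleanup().
def _example_create_named_heketi_volume(test):
    h_volume = test.create_heketi_volume_with_name_and_wait(
        "autotests-named-volume", size=1, json=True)
    # The returned dict carries at least the Heketi volume id.
    return h_volume["id"]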
    def wait_for_hostname(self, vm_name, timeout=600, interval=10):
        """Wait for hostname to get assigned to a VM.

        Args:
            vm_name (str): Name of the VM.
        Returns:
            str: hostname of the VM.
        Raises:
            CloudProviderError: In case of any failures.
        """
        for w in Waiter(timeout, interval):
            vmlist = (
                self.vsphere_client.content.viewManager.CreateContainerView(
                    self.vsphere_client.content.rootFolder,
                    [vim.VirtualMachine], True))
            vm = [vm for vm in vmlist.view if vm.name == vm_name]
            hostname = vm[0].summary.guest.hostName
            if hostname:
                return hostname
        msg = 'VM %s did not get assigned a hostname' % vm_name
        g.log.error(msg)
        raise exceptions.CloudProviderError(msg)
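# Illustrative sketch, not from the original module: `provider` is assumed to
# be an instance of the vSphere cloud-provider class that defines
# wait_for_hostname() above, and the VM name is a hypothetical placeholder.
def _example_wait_for_vm_hostname(provider, vm_name="autotests-vm-01"):
    # Wait up to 10 minutes for vSphere guest tools to report a hostname.
    hostname = provider.wait_for_hostname(vm_name, timeout=600, interval=10)
    g.log.info("VM %s got hostname %s" % (vm_name, hostname))
    return hostname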
    def test_heketi_server_operations_cleanup_on_idle_setup(self):
        """Run heketi db clean up on an idle setup"""
        h_node, h_url = self.heketi_client_node, self.heketi_server_url
        err_msg = "There should not be any pending operations list {}"

        # Verify the server operations
        for waiter_add in Waiter(300, 20):
            initial_ops = heketi_ops.heketi_server_operations_list(
                h_node, h_url)
            if not initial_ops:
                break
        if waiter_add.expired and initial_ops:
            self.skipTest(err_msg.format(initial_ops))

        # Run cleanup
        cleanup = heketi_ops.heketi_server_operation_cleanup(h_node, h_url)
        self.assertFalse(
            cleanup, "Cleanup command failed with message {}".format(cleanup))

        # Verify the server operations
        final_ops = heketi_ops.heketi_server_operations_list(h_node, h_url)
        self.assertFalse(final_ops, err_msg.format(final_ops))
    def test_dynamic_provisioning_glusterfile_gluster_pod_or_node_failure(
            self):
        """Create glusterfile PVC when gluster pod or node is down."""
        mount_path = "/mnt"
        datafile_path = '%s/fake_file_for_%s' % (mount_path, self.id())

        # Create secret and storage class
        self.create_storage_class()

        # Create PVC
        pvc_name = self.create_and_wait_for_pvc()

        # Create app POD with attached volume
        pod_name = oc_create_tiny_pod_with_volume(
            self.node, pvc_name, "test-pvc-mount-on-app-pod",
            mount_path=mount_path, image=self.io_container_image_cirros)
        self.addCleanup(wait_for_resource_absence, self.node, 'pod', pod_name)
        self.addCleanup(oc_delete, self.node, 'pod', pod_name)

        # Wait for app POD to be up and running
        wait_for_pod_be_ready(self.node, pod_name, timeout=60, wait_step=2)

        # Run IO in background
        io_cmd = "oc rsh %s dd if=/dev/urandom of=%s bs=1000K count=900" % (
            pod_name, datafile_path)
        async_io = g.run_async(self.node, io_cmd, "root")

        # Check for containerized Gluster
        if self.is_containerized_gluster():
            # Pick up one of the hosts which stores PV brick (4+ nodes case)
            gluster_pod_data = get_gluster_pod_names_by_pvc_name(
                self.node, pvc_name)[0]

            # Delete glusterfs POD from chosen host and wait for
            # spawn of new one
            oc_delete(self.node, 'pod', gluster_pod_data["pod_name"])
            cmd = ("oc get pods -o wide | grep glusterfs | grep %s | "
                   "grep -v Terminating | awk '{print $1}'") % (
                       gluster_pod_data["pod_hostname"])
            for w in Waiter(600, 15):
                new_gluster_pod_name = self.cmd_run(cmd)
                if new_gluster_pod_name:
                    break
            if w.expired:
                error_msg = "exceeded timeout, new gluster pod not created"
                g.log.error(error_msg)
                raise AssertionError(error_msg)

            g.log.info("new gluster pod name is %s" % new_gluster_pod_name)
            wait_for_pod_be_ready(self.node, new_gluster_pod_name)
        else:
            pvc_hosting_node_ip = get_gluster_host_ips_by_pvc_name(
                self.node, pvc_name)[0]
            heketi_nodes = heketi_node_list(
                self.heketi_client_node, self.heketi_server_url)
            node_ip_for_reboot = None
            for heketi_node in heketi_nodes:
                heketi_node_ip = heketi_node_info(
                    self.heketi_client_node, self.heketi_server_url,
                    heketi_node, json=True)["hostnames"]["storage"][0]
                if heketi_node_ip == pvc_hosting_node_ip:
                    node_ip_for_reboot = heketi_node_ip
                    break

            if not node_ip_for_reboot:
                raise AssertionError(
                    "Gluster node IP %s not matched with heketi node %s" % (
                        pvc_hosting_node_ip, heketi_node_ip))

            node_reboot_by_command(node_ip_for_reboot)

        # Check that async IO was not interrupted
        ret, out, err = async_io.async_communicate()
        self.assertEqual(ret, 0, "IO %s failed on %s" % (io_cmd, self.node))
    def test_targetcli_failure_during_block_pvc_creation(self):
        h_node, h_server = self.heketi_client_node, self.heketi_server_url

        # Disable redundant nodes and leave just 3 nodes online
        h_node_id_list = heketi_node_list(h_node, h_server)
        self.assertGreater(len(h_node_id_list), 2)
        for node_id in h_node_id_list[3:]:
            heketi_node_disable(h_node, h_server, node_id)
            self.addCleanup(heketi_node_enable, h_node, h_server, node_id)

        # Gather info about the Gluster node we are going to use for killing
        # targetcli processes.
        chosen_g_node_id = h_node_id_list[0]
        chosen_g_node_info = heketi_node_info(
            h_node, h_server, chosen_g_node_id, json=True)
        chosen_g_node_ip = chosen_g_node_info['hostnames']['storage'][0]
        chosen_g_node_hostname = chosen_g_node_info['hostnames']['manage'][0]
        chosen_g_node_ip_and_hostname = set(
            (chosen_g_node_ip, chosen_g_node_hostname))

        g_pods = oc_get_custom_resource(
            self.node, 'pod', [':.metadata.name', ':.status.hostIP',
                               ':.status.podIP', ':.spec.nodeName'],
            selector='glusterfs-node=pod')
        if g_pods and g_pods[0]:
            for g_pod in g_pods:
                if chosen_g_node_ip_and_hostname.intersection(set(g_pod[1:])):
                    host_to_run_cmds = self.node
                    g_pod_prefix, g_pod = 'oc exec %s -- ' % g_pod[0], g_pod[0]
                    break
            else:
                err_msg = (
                    'Failed to find Gluster pod filtering it by following IPs '
                    'and hostnames: %s\nFound following Gluster pods: %s') % (
                        chosen_g_node_ip_and_hostname, g_pods)
                g.log.error(err_msg)
                raise AssertionError(err_msg)
        else:
            host_to_run_cmds, g_pod_prefix, g_pod = chosen_g_node_ip, '', ''

        # Schedule deletion of targetcli process
        file_for_bkp, pvc_number = "~/.targetcli/prefs.bin", 10
        self.cmd_run(
            "%scp %s %s_backup" % (g_pod_prefix, file_for_bkp, file_for_bkp),
            hostname=host_to_run_cmds)
        self.addCleanup(
            self.cmd_run,
            "%srm -f %s_backup" % (g_pod_prefix, file_for_bkp),
            hostname=host_to_run_cmds)
        kill_targetcli_services_cmd = (
            "while true; do "
            "  %spkill targetcli || echo 'failed to kill targetcli process'; "
            "done" % g_pod_prefix)
        loop_for_killing_targetcli_process = g.run_async(
            host_to_run_cmds, kill_targetcli_services_cmd, "root")
        try:
            # Create bunch of PVCs
            sc_name, pvc_names = self.create_storage_class(), []
            for i in range(pvc_number):
                pvc_names.append(oc_create_pvc(self.node, sc_name, pvc_size=1))
            self.addCleanup(
                wait_for_resources_absence, self.node, 'pvc', pvc_names)
            self.addCleanup(oc_delete, self.node, 'pvc', ' '.join(pvc_names))

            # Check that we get expected number of provisioning errors
            timeout, wait_step, succeeded_pvcs, failed_pvcs = 120, 1, [], []
            _waiter, err_msg = Waiter(timeout=timeout, interval=wait_step), ""
            for pvc_name in pvc_names:
                _waiter._attempt = 0
                for w in _waiter:
                    events = get_events(
                        self.node, pvc_name, obj_type="PersistentVolumeClaim")
                    for event in events:
                        if event['reason'] == 'ProvisioningSucceeded':
                            succeeded_pvcs.append(pvc_name)
                            break
                        elif event['reason'] == 'ProvisioningFailed':
                            failed_pvcs.append(pvc_name)
                            break
                    else:
                        continue
                    break
                if w.expired:
                    err_msg = (
                        "Failed to get either 'ProvisioningSucceeded' or "
                        "'ProvisioningFailed' statuses for all the PVCs in "
                        "time. Timeout was %ss, interval was %ss."
                        % (timeout, wait_step))
                    g.log.error(err_msg)
                    raise AssertionError(err_msg)

            self.assertGreater(len(failed_pvcs), len(succeeded_pvcs))
        finally:
            # Restore targetcli workability
            loop_for_killing_targetcli_process._proc.terminate()

            # Revert breakage back which can be caused by BZ-1769426
            check_bkp_file_size_cmd = (
                "%sls -lah %s | awk '{print $5}'" % (
                    g_pod_prefix, file_for_bkp))
            bkp_file_size = self.cmd_run(
                check_bkp_file_size_cmd, hostname=host_to_run_cmds).strip()
            if bkp_file_size == "0":
                self.cmd_run(
                    "%smv %s_backup %s" % (
                        g_pod_prefix, file_for_bkp, file_for_bkp),
                    hostname=host_to_run_cmds)
                breakage_err_msg = (
                    "File located at '%s' was corrupted (zero size) on the "
                    "%s. Looks like BZ-1769426 took effect.\n"
                    "Don't worry, it has been restored after test failure." % (
                        file_for_bkp,
                        "'%s' Gluster pod" % g_pod
                        if g_pod else "'%s' Gluster node" % chosen_g_node_ip))
                g.log.error(breakage_err_msg)
                if err_msg:
                    breakage_err_msg = "%s\n%s" % (err_msg, breakage_err_msg)
                raise AssertionError(breakage_err_msg)

        # Wait for all the PVCs to be in bound state
        wait_for_pvcs_be_bound(self.node, pvc_names, timeout=300, wait_step=5)