def _guster_pod_delete_cleanup(self, g_pod_list_before): """Cleanup for deletion of gluster pod using force delete""" # Switch to gluster project openshift_ops.switch_oc_project(self._master, self._registry_project_name) try: # Fetch gluster pod after delete pod_name = self._get_newly_deployed_gluster_pod(g_pod_list_before) # Check if pod name is empty i.e no new pod come up so use old pod openshift_ops.wait_for_pod_be_ready( self._master, pod_name[0] if pod_name else g_pod_list_before[0], timeout=1) except exceptions.ExecutionError: # Force delete and wait for new pod to come up openshift_ops.oc_delete(self._master, 'pod', g_pod_list_before[0], is_force=True) openshift_ops.wait_for_resource_absence(self._master, 'pod', g_pod_list_before[0]) # Fetch gluster pod after force delete g_new_pod = self._get_newly_deployed_gluster_pod(g_pod_list_before) openshift_ops.wait_for_pod_be_ready(self._master, g_new_pod[0])
def _perform_io_and_fetch_metrics(
        self, pod_name, pvc_name, filename, dirname, metric_data,
        operation):
    """Create 1000 files and dirs and validate with old metrics

    :param pod_name: (str) app pod on which the IO is performed
    :param pvc_name: (str) PVC whose prometheus metrics are compared
    :param filename: (str) base name for the 1000 files
    :param dirname: (str) base name for the 1000 directories
    :param metric_data: (dict) metrics snapshot taken before the IO
    :param operation: (str) "create" adds files/dirs; any other value
        deletes them along with /mnt/large_file
    :raises AssertionError: if prometheus does not reflect the change
        within 120 seconds
    """
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    if operation == "create":
        # {{1..1000}} is shell brace expansion inside the pod
        cmds = ("touch /mnt/{}{{1..1000}}".format(filename),
                "mkdir /mnt/{}{{1..1000}}".format(dirname))
    else:
        cmds = ("rm -rf /mnt/large_file",
                "rm -rf /mnt/{}{{1..1000}}".format(filename),
                "rm -rf /mnt/{}{{1..1000}}".format(dirname))
    for cmd in cmds:
        self.cmd_run("oc rsh {} {}".format(pod_name, cmd))

    # Fetch the new metrics and compare the inodes used and bytes used
    for w in waiter.Waiter(120, 10):
        after_io_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        if operation == "create":
            # Both inode count and byte usage must have grown
            if (int(after_io_metrics['kubelet_volume_stats_inodes_used'])
                    > int(metric_data['kubelet_volume_stats_inodes_used'])
                    and int(
                        after_io_metrics['kubelet_volume_stats_used_bytes'])
                    > int(metric_data['kubelet_volume_stats_used_bytes'])):
                break
        else:
            # After deletion the byte usage must have shrunk
            if int(metric_data['kubelet_volume_stats_used_bytes']) > int(
                    after_io_metrics['kubelet_volume_stats_used_bytes']):
                break
    if w.expired:
        raise AssertionError(
            "After data is modified metrics like bytes used and inodes "
            "used are not reflected in prometheus")
def _check_heketi_and_gluster_pod_after_node_reboot(self, heketi_node):
    """Verify heketi (and any co-located gluster pod) recover after a
    node reboot.

    :param heketi_node: (str) node hosting the heketi pod that was
        rebooted.
    """
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    heketi_pod = openshift_ops.get_pod_names_from_dc(
        self._master, self.heketi_dc_name)[0]

    # Wait for heketi pod to become ready and running
    openshift_ops.wait_for_pod_be_ready(self._master, heketi_pod)
    heketi_ops.hello_heketi(self._master, self.heketi_server_url)

    # Wait for glusterfs pods to become ready if hosted on same node
    heketi_node_ip = openshift_ops.oc_get_custom_resource(
        self._master, 'pod', '.:status.hostIP', heketi_pod)[0]
    if heketi_node_ip in self.gluster_servers:
        gluster_pod = openshift_ops.get_gluster_pod_name_for_specific_node(
            self._master, heketi_node)

        # Wait for glusterfs pod to become ready
        openshift_ops.wait_for_pod_be_ready(self._master, gluster_pod)
        # NOTE(review): gluster-block-target is expected "active (exited)"
        # rather than "running" — presumably a oneshot unit; confirm.
        services = (
            ("glusterd", "running"), ("gluster-blockd", "running"),
            ("tcmu-runner", "running"), ("gluster-block-target", "exited"))
        for service, state in services:
            openshift_ops.check_service_status_on_pod(
                self._master, gluster_pod, service, "active", state)
def setUp(self): """Initialize all the variables necessary for test cases.""" super(TestMetricsAndGlusterRegistryValidation, self).setUp() try: metrics_config = g.config['openshift']['metrics'] self.metrics_project_name = metrics_config['metrics_project_name'] self.metrics_rc_hawkular_cassandra = ( metrics_config['metrics_rc_hawkular_cassandra']) self.metrics_rc_hawkular_metrics = ( metrics_config['metrics_rc_hawkular_metrics']) self.metrics_rc_heapster = metrics_config['metrics_rc_heapster'] self.registry_heketi_server_url = ( g.config['openshift']['registry_heketi_config'] ['heketi_server_url']) self.registry_project_name = ( g.config['openshift']['registry_project_name']) self.registry_servers_info = g.config['gluster_registry_servers'] except KeyError as err: msg = "Config file doesn't have key {}".format(err) g.log.error(msg) self.skipTest(msg) self.master = self.ocp_master_node[0] cmd = "oc project --short=true" current_project = command.cmd_run(cmd, self.master) switch_oc_project(self.master, self.metrics_project_name) self.addCleanup(switch_oc_project, self.master, current_project)
def test_kill_bhv_fsd_while_es_pod_running(self):
    """Validate killing of bhv fsd won't affect es pod io's"""
    # Fetch pod and PVC names and validate iscsi and multipath
    es_pod, pvc_name = self._get_es_pod_and_verify_iscsi_sessions()

    # Get the bhv name
    gluster_node = list(self._registry_servers_info.keys())[0]
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    bhv_name = self.get_block_hosting_volume_by_pvc_name(
        pvc_name, heketi_server_url=self._registry_heketi_server_url,
        gluster_node=gluster_node)

    # Get one of the bricks pid of the bhv
    gluster_volume_status = gluster_ops.get_gluster_vol_status(bhv_name)
    pid = None
    for g_node, g_node_data in gluster_volume_status.items():
        if g_node != gluster_node:
            continue
        for process_name, process_data in g_node_data.items():
            # Brick processes are keyed by their /var/... brick path
            if not process_name.startswith("/var"):
                continue
            pid = process_data["pid"]
            # When brick is down, pid of the brick is returned as -1,
            # which is an unexpected situation, hence the assertion.
            self.assertNotEqual(
                pid, "-1", "Got unexpected PID (-1) for '{}' gluster vol "
                "on '{}' node.".format(bhv_name, gluster_node))
            break
        self.assertTrue(
            pid, "Could not find 'pid' in Gluster vol data for '{}' "
            "Gluster node. Data: {}".format(
                gluster_node, gluster_volume_status))
        break

    # Kill gluster vol brick process using found pid
    cmd_kill = "kill -9 {}".format(pid)
    cmd_start_vol = "gluster v start {} force".format(bhv_name)
    openshift_ops.cmd_run_on_gluster_pod_or_node(
        self._master, cmd_kill, gluster_node)
    # Cleanups run LIFO: project switch happens before the vol restart
    self.addCleanup(openshift_ops.cmd_run_on_gluster_pod_or_node,
                    self._master, cmd_start_vol, gluster_node)
    self.addCleanup(openshift_ops.switch_oc_project,
                    self._master, self._registry_project_name)

    # Run I/O on ES pod
    openshift_ops.switch_oc_project(
        self._master, self._logging_project_name)
    file_name = '/elasticsearch/persistent/file1'
    cmd_run_io = 'dd if=/dev/urandom of={} bs=4k count=10000'.format(
        file_name)
    cmd_remove_file = 'rm {}'.format(file_name)
    openshift_ops.oc_rsh(self._master, es_pod, cmd_run_io)
    self.addCleanup(
        openshift_ops.oc_rsh, self._master, es_pod, cmd_remove_file)
def test_prometheus_volume_metrics_on_pod_restart(self):
    """Validate volume metrics using prometheus before and after pod
    restart.

    Creates a PVC plus an app pod, snapshots the PVC metrics, respins
    the pod on a different node and asserts the metrics are unchanged.
    """
    # Create PVC and wait for it to be in 'Bound' state
    pvc_name = self.create_and_wait_for_pvc()
    pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
        self._master, pvc_name, "autotest-volume",
        image=self.io_container_image_cirros)
    self.addCleanup(openshift_ops.oc_delete, self._master, 'pod',
                    pod_name, raise_on_absence=False)

    # Wait for POD be up and running
    openshift_ops.wait_for_pod_be_ready(
        self._master, pod_name, timeout=60, wait_step=2)

    # Write data on the volume and wait for 2 mins and sleep is must for
    # prometheus to get the exact values of the metrics
    self._run_io_on_the_pod(pod_name, 30)
    time.sleep(120)

    # Fetching the metrics and storing in initial_metrics as dictionary
    initial_metrics = self._get_and_manipulate_metric_data(
        self.metrics, pvc_name)

    # Mark the current node unschedulable on which app pod is running
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    pod_info = openshift_ops.oc_get_pods(self._master, name=pod_name)
    node_name = pod_info[pod_name]["node"]
    openshift_ops.oc_adm_manage_node(
        self._master, '--schedulable=false', nodes=[node_name])
    self.addCleanup(
        openshift_ops.oc_adm_manage_node, self._master,
        '--schedulable=true', nodes=[node_name])

    # Delete the existing pod and create a new pod.
    # Fix: pass the same io image as the first pod creation so the
    # recreated pod is identical (the image param was previously omitted
    # here, silently falling back to the helper's default image).
    openshift_ops.oc_delete(self._master, 'pod', pod_name)
    pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
        self._master, pvc_name, "autotest-volume",
        image=self.io_container_image_cirros)
    self.addCleanup(openshift_ops.oc_delete, self._master, 'pod', pod_name)

    # Wait for POD be up and running and prometheus to refresh the data
    openshift_ops.wait_for_pod_be_ready(
        self._master, pod_name, timeout=60, wait_step=2)
    time.sleep(120)

    # Fetching the metrics and storing in final_metrics as dictionary and
    # validating with initial_metrics
    final_metrics = self._get_and_manipulate_metric_data(
        self.metrics, pvc_name)
    self.assertEqual(dict(initial_metrics), dict(final_metrics),
                     "Metrics are different post pod restart")
def _guster_volume_cleanup(self, vol_name):
    """Force-restart the given volume if any of its bricks is offline.

    :param vol_name: (str) name of the gluster volume to check.
    """
    # Cleanup runs against the registry (gluster) project.
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    bricks = brick_libs.get_all_bricks(
        "auto_get_gluster_endpoint", vol_name)
    self.assertIsNotNone(bricks, "Failed to get brick list")
    all_online = brick_libs.are_bricks_online(
        "auto_get_gluster_endpoint", vol_name, bricks)
    if all_online:
        return
    # At least one brick is down: bring the volume back with force.
    ret_code, _, _ = volume_ops.volume_start(
        "auto_get_gluster_endpoint", vol_name, force=True)
    self.assertFalse(ret_code, "Failed to start volume using force")
def _delete_and_wait_for_new_es_pod_to_come_up(self):
    """Force-delete the elasticsearch pod and wait for its replacement
    to become ready."""
    openshift_ops.switch_oc_project(
        self._master, self._logging_project_name)
    old_pod = openshift_ops.get_pod_name_from_dc(
        self._master, self._logging_es_dc)
    openshift_ops.oc_delete(self._master, 'pod', old_pod, is_force=True)
    openshift_ops.wait_for_resource_absence(self._master, 'pod', old_pod)
    # The DC respins a fresh pod; ES startup can be slow, hence the
    # generous 30-minute timeout.
    replacement_pod = openshift_ops.get_pod_name_from_dc(
        self._master, self._logging_es_dc)
    openshift_ops.wait_for_pod_be_ready(
        self._master, replacement_pod, timeout=1800)
def _wait_for_gluster_pod_be_ready(self, g_pod_list_before):
    """Wait for the gluster pods to be in ready state

    Falls back to force-deleting the pod when readiness times out.

    :param g_pod_list_before: (list) gluster pod names captured earlier,
        forwarded to the recovery helper.
    """
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    expected_pods = len(self._registry_servers_info.keys())
    try:
        openshift_ops.wait_for_pods_be_ready(
            self._master, expected_pods, "glusterfs-node=pod",
            timeout=120, wait_step=6)
    except exceptions.ExecutionError:
        # Pods did not turn ready in time; force-recover them.
        self._guster_pod_delete(g_pod_list_before)
def test_block_provisioner_on_multiple_clusters(self): """Check block provisioner and verify on multiple clusters """ # Skip test if registry project is not present self.registry_sc = self.storage_classes.get( 'registry_block_storage_class') if not self.registry_sc: self.skipTest("Config file doesn't have key " "openshift.dynamic_provisioning.storage_classes") self._registry_heketi_server_url = self.registry_sc.get('resturl') self._registry_project_name = self.registry_sc.get( 'restsecretnamespace') if not (self._registry_heketi_server_url and self._registry_project_name): self.skipTest( "Config file doesn't have key" "'storage_classes.registry_block_storage_class.resturl' or " "'storage_classes.registry_block_storage_class" ".restsecretnamespace'") size = 1 prefix = 'autotest-pvc-{}'.format(utils.get_random_str(size=5)) # Create PVC in default namespace and verify multipath pvc_name = self.create_and_wait_for_pvc(pvc_size=size, pvc_name_prefix=prefix) _, pod_name = self.create_dc_with_pvc(pvc_name) match_pvc_and_pv(self.node, prefix) self.verify_iscsi_sessions_and_multipath(pvc_name, pod_name, rtype='pod') # Change project namespace self.addCleanup(switch_oc_project, self.node, self.storage_project_name) switch_oc_project(self.node, self._registry_project_name) # Create PVC in registry namespace and verify multipath self.sc_name = self.create_storage_class(glusterfs_registry=True) pvc_name = self.create_and_wait_for_pvc(sc_name=self.sc_name, pvc_size=size, pvc_name_prefix=prefix) _, pod_name = self.create_dc_with_pvc(pvc_name) self.verify_iscsi_sessions_and_multipath( pvc_name, pod_name, rtype='pod', heketi_server_url=self._registry_heketi_server_url, is_registry_gluster=True)
def setUpClass(cls):
    """Initialize all the variables necessary for test cases."""
    super(BaseClass, cls).setUpClass()

    # Initializes OCP config variables
    cls.ocp_servers_info = g.config['ocp_servers']
    cls.ocp_master_node = list(g.config['ocp_servers']['master'].keys())
    cls.ocp_master_node_info = g.config['ocp_servers']['master']
    cls.ocp_client = list(g.config['ocp_servers']['client'].keys())
    cls.ocp_client_info = g.config['ocp_servers']['client']
    cls.ocp_nodes = list(g.config['ocp_servers']['nodes'].keys())
    cls.ocp_nodes_info = g.config['ocp_servers']['nodes']

    # Initializes storage project config variables; "cns" is read first
    # and "openshift" is the fallback section
    openshift_config = g.config.get("cns", g.config.get("openshift"))
    cls.storage_project_name = openshift_config.get(
        'storage_project_name',
        openshift_config.get('setup', {}).get('cns_project_name'))

    # Initializes heketi config variables
    heketi_config = openshift_config['heketi_config']
    cls.heketi_dc_name = heketi_config['heketi_dc_name']
    cls.heketi_service_name = heketi_config['heketi_service_name']
    cls.heketi_client_node = heketi_config['heketi_client_node']
    cls.heketi_server_url = heketi_config['heketi_server_url']
    cls.heketi_cli_user = heketi_config['heketi_cli_user']
    cls.heketi_cli_key = heketi_config['heketi_cli_key']

    cls.gluster_servers = list(g.config['gluster_servers'].keys())
    cls.gluster_servers_info = g.config['gluster_servers']

    cls.storage_classes = openshift_config['dynamic_provisioning'][
        'storage_classes']
    cls.sc = cls.storage_classes.get(
        'storage_class1', cls.storage_classes.get('file_storage_class'))

    # Base64-encode the heketi admin key for use in OCP secret data
    cmd = "echo -n %s | base64" % cls.heketi_cli_key
    ret, out, err = g.run(cls.ocp_master_node[0], cmd, "root")
    if ret != 0:
        raise ExecutionError("failed to execute cmd %s on %s out: %s "
                             "err: %s" % (
                                 cmd, cls.ocp_master_node[0], out, err))
    cls.secret_data_key = out.strip()

    # Checks if heketi server is alive
    if not hello_heketi(cls.heketi_client_node, cls.heketi_server_url):
        raise ConfigError("Heketi server %s is not alive"
                          % cls.heketi_server_url)

    # Switch to the storage project
    if not switch_oc_project(
            cls.ocp_master_node[0], cls.storage_project_name):
        raise ExecutionError("Failed to switch oc project on node %s"
                             % cls.ocp_master_node[0])

    # Tag the whole run with one timestamp id for log correlation
    if 'glustotest_run_id' not in g.config:
        g.config['glustotest_run_id'] = (
            datetime.datetime.now().strftime('%H_%M_%d_%m_%Y'))
    cls.glustotest_run_id = g.config['glustotest_run_id']
    msg = "Setupclass: %s : %s" % (cls.__name__, cls.glustotest_run_id)
    g.log.info(msg)
def setUpClass(cls):
    """Initialize all the variables necessary for test cases."""
    super(BaseClass, cls).setUpClass()

    # Initializes OCP config variables
    cls.ocp_servers_info = g.config['ocp_servers']
    cls.ocp_master_node = list(g.config['ocp_servers']['master'].keys())
    cls.ocp_master_node_info = g.config['ocp_servers']['master']
    cls.ocp_client = list(g.config['ocp_servers']['client'].keys())
    cls.ocp_client_info = g.config['ocp_servers']['client']
    cls.ocp_nodes = list(g.config['ocp_servers']['nodes'].keys())
    cls.ocp_nodes_info = g.config['ocp_servers']['nodes']

    # Initializes storage project config variables; "cns" is read first
    # and "openshift" is the fallback section
    openshift_config = g.config.get("cns", g.config.get("openshift"))
    cls.storage_project_name = openshift_config.get(
        'storage_project_name',
        openshift_config.get('setup', {}).get('cns_project_name'))

    # Initializes heketi config variables
    heketi_config = openshift_config['heketi_config']
    cls.heketi_dc_name = heketi_config['heketi_dc_name']
    cls.heketi_service_name = heketi_config['heketi_service_name']
    cls.heketi_client_node = heketi_config['heketi_client_node']
    cls.heketi_server_url = heketi_config['heketi_server_url']
    cls.heketi_cli_user = heketi_config['heketi_cli_user']
    cls.heketi_cli_key = heketi_config['heketi_cli_key']

    cls.gluster_servers = list(g.config['gluster_servers'].keys())
    cls.gluster_servers_info = g.config['gluster_servers']

    cls.storage_classes = openshift_config['dynamic_provisioning'][
        'storage_classes']
    cls.sc = cls.storage_classes.get(
        'storage_class1', cls.storage_classes.get('file_storage_class'))

    # Base64-encode the heketi admin key for use in OCP secret data
    cmd = "echo -n %s | base64" % cls.heketi_cli_key
    ret, out, err = g.run(cls.ocp_master_node[0], cmd, "root")
    if ret != 0:
        raise ExecutionError("failed to execute cmd %s on %s out: %s "
                             "err: %s" % (cmd, cls.ocp_master_node[0],
                                          out, err))
    cls.secret_data_key = out.strip()

    # Checks if heketi server is alive
    if not hello_heketi(cls.heketi_client_node, cls.heketi_server_url):
        raise ConfigError("Heketi server %s is not alive"
                          % cls.heketi_server_url)

    # Switch to the storage project
    if not switch_oc_project(cls.ocp_master_node[0],
                             cls.storage_project_name):
        raise ExecutionError("Failed to switch oc project on node %s"
                             % cls.ocp_master_node[0])

    # Tag the whole run with one timestamp id for log correlation
    if 'glustotest_run_id' not in g.config:
        g.config['glustotest_run_id'] = (
            datetime.datetime.now().strftime('%H_%M_%d_%m_%Y'))
    cls.glustotest_run_id = g.config['glustotest_run_id']
    msg = "Setupclass: %s : %s" % (cls.__name__, cls.glustotest_run_id)
    g.log.info(msg)
def test_metrics_cassandra_pod_with_bhv_brick_process_down(self):
    """Validate metrics during restart of brick process of bhv"""
    # Pick one gluster registry node and confirm the cassandra pod's
    # iscsi sessions and multipath setup are healthy first.
    g_node = list(self.registry_servers_info.keys())[0]
    hawkular_cassandra, pvc_name, _, _, _ = (
        self.verify_cassandra_pod_multipath_and_iscsi())
    switch_oc_project(self.master, self.registry_project_name)

    # Locate the block hosting volume backing the PVC, then kill its
    # brick processes and force-restart the volume.
    block_hosting_vol = self.get_block_hosting_volume_by_pvc_name(
        pvc_name, heketi_server_url=self.registry_heketi_server_url,
        gluster_node=g_node, ocp_client_node=self.master)
    restart_gluster_vol_brick_processes(
        self.master, block_hosting_vol,
        list(self.registry_servers_info.keys()))
    # Ensure the cassandra pod is recovered (or respun) on teardown.
    self.addCleanup(
        self.cassandra_pod_delete_cleanup, raise_on_error=True)
def test_prometheus_basic_validation(self): """ Validate basic volume metrics using prometheus """ # Fetch the metrics and storing initial_metrics as dictionary pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics( volume_expansion=False) # Create 1000 files and fetch the metrics that the data is updated self._perform_io_and_fetch_metrics(pod_name=pod_name, pvc_name=pvc_name, filename="filename1", dirname="dirname1", metric_data=initial_metrics, operation="create") # Write the IO half the size of the volume and validated from # prometheus pod that the size change is reflected size_to_write = int( initial_metrics['kubelet_volume_stats_capacity_bytes']) // 2 openshift_ops.switch_oc_project(self._master, self.storage_project_name) cmd = ("dd if=/dev/urandom of=/mnt/large_file bs={} count=1024".format( size_to_write // 1024)) ret, _, err = openshift_ops.oc_rsh(self._master, pod_name, cmd) self.assertFalse(ret, 'Failed to write file due to err {}'.format(err)) # Fetching the metrics and validating the data change is reflected for w in waiter.Waiter(120, 10): half_io_metrics = self._get_and_manipulate_metric_data( ['kubelet_volume_stats_used_bytes'], pvc_name) if bool(half_io_metrics) and (int( half_io_metrics['kubelet_volume_stats_used_bytes']) > size_to_write): break if w.expired: raise AssertionError( "After Data is written on the pvc, metrics like inodes used " "and bytes used are not reflected in the prometheus") # Delete the files from the volume and wait for the # updated details reflected in prometheus self._perform_io_and_fetch_metrics(pod_name=pod_name, pvc_name=pvc_name, filename="filename1", dirname="dirname1", metric_data=half_io_metrics, operation="delete")
def test_resping_gluster_pod(self):
    """Validate gluster pod restart with no disruption to elasticsearch pod
    """
    # jsonpath used to read the es pod container restart counter
    restart_custom = ":status.containerStatuses[0].restartCount"

    # Fetch pod and validate iscsi and multipath
    es_pod, _ = self._get_es_pod_and_verify_iscsi_sessions()

    # Fetch the restart count for the es pod
    restart_count_before = openshift_ops.oc_get_custom_resource(
        self._master, "pod", restart_custom, es_pod)[0]

    # Switch to gluster project
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)

    # Fetch the gluster pod list before
    g_pod_list_before = [
        pod["pod_name"]
        for pod in openshift_ops.get_ocp_gluster_pod_details(self._master)]

    # Respin a gluster pod
    openshift_ops.oc_delete(self._master, "pod", g_pod_list_before[0])
    # Registered before the waits so recovery runs even if they fail
    self.addCleanup(self._guster_pod_delete_cleanup, g_pod_list_before)

    # Wait for pod to get absent
    openshift_ops.wait_for_resource_absence(
        self._master, "pod", g_pod_list_before[0])

    # Fetch gluster pod after delete
    g_new_pod = self._get_newly_deployed_gluster_pod(g_pod_list_before)
    openshift_ops.wait_for_pod_be_ready(self._master, g_new_pod[0])

    # Switch to logging project
    openshift_ops.switch_oc_project(
        self._master, self._logging_project_name)

    # Fetch the restart count for the es pod
    restart_count_after = openshift_ops.oc_get_custom_resource(
        self._master, "pod", restart_custom, es_pod)[0]
    self.assertEqual(
        restart_count_before, restart_count_after,
        "Failed disruption to es pod found expecting restart count before"
        " {} and after {} for es pod to be equal after gluster pod"
        " respin".format(restart_count_before, restart_count_after))
def cassandra_pod_delete_cleanup(self, raise_on_error=False):
    """Cleanup for deletion of cassandra pod using force delete

    Probe the current cassandra pod; if it is not ready, force-delete it
    and wait for the replacement pod to become ready.

    :param raise_on_error: (bool) when True, re-raise the original
        readiness failure after the pod has been respun.
    """
    switch_oc_project(self.master, self.metrics_project_name)
    # Fetch the pod name OUTSIDE the try-block: previously it was
    # assigned inside, so a fetch failure raised ExecutionError and the
    # except-handler then crashed with UnboundLocalError on 'pod_name'.
    pod_name = get_pod_name_from_rc(
        self.master, self.metrics_rc_hawkular_cassandra)
    try:
        # Check if pod is up or ready (timeout=1 -> quick probe)
        wait_for_pod_be_ready(self.master, pod_name, timeout=1)
    except exceptions.ExecutionError as err:
        # Force delete and wait for new pod to come up
        oc_delete(self.master, 'pod', pod_name, is_force=True)
        wait_for_resource_absence(self.master, 'pod', pod_name)
        new_pod_name = get_pod_name_from_rc(
            self.master, self.metrics_rc_hawkular_cassandra)
        wait_for_pod_be_ready(self.master, new_pod_name)
        if raise_on_error:
            raise err
def _get_and_manipulate_metric_data(self, metrics, pvc):
    """Create a dict of metric names and total values

    :param metrics: (iterable) prometheus metric names to query
    :param pvc: (str) PVC name whose samples are collected
    :return: (dict) metric name -> sample value for the given PVC
    """
    # Prometheus queries must run from the prometheus namespace;
    # register a cleanup to land back on the storage project.
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    self.addCleanup(
        openshift_ops.switch_oc_project, self._master,
        self.storage_project_name)

    collected = {}
    for metric_name in metrics:
        results = self._fetch_metric_from_promtheus_pod(metric_name)
        for entry in results:
            if entry["metric"]["persistentvolumeclaim"] != pvc:
                continue
            collected[entry["metric"]["__name__"]] = entry["value"][1]
    return collected
def setUp(self): """Initialize all the variables which are necessary for test cases""" super(TestPrometheusAndGlusterRegistryValidation, self).setUp() try: prometheus_config = g.config['openshift']['prometheus'] self._prometheus_project_name = prometheus_config[ 'prometheus_project_name'] self._prometheus_resources_selector = prometheus_config[ 'prometheus_resources_selector'] self._alertmanager_resources_selector = prometheus_config[ 'alertmanager_resources_selector'] self._registry_heketi_server_url = ( g.config['openshift']['registry_heketi_config'][ 'heketi_server_url']) self._registry_project_name = ( g.config['openshift']['registry_project_name']) self._registry_servers_info = ( g.config['gluster_registry_servers']) except KeyError as err: self.skipTest("Config file doesn't have key {}".format(err)) # Skip the test if iscsi-initiator-utils version is not the expected cmd = ("rpm -q iscsi-initiator-utils " "--queryformat '%{version}-%{release}\n'" "| cut -d '.' -f 1,2,3,4") e_pkg_version = "6.2.0.874-17" for g_server in self.gluster_servers: out = self.cmd_run(cmd, g_server) if parse_version(out) < parse_version(e_pkg_version): self.skipTest( "Skip the test as iscsi-initiator-utils package version {}" "is less than version {} found on the node {}, for more " "info refer to BZ-1624670".format( out, e_pkg_version, g_server)) self._master = self.ocp_master_node[0] # Switch to namespace conatining prometheus pods cmd = "oc project --short=true" current_project = command.cmd_run(cmd, self._master) openshift_ops.switch_oc_project( self._master, self._prometheus_project_name) self.addCleanup( openshift_ops.switch_oc_project, self._master, current_project)
def setUp(self): """Initialize all the variables necessary for test cases.""" super(TestLoggingAndGlusterRegistryValidation, self).setUp() try: logging_config = g.config['openshift']['logging'] self._logging_project_name = logging_config['logging_project_name'] self._logging_fluentd_ds = logging_config['logging_fluentd_ds'] self._logging_es_dc = logging_config['logging_es_dc'] self._logging_kibana_dc = logging_config['logging_kibana_dc'] self._registry_heketi_server_url = ( g.config['openshift']['registry_heketi_config'] ['heketi_server_url']) self._registry_project_name = ( g.config['openshift']['registry_project_name']) self._registry_servers_info = g.config['gluster_registry_servers'] except KeyError as err: msg = "Config file doesn't have key {}".format(err) g.log.error(msg) self.skipTest(msg) # Skip the test if iscsi-initiator-utils version is not the expected cmd = ("rpm -q iscsi-initiator-utils " "--queryformat '%{version}-%{release}\n'" "| cut -d '.' -f 1,2,3,4") e_pkg_version = "6.2.0.874-17" for g_server in self.gluster_servers: out = self.cmd_run(cmd, g_server) if parse_version(out) < parse_version(e_pkg_version): msg = ("Skip test since isci initiator utils version actual: " "{out} is less than expected: {ver} on node {server}," " for more info refer to BZ-1624670".format( out=out, ver=e_pkg_version, server=g_server)) g.log.error(msg) self.skipTest(msg) self._master = self.ocp_master_node[0] cmd = "oc project --short=true" current_project = command.cmd_run(cmd, self._master) openshift_ops.switch_oc_project(self._master, self._logging_project_name) self.addCleanup(openshift_ops.switch_oc_project, self._master, current_project)
def _guster_pod_delete(self, g_pod_list_before):
    """Delete the gluster pod using force delete

    If no replacement pod becomes ready, force-delete the old pod and
    wait for the fresh one.

    :param g_pod_list_before: (list) gluster pod names captured before
        deletion; used to identify the newly deployed pod.
    """
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)

    # Fetch newly deployed gluster pod after delete
    try:
        pod_name = self._get_newly_deployed_gluster_pod(g_pod_list_before)
        # Fall back to the old pod name when no new pod has appeared yet
        openshift_ops.wait_for_pod_be_ready(
            self._master,
            pod_name[0] if pod_name else g_pod_list_before[0],
            timeout=120, wait_step=6)
    except exceptions.ExecutionError:
        # Not ready in time: force delete and wait for the replacement
        openshift_ops.oc_delete(
            self._master, 'pod', g_pod_list_before[0], is_force=True)
        openshift_ops.wait_for_resource_absence(
            self._master, 'pod', g_pod_list_before[0])
        g_new_pod = self._get_newly_deployed_gluster_pod(g_pod_list_before)
        openshift_ops.wait_for_pod_be_ready(self._master, g_new_pod[0])
def test_run_workload_with_logging(self): """Validate logs are being generated aifter running workload""" # Get the size of used space of logs es_pod = openshift_ops.get_pod_name_from_dc( self._master, self._logging_es_dc) mount_point = "/elasticsearch/persistent" cmd_space_check = ('df -kh --output=used {} | sed "/Used/d" |' 'sed "s/G//"'.format(mount_point)) ret, initial_used_percent, err = openshift_ops.oc_rsh( self._master, es_pod, cmd_space_check) err_msg = "Failed to fetch the size of used space, error {}" self.assertFalse(ret, err_msg.format(err)) # Create 20 pvcs and app pods with io openshift_ops.switch_oc_project( self._master, self.storage_project_name) pvc_count, batch_count = 5, 4 for _ in range(batch_count): pvcs = self.create_and_wait_for_pvcs(pvc_amount=pvc_count) self.create_dcs_with_pvc(pvcs) self.addCleanup( openshift_ops.switch_oc_project, self._master, self.storage_project_name) # Get and verify the final used size of used space of logs openshift_ops.switch_oc_project( self._master, self._logging_project_name) for w in waiter.Waiter(600, 30): ret, final_used_percent, err = openshift_ops.oc_rsh( self._master, es_pod, cmd_space_check) self.assertFalse(ret, err_msg.format(err)) if int(initial_used_percent) < int(final_used_percent): break if w.expired: raise AssertionError( "Initial used space {} for logs is not less than final " "used space {}".format( initial_used_percent, final_used_percent))
def test_verify_metrics_data_during_gluster_pod_respin(self):
    """Validate the cassandra pod stays healthy while the gluster pod
    serving its active iscsi path is respun."""
    # Add check for CRS version
    switch_oc_project(self.master, self.registry_project_name)
    if not self.is_containerized_gluster():
        self.skipTest("Skipping this test case as CRS version check "
                      "can not be implemented")

    # Verify multipath and iscsi for cassandra pod
    switch_oc_project(self.master, self.metrics_project_name)
    hawkular_cassandra, pvc_name, iqn, _, node = (
        self.verify_cassandra_pod_multipath_and_iscsi())

    # Get the ip of active path
    device_and_ip = get_iscsi_block_devices_by_path(node, iqn)
    mpath = get_mpath_name_from_device_name(
        node, list(device_and_ip.keys())[0])
    active_passive_dict = get_active_and_enabled_devices_from_mpath(
        node, mpath)
    node_ip = device_and_ip[active_passive_dict['active'][0]]

    # Get the name of gluster pod from the ip
    switch_oc_project(self.master, self.registry_project_name)
    gluster_pods = get_ocp_gluster_pod_details(self.master)
    pod_name = list(
        filter(lambda pod: (pod["pod_host_ip"] == node_ip),
               gluster_pods))[0]["pod_name"]
    err_msg = "Failed to get the gluster pod name {} with active path"
    self.assertTrue(pod_name, err_msg.format(pod_name))

    # Delete the pod
    oc_delete(self.master, 'pod', pod_name)
    wait_for_resource_absence(self.master, 'pod', pod_name)

    # Wait for new pod to come up
    pod_count = len(self.registry_servers_info.keys())
    selector = "glusterfs-node=pod"
    wait_for_pods_be_ready(self.master, pod_count, selector)

    # Validate cassandra pod state, multipath and iscsi
    switch_oc_project(self.master, self.metrics_project_name)
    wait_for_pod_be_ready(self.master, hawkular_cassandra, timeout=2)
    self.verify_iscsi_sessions_and_multipath(
        pvc_name, self.metrics_rc_hawkular_cassandra, rtype='rc',
        heketi_server_url=self.registry_heketi_server_url,
        is_registry_gluster=True)
def test_run_workload_with_metrics(self): """Validate if logs are being generated after running workload""" # Get the size of used space of logs cassandra_pod = get_pod_name_from_rc( self.master, self.metrics_rc_hawkular_cassandra) mount_point = "/cassandra_data" cmd_space_check = ('df -k --output=used {} | sed "/Used/d" |' 'sed "s/G//"'.format(mount_point)) ret, initial_used_percent, err = oc_rsh(self.master, cassandra_pod, cmd_space_check) err_msg = "Failed to fetch the size of used space, error {}" self.assertFalse(ret, err_msg.format(err)) # Create 20 PVCs and app pods with IO switch_oc_project(self.master, self.storage_project_name) pvc_count, batch_count = 5, 4 for _ in range(batch_count): pvcs = self.create_and_wait_for_pvcs(pvc_amount=pvc_count) self.create_dcs_with_pvc(pvcs) self.addCleanup(switch_oc_project, self.master, self.storage_project_name) # Get and verify the final size of used space of logs switch_oc_project(self.master, self.metrics_project_name) for w in waiter.Waiter(600, 30): ret, final_used_percent, err = oc_rsh(self.master, cassandra_pod, cmd_space_check) self.assertFalse(ret, err_msg.format(err)) if int(initial_used_percent) < int(final_used_percent): break if w.expired: raise AssertionError( "Initial used space {} for logs is not less than final " "used space {}".format(initial_used_percent, final_used_percent))
def test_heketi_prometheus_usedbytes_brickcount_on_device_delete(
        self, operation):
    """Validate used bytes,device count on heketi and prometheus

    :param operation: either "usedbytes" or "brickcount" -- selects which
        metric pair (heketi CLI vs prometheus) is compared after the
        device add/disable/remove/delete cycle; presumably supplied via a
        ddt decorator outside this view -- TODO confirm.
    """
    h_node, h_server = self.heketi_client_node, self.heketi_server_url

    # Get list of additional devices for one of the Gluster nodes
    gluster_server_0 = list(self.gluster_servers_info.values())[0]
    manage_hostname = gluster_server_0.get("manage")
    self.assertTrue(
        manage_hostname, "IP Address is not specified for "
        "node {}".format(gluster_server_0))
    device_name = gluster_server_0.get("additional_devices")[0]
    self.assertTrue(
        device_name, "Additional devices are not specified for "
        "node {}".format(gluster_server_0))

    # Get node ID of the Gluster hostname from heketi topology
    node_list = heketi_ops.heketi_topology_info(
        h_node, h_server, json=True).get("clusters")[0].get("nodes")
    self.assertTrue(
        node_list, "Cluster info command returned empty list of nodes")
    node_id = [
        node.get("id")
        for node in node_list
        if manage_hostname == node.get("hostnames").get("manage")[0]]
    self.assertTrue(
        node_id, "Failed to get node_id for {}".format(manage_hostname))
    node_id = node_id[0]

    # Adding heketi device, then locate its id and brick count in the
    # refreshed node info
    heketi_ops.heketi_device_add(h_node, h_server, device_name, node_id)
    node_info_after_addition = heketi_ops.heketi_node_info(
        h_node, h_server, node_id, json=True)
    device_id, bricks = None, None
    for device in node_info_after_addition.get("devices"):
        if device.get("name") == device_name:
            device_id, bricks = (
                device.get("id"), len(device.get("bricks")))
            break

    # Verify zero bricks on the freshly added device
    msg = (
        "Number of bricks on the device {} of the nodes should be"
        "zero".format(device_name))
    self.assertFalse(bricks, msg)
    # Cleanups are registered in reverse order (delete, remove, disable)
    # so they run disable -> remove -> delete; raise_on_error=False since
    # the test body normally performs these steps itself
    self.addCleanup(
        heketi_ops.heketi_device_delete, h_node, h_server, device_id,
        raise_on_error=False)
    self.addCleanup(
        heketi_ops.heketi_device_remove, h_node, h_server, device_id,
        raise_on_error=False)
    self.addCleanup(
        heketi_ops.heketi_device_disable, h_node, h_server, device_id,
        raise_on_error=False)

    # Disable,Remove and Delete heketi device
    heketi_ops.heketi_device_disable(h_node, h_server, device_id)
    heketi_ops.heketi_device_remove(h_node, h_server, device_id)
    heketi_ops.heketi_device_delete(h_node, h_server, device_id)

    # Verify device deletion (non-json node info is searched as text)
    node_info_after_deletion = (
        heketi_ops.heketi_node_info(h_node, h_server, node_id))
    msg = ("Device {} should not be shown in node info of the node {}"
           "after the device deletion".format(device_id, node_id))
    self.assertNotIn(device_id, node_info_after_deletion, msg)

    if operation == "usedbytes":
        # Validate heketi and prometheus device used bytes agree,
        # retrying for up to 60s while prometheus catches up
        for w in waiter.Waiter(timeout=60, interval=10):
            device_used_bytes_prometheus = 0
            device_used_bytes_metrics = 0
            openshift_ops.switch_oc_project(
                self.ocp_master_node[0], 'openshift-monitoring')
            metric_result = self._fetch_metric_from_promtheus_pod(
                metric='heketi_device_used_bytes')
            # NOTE(review): node_id is matched against the 'cluster'
            # label here (and below) -- confirm the metric really labels
            # the node id as 'cluster' and this is not meant to be a
            # node/hostname label.
            for result in metric_result:
                if (node_id == result.get('cluster')
                        and device_name == result.get('device')):
                    device_used_bytes_prometheus += (
                        int(result.get('value')[1]))
            openshift_ops.switch_oc_project(
                self.ocp_master_node[0], 'glusterfs')
            metrics = heketi_ops.get_heketi_metrics(h_node, h_server)
            heketi_device_count_metric = (
                metrics.get('heketi_device_used_bytes'))
            for result in heketi_device_count_metric:
                if (node_id == result.get('cluster')
                        and device_name == result.get('device')):
                    device_used_bytes_metrics = int(result.get('value'))
            if device_used_bytes_prometheus == device_used_bytes_metrics:
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to update device details in prometheus")

    elif operation == "brickcount":
        # Validate heketi and prometheus device brick count agree,
        # retrying for up to 60s while prometheus catches up
        for w in waiter.Waiter(timeout=60, interval=10):
            device_brick_count_prometheus = 0
            device_brick_count_metrics = 0
            metrics = heketi_ops.get_heketi_metrics(h_node, h_server)
            heketi_device_count_metric = metrics.get(
                'heketi_device_brick_count')
            for result in heketi_device_count_metric:
                device_brick_count_metrics += int(result.get('value'))
            openshift_ops.switch_oc_project(
                self.ocp_master_node[0], 'openshift-monitoring')
            metric_result = self._fetch_metric_from_promtheus_pod(
                metric='heketi_device_brick_count')
            for result in metric_result:
                device_brick_count_prometheus += (
                    int(result.get('value')[1]))
            if device_brick_count_prometheus == device_brick_count_metrics:
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to update device details in prometheus")
def test_prometheus_volume_metrics_on_node_reboot(self):
    """Validate volume metrics using prometheus before and after node
    reboot"""

    # Pod name for the entire test
    prefix = "autotest-{}".format(utils.get_random_str())

    # Create I/O pod with PVC
    pvc_name = self.create_and_wait_for_pvc()
    pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
        self._master, pvc_name, prefix,
        image=self.io_container_image_cirros)
    self.addCleanup(openshift_ops.oc_delete, self._master, 'pod',
                    pod_name, raise_on_absence=False)
    openshift_ops.wait_for_pod_be_ready(
        self._master, pod_name, timeout=60, wait_step=5)

    # Write data on the volume and wait for 2 mins and sleep is must for
    # prometheus to get the exact values of the metrics
    ret, _, err = openshift_ops.oc_rsh(
        self._master, pod_name, "touch /mnt/file{1..1000}")
    self.assertEqual(
        ret, 0, "Failed to create files in the app pod "
        "with {}".format(err))
    time.sleep(120)

    # Fetch the metrics and store in initial_metrics as dictionary
    initial_metrics = self._get_and_manipulate_metric_data(
        self.metrics, pvc_name)
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)

    # Get the hostname to reboot where the pod is running
    pod_info = openshift_ops.oc_get_pods(self._master, name=pod_name)
    node_for_reboot = pod_info[pod_name]['node']

    # Get the vm name by the hostname
    vm_name = node_ops.find_vm_name_by_ip_or_hostname(node_for_reboot)

    # power off and on the vm, based on the vm type(either gluster or not)
    if node_for_reboot in self.gluster_servers:
        self.power_off_gluster_node_vm(vm_name, node_for_reboot)
        self.power_on_gluster_node_vm(vm_name, node_for_reboot)
    else:
        self.power_off_vm(vm_name)
        self.power_on_vm(vm_name)
    openshift_ops.wait_for_ocp_node_be_ready(
        self._master, node_for_reboot)

    # Create the new pod and validate the prometheus metrics
    # NOTE(review): unlike the first pod creation above, this call omits
    # image=self.io_container_image_cirros and its cleanup omits
    # raise_on_absence=False -- confirm the asymmetry is intentional.
    pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
        self._master, pvc_name, prefix)
    self.addCleanup(openshift_ops.oc_delete, self._master, 'pod',
                    pod_name)

    # Wait for POD be up and running and prometheus to refresh the data
    openshift_ops.wait_for_pod_be_ready(
        self._master, pod_name, timeout=60, wait_step=5)
    time.sleep(120)

    # Fetching the metrics and storing in final_metrics as dictionary and
    # validating with initial_metrics
    final_metrics = self._get_and_manipulate_metric_data(
        self.metrics, pvc_name)
    self.assertEqual(dict(initial_metrics), dict(final_metrics),
                     "Metrics are different post node reboot")
def test_prometheus_pv_resize(self):
    """ Validate prometheus metrics with pv resize"""

    # Fetch the metrics and storing initial_metrics as dictionary
    pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
        vol_name_prefix="for-pv-resize", volume_expansion=True)

    # Write data on the pvc and confirm it is reflected in the prometheus
    self._perform_io_and_fetch_metrics(
        pod_name=pod_name, pvc_name=pvc_name,
        filename="filename1", dirname="dirname1",
        metric_data=initial_metrics, operation="create")

    # Resize the pvc to 2GiB and verify PVC and PV both report new size
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    pvc_size = 2
    openshift_ops.resize_pvc(self._master, pvc_name, pvc_size)
    openshift_ops.wait_for_events(self._master, obj_name=pvc_name,
                                  event_reason='VolumeResizeSuccessful')
    openshift_ops.verify_pvc_size(self._master, pvc_name, pvc_size)
    pv_name = openshift_ops.get_pv_name_from_pvc(
        self._master, pvc_name)
    openshift_ops.verify_pv_size(self._master, pv_name, pvc_size)

    # Resolve the heketi volume backing the prefixed PVC
    heketi_volume_name = heketi_ops.heketi_volume_list_by_name_prefix(
        self.heketi_client_node, self.heketi_server_url,
        "for-pv-resize", json=True)[0][2]
    self.assertIsNotNone(
        heketi_volume_name, "Failed to fetch volume with prefix {}".
        format("for-pv-resize"))

    # Recreate the app pod so the resized volume gets remounted
    # NOTE(review): self.dc_name is presumably set by
    # _fetch_initial_metrics -- confirm, it is not assigned in this view.
    openshift_ops.oc_delete(self._master, 'pod', pod_name)
    openshift_ops.wait_for_resource_absence(self._master, 'pod', pod_name)
    pod_name = openshift_ops.get_pod_name_from_dc(
        self._master, self.dc_name)
    openshift_ops.wait_for_pod_be_ready(self._master, pod_name)

    # Check whether the metrics are updated or not
    for w in waiter.Waiter(120, 10):
        resize_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        if bool(resize_metrics) and int(resize_metrics[
            'kubelet_volume_stats_capacity_bytes']) > int(
                initial_metrics['kubelet_volume_stats_capacity_bytes']):
            break
    if w.expired:
        raise AssertionError("Failed to reflect PVC Size after resizing")
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    time.sleep(240)

    # Lookup and trigger rebalance and wait for the its completion
    # NOTE(review): the repeated 'ls /mnt/' presumably forces gluster
    # lookups that kick the rebalance along -- confirm intent.
    for _ in range(100):
        self.cmd_run("oc rsh {} ls /mnt/".format(pod_name))
    self._rebalance_completion(heketi_volume_name)

    # Write data on the resized pvc and compared with the resized_metrics
    self._perform_io_and_fetch_metrics(
        pod_name=pod_name, pvc_name=pvc_name,
        filename="secondfilename", dirname="seconddirname",
        metric_data=resize_metrics, operation="create")
def test_prometheous_kill_bhv_brick_process(self):
    """Validate kill brick process of block hosting
    volume with prometheus workload running

    Kills one brick of the block hosting volume behind a prometheus
    pod's PVC, confirms metrics stay fetchable, force-starts the volume
    and re-validates iscsi/multipath.
    """
    # Add check for CRS version: containerized gluster is required
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    if not self.is_containerized_gluster():
        self.skipTest("Skipping this test case as CRS"
                      " version check can not be implemented")

    # Get one of the prometheus pod name and respective pvc name
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    prometheus_pods = openshift_ops.oc_get_pods(
        self._master, selector=self._prometheus_resources_selector)
    if not prometheus_pods:
        # BUGFIX: skipTest() takes a single reason string; the previous
        # call passed 'prometheus_pods' as an extra positional argument,
        # which raised TypeError instead of skipping.
        self.skipTest("Skipping test as prometheus pod is not present")

    # Validate iscsi and multipath
    prometheus_pod = list(prometheus_pods.keys())[0]
    pvc_name = openshift_ops.oc_get_custom_resource(
        self._master, "pod",
        ":.spec.volumes[*].persistentVolumeClaim.claimName",
        prometheus_pod)
    self.assertTrue(pvc_name, "Failed to get PVC name")
    pvc_name = pvc_name[0]
    self.verify_iscsi_sessions_and_multipath(
        pvc_name, prometheus_pod, rtype='pod',
        heketi_server_url=self._registry_heketi_server_url,
        is_registry_gluster=True)

    # Try to fetch metric from prometheus pod
    self._fetch_metric_from_promtheus_pod(
        metric='heketi_device_brick_count')

    # Kill the brick process of a BHV: find the first brick process
    # ('/var...' entries in vol status are brick paths) and its pid
    gluster_node = list(self._registry_servers_info.keys())[0]
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    bhv_name = self.get_block_hosting_volume_by_pvc_name(
        pvc_name, heketi_server_url=self._registry_heketi_server_url,
        gluster_node=gluster_node, ocp_client_node=self._master)
    vol_status = gluster_ops.get_gluster_vol_status(bhv_name)
    gluster_node_ip, brick_pid = None, None
    for g_node, g_node_data in vol_status.items():
        for process_name, process_data in g_node_data.items():
            if process_name.startswith("/var"):
                gluster_node_ip = g_node
                brick_pid = process_data["pid"]
                break
        if gluster_node_ip and brick_pid:
            break
    self.assertIsNotNone(brick_pid, "Could not find pid for brick")
    cmd = "kill -9 {}".format(brick_pid)
    openshift_ops.cmd_run_on_gluster_pod_or_node(
        self._master, cmd, gluster_node_ip)
    self.addCleanup(self._guster_volume_cleanup, bhv_name)

    # Check if the brick-process has been killed: 'ps -p' failing
    # (ExecutionError) means the pid is gone, which is the success path
    killed_pid_cmd = (
        "ps -p {} -o pid --no-headers".format(brick_pid))
    try:
        openshift_ops.cmd_run_on_gluster_pod_or_node(
            self._master, killed_pid_cmd, gluster_node_ip)
    except exceptions.ExecutionError:
        # BUGFIX: added the missing space between "killed" and
        # "successfully" in the log message.
        g.log.info("Brick process {} was killed "
                   "successfully".format(brick_pid))

    # Try to fetch metric from prometheus pod
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    self._fetch_metric_from_promtheus_pod(
        metric='heketi_device_brick_count')

    # Start the bhv using force to bring the killed brick back
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    start_vol, _, _ = volume_ops.volume_start(
        gluster_node_ip, bhv_name, force=True)
    self.assertFalse(
        start_vol, "Failed to start volume {}"
        " using force".format(bhv_name))

    # Validate iscsi and multipath
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    self.verify_iscsi_sessions_and_multipath(
        pvc_name, prometheus_pod, rtype='pod',
        heketi_server_url=self._registry_heketi_server_url,
        is_registry_gluster=True)

    # Try to fetch metric from prometheus pod
    self._fetch_metric_from_promtheus_pod(
        metric='heketi_device_brick_count')
def test_heketi_metrics_validation_with_node_reboot(self):
    """Validate heketi metrics after node reboot using prometheus

    Captures heketi CLI metrics and prometheus metrics, reboots the node
    hosting the heketi pod, waits for recovery, then asserts both metric
    sets are unchanged.
    """
    initial_metrics, final_metrics = {}, {}

    # Use storage project
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)

    # Get initial metrics result (first sample of each tracked metric)
    h_node, h_server = self.heketi_client_node, self.heketi_server_url
    initial_metrics = tuple(
        heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
        for metric in self.metrics)

    # Use prometheus project
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)

    # Get initial prometheus result
    initial_prometheus = self._get_and_manipulate_metric_data(
        self.metrics)

    # Get hosted node IP of heketi pod
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    heketi_pod = openshift_ops.get_pod_name_from_dc(
        self._master, self.heketi_dc_name)
    # BUGFIX: selector was '.:spec.nodeName' (column named '.', JSONPath
    # missing its leading dot); use ':.spec.nodeName' -- the form every
    # other oc_get_custom_resource call in this file uses.
    heketi_node = openshift_ops.oc_get_custom_resource(
        self._master, 'pod', ':.spec.nodeName', heketi_pod)[0]

    # Reboot the node on which heketi pod is scheduled; register the
    # pod-recovery check as cleanup in case the test fails mid-way
    self.addCleanup(
        self._check_heketi_and_gluster_pod_after_node_reboot,
        heketi_node)
    node_ops.node_reboot_by_command(heketi_node)

    # Wait node to become NotReady
    custom = r'":.status.conditions[?(@.type==\"Ready\")]".status'
    for w in waiter.Waiter(300, 10):
        status = openshift_ops.oc_get_custom_resource(
            self._master, 'node', custom, heketi_node)
        if status[0] == 'False':
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "Failed to bring down node {}".format(heketi_node))

    # Wait for node to become ready
    openshift_ops.wait_for_ocp_node_be_ready(self._master, heketi_node)

    # Wait for heketi and glusterfs pod to become ready
    self._check_heketi_and_gluster_pod_after_node_reboot(heketi_node)

    # Use prometheus project
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)

    # Get final metrics result
    final_metrics = tuple(
        heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
        for metric in self.metrics)

    # Get final prometheus result
    final_prometheus = self._get_and_manipulate_metric_data(
        self.metrics)

    err_msg = "Initial value {} is not same as final value {}"
    self.assertEqual(
        initial_metrics, final_metrics, err_msg.format(
            initial_metrics, final_metrics))
    self.assertEqual(
        initial_prometheus, final_prometheus, err_msg.format(
            initial_prometheus, final_prometheus))
def test_heketi_metrics_validation_after_node(self, condition):
    """Validate heketi metrics after adding and remove node

    :param condition: "delete" exercises add-then-remove of the extra
        node; any other value validates the count after add only
        (presumably supplied via a ddt decorator outside this view).
    """
    # Get additional node from the test config
    additional_host_info = g.config.get("additional_gluster_servers")
    if not additional_host_info:
        self.skipTest(
            "Skipping this test case as additional gluster server is "
            "not provied in config file")
    additional_host_info = list(additional_host_info.values())[0]
    storage_hostname = additional_host_info.get("manage")
    storage_ip = additional_host_info.get("storage")
    if not (storage_hostname and storage_ip):
        self.skipTest(
            "Config options 'additional_gluster_servers.manage' "
            "and 'additional_gluster_servers.storage' must be set.")

    h_client, h_server = self.heketi_client_node, self.heketi_server_url
    initial_node_count, final_node_count = 0, 0

    # Get initial node count from prometheus metrics.
    # BUGFIX: prometheus sample values are strings; the previous
    # reduce(lambda x, y: x + y, [...]) concatenated them (e.g.
    # "3" + "3" == "33") and later compared counts as strings.  Cast
    # each value to int and sum, matching how the other prometheus
    # tests in this file consume result.get('value')[1].
    metric_result = self._fetch_metric_from_promtheus_pod(
        metric='heketi_nodes_count')
    initial_node_count = sum(
        int(result.get('value')[1]) for result in metric_result)

    # Switch to storage project
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)

    # Configure node before adding node
    self.configure_node_to_run_gluster(storage_hostname)

    # Get cluster list
    cluster_info = heketi_ops.heketi_cluster_list(
        h_client, h_server, json=True)

    # Add node to the cluster
    heketi_node_info = heketi_ops.heketi_node_add(
        h_client, h_server,
        len(self.gluster_servers),
        cluster_info.get('clusters')[0],
        storage_hostname, storage_ip,
        json=True)
    heketi_node_id = heketi_node_info.get("id")
    # Cleanups run in reverse order: disable -> remove -> delete;
    # raise_on_error=False because the 'delete' path does them itself
    self.addCleanup(
        heketi_ops.heketi_node_delete,
        h_client, h_server, heketi_node_id, raise_on_error=False)
    self.addCleanup(
        heketi_ops.heketi_node_remove,
        h_client, h_server, heketi_node_id, raise_on_error=False)
    self.addCleanup(
        heketi_ops.heketi_node_disable,
        h_client, h_server, heketi_node_id, raise_on_error=False)
    self.addCleanup(
        openshift_ops.switch_oc_project,
        self._master, self.storage_project_name)

    if condition == 'delete':
        # Switch to openshift-monitoring project
        openshift_ops.switch_oc_project(
            self.ocp_master_node[0], self._prometheus_project_name)

        # Wait until prometheus reflects the newly added node
        for w in waiter.Waiter(timeout=60, interval=10):
            metric_result = self._fetch_metric_from_promtheus_pod(
                metric='heketi_nodes_count')
            node_count = sum(
                int(result.get('value')[1]) for result in metric_result)
            if node_count != initial_node_count:
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to get updated node details from prometheus")

        # Remove node from cluster: devices must be deleted first
        heketi_ops.heketi_node_disable(h_client, h_server, heketi_node_id)
        heketi_ops.heketi_node_remove(h_client, h_server, heketi_node_id)
        for device in heketi_node_info.get('devices'):
            heketi_ops.heketi_device_delete(
                h_client, h_server, device.get('id'))
        heketi_ops.heketi_node_delete(h_client, h_server, heketi_node_id)

    # Switch to openshift-monitoring project
    openshift_ops.switch_oc_project(
        self.ocp_master_node[0], self._prometheus_project_name)

    # Get final node count from prometheus metrics and wait for it to
    # move in the expected direction
    for w in waiter.Waiter(timeout=60, interval=10):
        metric_result = self._fetch_metric_from_promtheus_pod(
            metric='heketi_nodes_count')
        final_node_count = sum(
            int(result.get('value')[1]) for result in metric_result)
        if condition == 'delete':
            if final_node_count < node_count:
                break
        else:
            if final_node_count > initial_node_count:
                break
    if w.expired:
        raise exceptions.ExecutionError(
            "Failed to update node details in prometheus")
def test_metrics_workload_on_prometheus(self):
    """Validate metrics workload on prometheus

    Creates batches of PVCs + app pods with IO on registry-backed
    storage and verifies prometheus keeps serving PVC metrics and its
    own volume stays mounted.
    """
    # Skip test if the prometheus pods are not present
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    prometheus_pods = openshift_ops.oc_get_pods(
        self._master, selector=self._prometheus_resources_selector)
    if not prometheus_pods:
        # BUGFIX: skipTest() takes a single reason string; the previous
        # calls passed 'prometheus_pods' as an extra positional argument,
        # which raised TypeError instead of skipping.
        self.skipTest("Skipping test as prometheus pod is not present")
    if not self.registry_sc:
        self.skipTest("Skipping test as registry "
                      " storage details are not provided")
    self._registry_project = self.registry_sc.get(
        'restsecretnamespace')
    self.prefix = "autotest-{}".format(utils.get_random_str())

    # Get one of the prometheus pod name and respective pvc name
    prometheus_pod = list(prometheus_pods.keys())[0]
    pvc_custom = ":.spec.volumes[*].persistentVolumeClaim.claimName"
    pvc_name = openshift_ops.oc_get_custom_resource(
        self._master, "pod", pvc_custom, prometheus_pod)[0]
    self.assertTrue(
        pvc_name, "Failed to get PVC name for prometheus"
        " pod {}".format(prometheus_pod))
    self.verify_iscsi_sessions_and_multipath(
        pvc_name, prometheus_pod, rtype='pod',
        heketi_server_url=self._registry_heketi_server_url,
        is_registry_gluster=True)

    # Try to fetch metric from the prometheus pod
    self._fetch_metric_from_promtheus_pod(
        metric='kube_persistentvolumeclaim_info')

    # Create storage class on the registry project
    openshift_ops.switch_oc_project(
        self._master, self._registry_project)
    self.sc_name = self.create_storage_class(
        vol_name_prefix=self.prefix, glusterfs_registry=True)
    self.addCleanup(openshift_ops.switch_oc_project,
                    self._master, self._registry_project)

    # Create PVCs and app pods: 5 batches of 5 PVCs, each app pod
    # writing ~1MiB
    pvc_size, pvc_count, batch_count = 1, 5, 5
    for _ in range(batch_count):
        test_pvc_names = self.create_and_wait_for_pvcs(
            pvc_size, pvc_name_prefix=self.prefix,
            pvc_amount=pvc_count, sc_name=self.sc_name,
            timeout=600, wait_step=10)
        self.create_dcs_with_pvc(
            test_pvc_names, timeout=600, wait_step=5,
            dc_name_prefix="autotests-dc-with-app-io",
            space_to_use=1048576)

    # Check from the prometheus pod for the PVC space usage
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    mount_path = "/prometheus"
    cmd = "oc exec {0} -- df -PT {1} | grep {1}".format(
        prometheus_pod, mount_path)
    out = self.cmd_run(cmd)
    self.assertTrue(out, "Failed to get info about mounted volume. "
                         "Output is empty.")

    # Try to fetch metric from prometheus pod
    self._fetch_metric_from_promtheus_pod(
        metric='kube_persistentvolumeclaim_info')
    self._fetch_metric_from_promtheus_pod(
        metric='kube_pod_spec_volumes_persistentvolumeclaims_info')
    self.addCleanup(openshift_ops.switch_oc_project,
                    self._master, self._registry_project)
def test_restart_prometheus_glusterfs_pod(self):
    """Validate restarting glusterfs pod

    Deletes first the gluster pod on the active multipath path of a
    prometheus pod's block volume, then a second gluster pod, verifying
    after each respin that metrics remain fetchable and iscsi/multipath
    state recovers.
    """
    # Add check for CRS version: containerized gluster is required
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    if not self.is_containerized_gluster():
        self.skipTest(
            "Skipping this test case as CRS version check "
            "can not be implemented")

    # Get one of the prometheus pod name and respective pvc name
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    prometheus_pods = openshift_ops.oc_get_pods(
        self._master, selector=self._prometheus_resources_selector)
    if not prometheus_pods:
        # BUGFIX: skipTest() takes a single reason string; the previous
        # call passed 'prometheus_pods' as an extra positional argument,
        # which raised TypeError instead of skipping.
        self.skipTest("Skipping test as prometheus pod is not present")
    prometheus_pod = list(prometheus_pods.keys())[0]
    pvc_name = openshift_ops.oc_get_custom_resource(
        self._master, "pod",
        ":.spec.volumes[*].persistentVolumeClaim.claimName",
        prometheus_pod)[0]
    self.assertTrue(
        pvc_name,
        "Failed to get pvc name from {} pod".format(prometheus_pod))
    iqn, _, node = self.verify_iscsi_sessions_and_multipath(
        pvc_name, prometheus_pod, rtype='pod',
        heketi_server_url=self._registry_heketi_server_url,
        is_registry_gluster=True)

    # Get the ip of active path from the multipath topology
    devices = openshift_storage_libs.get_iscsi_block_devices_by_path(
        node, iqn)
    mpath = openshift_storage_libs.get_mpath_name_from_device_name(
        node, list(devices.keys())[0])
    mpath_dev = (
        openshift_storage_libs.get_active_and_enabled_devices_from_mpath(
            node, mpath))
    node_ip = devices[mpath_dev['active'][0]]

    # Get the name of gluster pod from the ip of the active path
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    gluster_pods = openshift_ops.get_ocp_gluster_pod_details(
        self._master)
    active_pod_name = list(
        filter(lambda pod: (pod["pod_host_ip"] == node_ip), gluster_pods)
    )[0]["pod_name"]
    err_msg = "Failed to get the gluster pod name {} with active path"
    self.assertTrue(active_pod_name, err_msg.format(active_pod_name))
    g_pods = [pod['pod_name'] for pod in gluster_pods]
    g_pods.remove(active_pod_name)
    # Respin the active-path pod first, then one non-active pod
    pod_list = [active_pod_name, g_pods[0]]
    for pod_name in pod_list:
        # Delete the glusterfs pods
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        self._fetch_metric_from_promtheus_pod(
            metric='heketi_device_brick_count')
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)
        g_pod_list_before = [
            pod["pod_name"]
            for pod in openshift_ops.get_ocp_gluster_pod_details(
                self._master)]

        openshift_ops.oc_delete(self._master, 'pod', pod_name)
        # NOTE(review): this file's helpers include
        # '_guster_pod_delete_cleanup'; confirm '_guster_pod_delete'
        # also exists and this is not a misspelled reference.
        self.addCleanup(
            self._guster_pod_delete, g_pod_list_before)

        # Wait for gluster pod to be absent
        openshift_ops.wait_for_resource_absence(
            self._master, 'pod', pod_name)

        # Try to fetch metric from prometheus pod
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        self._fetch_metric_from_promtheus_pod(
            metric='heketi_device_brick_count')

        # Wait for new pod to come up
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)
        self.assertTrue(self._get_newly_deployed_gluster_pod(
            g_pod_list_before), "Failed to get new pod")
        self._wait_for_gluster_pod_be_ready(g_pod_list_before)

        # Validate iscsi and multipath
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        self.verify_iscsi_sessions_and_multipath(
            pvc_name, prometheus_pod, rtype='pod',
            heketi_server_url=self._registry_heketi_server_url,
            is_registry_gluster=True)

        # Try to fetch metric from prometheus pod
        self._fetch_metric_from_promtheus_pod(
            metric='heketi_device_brick_count')