def all_healthy(self):
    """Wait until every pacemaker resource check reports healthy.

    All resource checks are evaluated on each attempt (no short-circuit).
    When at least one of them is not healthy the pcs resources table is
    re-read after a one second pause and the checks are repeated, for up
    to 360 attempts. The test is failed once the attempts are exhausted.

    :return: True as soon as all resources are healthy
    """
    for attempt_number in range(360):
        try:
            # build the full list first so that every check is executed
            statuses = [self.rabbitmq_resource_healthy(),
                        self.galera_resource_healthy(),
                        self.redis_resource_healthy(),
                        self.vips_resource_healthy(),
                        self.ha_proxy_cinder_healthy(),
                        self.ovn_resource_healthy()]
            if not all(statuses):
                LOG.info("pcs status check: not all resources are "
                         "in healthy state")
                raise PcsResourceException()
            LOG.info("pcs status checks: all resources are "
                     "in healthy state")
            return True
        except PcsResourceException:
            LOG.info('Retrying pacemaker resource checks attempt '
                     '{} of 360'.format(attempt_number))
            time.sleep(1)
            # reread pcs status
            self.pcs_df = get_pcs_resources_table()
    # exhausted all retries
    tobiko.fail('pcs cluster is not in a healthy state')
def find_msg_in_file(node, logfile, message, rotated=False):
    """Search for the message in the logfile

    The file is searched with ``grep`` (executed with sudo on the node).
    The message is shell-quoted and passed with ``-e`` so that messages
    containing spaces, shell metacharacters or a leading dash are handed
    to grep as a single pattern.

    :param node: Node the container is running on
    :type node: class: tobiko.openstack.topology.OpenStackTopologyNode
    :param logfile: Path of the logfile
    :type logfile: string
    :param message: Message to search for
    :type message: string
    :param rotated: Variable to flag that log file has to be rotated
        so the name will be ended by '.1'
    :type rotated: bool
    :return: True if a line of the file is equal to the message (ignoring
        surrounding whitespace) or False otherwise
    :rtype: bool
    """
    import shlex

    suffix = ".1" if rotated else ""
    LOG.debug(f'Searching for {message} in {logfile}{suffix} on {node.name}')
    # expect_exit_status=None: grep exits with 1 when nothing matches and
    # that must not be treated as a command failure
    result = sh.execute(
        f'grep -h -e {shlex.quote(message)} {logfile}{suffix}',
        ssh_client=node.ssh_client,
        expect_exit_status=None,
        sudo=True)
    if result.stderr:
        tobiko.fail(f'Failed to read {logfile} on {node.name}:\n'
                    f'{result.stderr}')
    # compare line by line so that a message logged more than once is
    # still reported as found
    return any(line.strip() == message
               for line in result.stdout.splitlines())
def get_overcloud_container(container_name=None, container_host=None,
                            partial_container_name=None):
    """Get a container object by name, optionally restricted to one host.

    :param container_name: exact container name to look for
    :param container_host: when given, only containers on this host match
    :param partial_container_name: pattern searched inside the container
        name (passed to ``Series.str.contains``). It takes precedence over
        ``container_name`` when a host is given, and is also used without
        a host when no ``container_name`` is passed.
    :return: the first matching entry of the ``container_object`` column;
        the test is failed when nothing matches
    """
    con_obj_df = list_containers_objects_df()
    use_partial = bool(partial_container_name) and (
        bool(container_host) or container_name is None)
    if use_partial:
        searched_name = partial_container_name
        name_matches = con_obj_df['container_name'].str.contains(
            partial_container_name)
    else:
        searched_name = container_name
        name_matches = con_obj_df['container_name'] == container_name
    selected_df = con_obj_df[name_matches]
    if container_host:
        selected_df = selected_df[
            selected_df['container_host'] == container_host]
    container_objects = selected_df['container_object']
    if not container_objects.empty:
        return container_objects.values[0]
    # report the name that was actually searched for
    host_info = ' on host {}'.format(container_host) if container_host else ''
    tobiko.fail('container {}{} not found!'.format(searched_name, host_info))
def assert_not_transmitted(self):
    """Fail the test when any package has been transmitted."""
    transmitted = self.transmitted
    if not transmitted:
        return
    tobiko.fail(
        "{transmitted!r} package(s) has been transmitted to "
        "{destination!r}",
        transmitted=transmitted,
        destination=self.destination)
def assert_not_replied(self):
    """Fail the test when any reply package has been received."""
    received = self.received
    if not received:
        return
    tobiko.fail(
        "{received!r} reply package(s) has been received from "
        "{destination!r}",
        received=received,
        destination=self.destination)
def test_public_ips(self):
    """Check every node replies to ping on a public IP no other node uses."""
    node_by_ip = {}
    for node in self.topology.nodes:
        ping.ping(node.public_ip).assert_replied()
        # setdefault returns the node already registered for this IP, if any
        other = node_by_ip.setdefault(node.public_ip, node)
        if other is node:
            continue
        tobiko.fail(f"Nodes {node.name} and {other.name} have the "
                    f"same IP: {node.public_ip}")
def test_hostnames(self):
    """Check every node hostname starts with the node name and is unique."""
    node_by_hostname = {}
    for node in self.topology.nodes:
        hostname = sh.get_hostname(ssh_client=node.ssh_client)
        self.assertTrue(hostname.startswith(node.name))
        # setdefault returns the node already registered for this hostname
        other = node_by_hostname.setdefault(hostname, node)
        if other is node:
            continue
        tobiko.fail(f"Nodes {node.name} and {other.name} have the "
                    f"same hostname: {hostname}")
def run_container_config_validations():
    """Check containers configuration in different scenarios.

    For every configured check, each parameter is read with ``crudini``
    inside the container on every node of the node group, and the test is
    failed when the expected value is not contained in the obtained one.
    """
    # TODO add here any generic configuration validation
    if neutron.has_ovn():
        param_validations = [
            {'section': 'ml2',
             'param': 'mechanism_drivers',
             'expected_value': 'ovn'},
            {'section': 'ml2',
             'param': 'type_drivers',
             'expected_value': 'geneve'},
            {'section': 'ovn',
             'param': 'ovn_l3_mode',
             'expected_value': 'True'},
            {'section': 'ovn',
             'param': 'ovn_metadata_enabled',
             'expected_value': 'True'},
        ]
    else:
        param_validations = [
            {'section': 'ml2',
             'param': 'mechanism_drivers',
             'expected_value': 'openvswitch'},
        ]
    config_checkings = [
        {'node_group': 'controller',
         'container_name': 'neutron_api',
         'config_file': '/etc/neutron/plugins/ml2/ml2_conf.ini',
         'param_validations': param_validations},
    ]

    container_runtime_name = get_container_runtime_name()
    for config_check in config_checkings:
        nodes = topology.list_openstack_nodes(
            group=config_check['node_group'])
        for node in nodes:
            for param_check in config_check['param_validations']:
                command = (
                    f"{container_runtime_name} exec -uroot "
                    f"{config_check['container_name']} crudini "
                    f"--get {config_check['config_file']} "
                    f"{param_check['section']} {param_check['param']}")
                execution = sh.execute(command,
                                       ssh_client=node.ssh_client,
                                       sudo=True)
                obtained_param = execution.stdout.strip()
                # substring match: the obtained value may list more items
                if param_check['expected_value'] in obtained_param:
                    continue
                tobiko.fail(f"Expected {param_check['param']} value: "
                            f"{param_check['expected_value']}\n"
                            f"Obtained {param_check['param']} value: "
                            f"{obtained_param}")
        LOG.info("Configuration verified:\n"
                 f"node group: {config_check['node_group']}\n"
                 f"container: {config_check['container_name']}\n"
                 f"config file: {config_check['config_file']}")
def test_network_namespaces(self):
    """Check no network namespace is listed twice on the same node."""
    for node in self.topology.nodes:
        ips_by_namespace = {}
        for namespace in ip.list_network_namespaces(
                ssh_client=node.ssh_client):
            ips = ip.list_ip_addresses(ssh_client=node.ssh_client,
                                       network_namespace=namespace)
            # a different object comes back only if the namespace was
            # already registered by a previous iteration
            other_ips = ips_by_namespace.setdefault(namespace, ips)
            if other_ips is ips:
                continue
            tobiko.fail(f"Duplicate network namespace {namespace} in "
                        f"node {node.name}: {other_ips}, {ips}")
def test_network_namespaces(self):
    """Check no network namespace is listed twice on the same node."""
    for node in self.topology.nodes:
        ips_by_namespace = {}
        for namespace in ip.list_network_namespaces(
                ssh_client=node.ssh_client):
            ips = ip.list_ip_addresses(ssh_client=node.ssh_client,
                                       network_namespace=namespace)
            # a different object comes back only if the namespace was
            # already registered by a previous iteration
            other_ips = ips_by_namespace.setdefault(namespace, ips)
            if other_ips is ips:
                continue
            tobiko.fail("Duplicate network namespace {!r} in node "
                        "{!r}: {!r}, {!r}",
                        namespace, node.name, other_ips, ips)
def assert_vlan_is_unreachable(self,
                               ip_version: int = None,
                               timeout: tobiko.Seconds = None,
                               ssh_client: ssh.SSHClientType = None):
    """Assert that none of the server VLAN fixed IPs can be pinged.

    :param ip_version: forwarded to ``list_vlan_fixed_ips`` to select the
        fixed IPs to check
    :param timeout: forwarded to ``ping.assert_unreachable_hosts``
    :param ssh_client: client used to execute ping; when None,
        ``self.vlan_ssh_proxy_client`` is used

    The test is failed when the server has no fixed IP on the VLAN port,
    because in that case there is nothing to verify.
    """
    fixed_ips = self.list_vlan_fixed_ips(ip_version=ip_version)
    if not fixed_ips:
        tobiko.fail(f'Server {self.stack_name} has no IP on VLAN port')
    if ssh_client is None:
        ssh_client = self.vlan_ssh_proxy_client
    ping.assert_unreachable_hosts(fixed_ips,
                                  ssh_client=ssh_client,
                                  timeout=timeout)
def stop_all_instances():
    """Shut off every server listed by nova.

    After each shut off request the function waits 3 seconds, logs the
    resulting server state and fails the test if it is not SHUTOFF.
    """
    for instance in nova.list_servers():
        stopped_instance = nova.shutoff_server(instance)
        time.sleep(3)
        instance_info = 'instance {} is {} on {}'.format(
            stopped_instance.name,
            stopped_instance.status,
            stopped_instance._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
        LOG.info(instance_info)
        if stopped_instance.status == 'SHUTOFF':
            continue
        tobiko.fail(instance_info)
def get_pcs_resources_table(timeout=720,
                            interval=2) -> pandas.DataFrame:
    """Get pcs status from a controller and parse its resources states.

    Example of the parsed ``pcs status resources`` lines:

        rabbitmq-bundle-0 (ocf::heartbeat:rabbitmq-cluster): Started controller-0
        ip-10.0.0.101 (ocf::heartbeat:IPaddr2): Started controller-1
        openstack-cinder-volume-docker-0 (ocf::heartbeat:docker): Started controller-0

    Parsing is retried every ``interval`` seconds until it succeeds or
    ``timeout`` seconds have elapsed; the test is failed when no table
    could be parsed.

    :param timeout: maximum time in seconds spent retrying
    :param interval: pause in seconds between two attempts
    :return: dataframe of pcs resources stats table with columns
        resource, resource_type, resource_state and overcloud_node
    """
    failures: typing.List[str] = []
    table = None
    start = time.time()
    ssh_client = get_random_controller_ssh_client()

    # prevent pcs table read failure while pacemaker is starting
    while time.time() - start < timeout:
        failures = []
        try:
            output = sh.execute("sudo pcs status resources |grep ocf",
                                ssh_client=ssh_client,
                                expect_exit_status=None).stdout
            # remove the first column when it only includes '*' characters
            output = output.replace('*', '').strip()
            stream = io.StringIO(output)
            # sep=r'\s+' is the documented equivalent of the deprecated
            # delim_whitespace=True
            table = pandas.read_csv(stream, sep=r'\s+', header=None)
            table.columns = ['resource', 'resource_type', 'resource_state',
                             'overcloud_node']
        except ValueError:
            # raised when the output is empty or has an unexpected number
            # of columns: collect the whole pcs status for the report
            pcs_status_raw = sh.execute("sudo pcs status ",
                                        ssh_client=ssh_client,
                                        expect_exit_status=None).stdout
            failures.append(f'pcs status table import failed : '
                            f'pcs status stdout:\n {pcs_status_raw}')
            LOG.info('Retrying , timeout at: {}'
                     .format(timeout - (time.time() - start)))
            time.sleep(interval)
        else:
            break
    # exhausted all retries (or the loop never ran because of the timeout)
    if failures or table is None:
        tobiko.fail('pcs status table import error\n' + '\n'.join(failures))

    LOG.debug("Got pcs status :\n%s", table)
    return table
def test_extra_dhcp_opts(self):
    """Check the port domain-name DHCP option ends up in resolv.conf.

    The ``domain-name`` extra DHCP option of the stack port is looked up
    (the test is failed when it is missing) and ``/etc/resolv.conf`` of
    the server must contain a ``search`` line with exactly that domain.
    """
    extra_dhcp_options = neutron.get_port_extra_dhcp_opts(
        self.stack.port_id)
    for option in extra_dhcp_options:
        if option['opt_name'] == 'domain-name':
            domain = option['opt_value'].replace('"', '')
            break
    else:
        tobiko.fail('No extra-dhcp-opt found for domain-name')
    vm_resolv_conf = sh.execute('cat /etc/resolv.conf',
                                ssh_client=self.stack.ssh_client).stdout
    # escape the domain so that its dots only match literal dots
    search_line = r'^search\s+{domain}$'.format(domain=re.escape(domain))
    self.assertIsNotNone(
        re.search(search_line, vm_resolv_conf, re.MULTILINE))
def assert_downloaded_file(file_name: str,
                           headers_file_name: str,
                           ssh_client: ssh.SSHClientType = None,
                           sudo: bool = None):
    """Check a downloaded file size against its saved headers.

    The last header entry read from ``headers_file_name`` is used. The
    test is failed when the headers file cannot be read. The file size is
    asserted only when the header provides a content length.
    """
    try:
        headers = read_headers_file(headers_file_name=headers_file_name,
                                    ssh_client=ssh_client,
                                    sudo=sudo)
        header = headers[-1]
    except sh.ShellCommandFailed as ex:
        tobiko.fail(f"Error reading headers file '{headers_file_name}': {ex}")
    else:
        file_size = header.content_length
        if file_size is None:
            # nothing to compare with
            return
        sh.assert_file_size(file_size=file_size,
                            file_name=file_name,
                            ssh_client=ssh_client,
                            sudo=sudo)
def basic_overcloud_processes_running(self):
    """Check that the oc_procs_df dataframe has all of the listed processes.

    Every name in ``self.processes_to_check`` must appear in the PROCESS
    column of ``self.oc_procs_df``. When one is missing the dataframe is
    rebuilt after a one second pause and the check is repeated, for up to
    600 attempts; the test is failed once the attempts are exhausted.

    :return: True as soon as all the processes are found
    """
    max_attempts = 600
    for attempt_number in range(max_attempts):
        try:
            for process_name in self.processes_to_check:
                # osp16/python3 process is "neutron-server:"
                if process_name == 'neutron-server' and \
                        self.oc_procs_df.query('PROCESS=="{}"'.format(
                            process_name)).empty:
                    process_name = 'neutron-server:'
                if not self.oc_procs_df.query(
                        'PROCESS=="{}"'.format(process_name)).empty:
                    LOG.info("overcloud processes status checks: "
                             "process {} is "
                             "in running state".format(process_name))
                    continue
                else:
                    LOG.info(
                        "Failure : overcloud processes status checks:"
                        "process {} is not running ".format(process_name))
                    raise OvercloudProcessesException(
                        process_error="process {} is not running ".format(
                            process_name))
            # if all procs are running we can return true
            return True
        except OvercloudProcessesException:
            # report the real number of attempts of this loop
            LOG.info('Retrying overcloud processes checks attempt '
                     '{} of {}'.format(attempt_number, max_attempts))
            time.sleep(1)
            self.oc_procs_df = overcloud.get_overcloud_nodes_dataframe(
                get_overcloud_node_processes_table)
    # exhausted all retries
    tobiko.fail('Not all overcloud processes are running !\n')
def check_ping_statistics(failure_limit=10):
    """Check the ping failures recorded in every VM ping log file.

    Each line of a log file is a JSON document; a line counts as a failure
    when its 'transmitted' and 'received' values differ. As soon as the
    failures of a file reach ``failure_limit`` the file is renamed to
    checked and the test is failed. Otherwise the file is renamed to
    checked after all its lines have been read.
    """
    # iterate over ping_vm_log files:
    for filename in list(get_vm_ping_log_files()):
        with io.open(filename, 'rt') as fd:
            LOG.info(f'checking ping log file: {filename}, '
                     f'failure_limit is :{failure_limit}')
            failure_counter = 0
            for raw_line in fd.readlines():
                ping_line = json.loads(raw_line.rstrip())
                if ping_line['transmitted'] == ping_line['received']:
                    continue
                failure_counter += 1
                LOG.info(f'found ping failure: {ping_line}')
                if failure_counter < failure_limit:
                    continue
                rename_ping_staistics_file_to_checked(filename)
                tobiko.fail(f'{failure_counter} pings failure found '
                            f'to vm fip destination: '
                            f'{ping_line["destination"]}')
            LOG.info(f'no failures in ping log file: {filename}')
            rename_ping_staistics_file_to_checked(filename)
def check_vm_evacuations(vms_df_old=None, compute_host=None, timeout=600,
                         interval=2, check_no_evacuation=False):
    """Compare the hosts of the VMs before and after an evacuation.

    The VMs dataframe of ``compute_host`` is re-read every ``interval``
    seconds until no VM of ``vms_df_old`` is considered failed or until
    ``timeout`` seconds have elapsed, in which case the test is failed.

    A VM is considered failed when its host is unchanged or, with
    ``check_no_evacuation`` set, when its host has changed.
    """
    failures = []
    start = time.time()

    while time.time() - start < timeout:
        vms_df_new = get_compute_vms_df(compute_host)
        failures = []
        for vm_id in vms_df_old.vm_id.to_list():
            old_bm_host = vm_location(vm_id, vms_df_old)
            new_vm_host = vm_location(vm_id, vms_df_new)

            if check_no_evacuation:
                is_failed = bool(old_bm_host != new_vm_host)
            else:
                is_failed = bool(old_bm_host == new_vm_host)
            if not is_failed:
                continue
            failures.append('failed vm evacuations: {}\n\n'.format(
                vm_info(vm_id, vms_df_old)))

        if not failures:
            LOG.info(vms_df_old.to_string())
            LOG.info('All vms were evacuated!')
            return

        LOG.info('Failed nova evacuation:\n {}'.format(failures))
        LOG.info('Not all nova vms evacuated ..')
        LOG.info('Retrying , timeout at: {}'
                 .format(timeout-(time.time() - start)))
        time.sleep(interval)

    # exhausted all retries
    if failures:
        tobiko.fail(
            'failed vm evacuations:\n{!s}', '\n'.join(failures))
def test_neutron_agents_are_alive(timeout=300., interval=5.) \
        -> tobiko.Selection[neutron.NeutronAgentType]:
    """Wait until every Neutron agent is reported alive.

    The agent list is requested again at every retry. Errors raised while
    listing the agents and agents not being alive are tolerated until the
    last attempt, because the Neutron server or its agents could still be
    unavailable after a disruption.

    :param timeout: forwarded to ``tobiko.retry``
    :param interval: forwarded to ``tobiko.retry``
    :return: the listed agents, all of them alive
    """
    for attempt in tobiko.retry(timeout=timeout, interval=interval):
        LOG.debug("Look for unhealthy Neutron agents...")
        try:
            # get Neutron agent list
            agents = neutron.list_agents()
        except (neutron.ServiceUnavailable,
                neutron.NeutronClientException,
                exceptions.connection.ConnectFailure) as ex:
            if attempt.is_last:
                raise
            else:
                # retry because Neutron server could still be unavailable
                # after a disruption
                LOG.debug(f"Waiting for neutron service... ({ex})")
                continue  # Let retry

        dead_agents = agents.with_items(alive=False)
        if dead_agents:
            # report only the agents that are not alive
            dead_agents_details = json.dumps(dead_agents, indent=4,
                                             sort_keys=True)
            if attempt.is_last:
                tobiko.fail("Unhealthy agent(s) found:\n"
                            f"{dead_agents_details}\n")
            else:
                # retry because some Neutron agent could still be unavailable
                # after a disruption
                LOG.debug("Waiting for Neutron agents to get alive...\n"
                          f"{dead_agents_details}")
                continue

        LOG.debug(f"All {len(agents)} Neutron agents are alive.")
        break
    else:
        raise RuntimeError("Retry loop broken")

    return agents
def assert_containers_running(group, expected_containers,
                              full_name=True, bool_check=False):
    """Assert that all containers specified in the list are running
    on the specified openstack group (controller or compute etc..).

    :param group: openstack nodes group whose nodes are checked
    :param expected_containers: names of the containers expected to run
    :param full_name: when True the container name must be equal to the
        expected one, otherwise the expected name is searched inside the
        container names
    :param bool_check: when True only return True or False without failing
    :return: True when all containers are running, False on mismatch when
        ``bool_check`` is set. None is returned when running on docker,
        where nothing is checked.
    """
    if is_docker():
        LOG.info('not checking common containers since we are on docker')
        return

    failures = []
    openstack_nodes = topology.list_openstack_nodes(group=group)
    for node in openstack_nodes:
        node_containers = list_node_containers(ssh_client=node.ssh_client)
        containers_list_df = pandas.DataFrame(
            get_container_states_list(node_containers),
            columns=['container_host', 'container_name', 'container_state'])
        # check that the containers are present
        LOG.info('node: {} containers list : {}'.format(
            node.name, containers_list_df.to_string(index=False)))
        for container in expected_containers:
            # get container attrs dataframe
            if full_name:
                container_attrs = containers_list_df.query(
                    'container_name == "{}"'.format(container))
            else:
                container_attrs = containers_list_df[
                    containers_list_df['container_name'].str.contains(
                        container)]
            # check if the container exists
            LOG.info('checking container: {}'.format(container))
            if container_attrs.empty:
                failures.append(
                    'expected container {} not found on node {} ! : \n\n'.
                    format(container, node.name))
                continue
            # if container exists, check it is running:
            # only one running container is expected
            container_running_attrs = container_attrs.query(
                'container_state=="running"')
            if container_running_attrs.empty:
                # several containers can match (e.g. partial names), so
                # report all their states instead of requiring exactly one
                container_states = ', '.join(
                    str(state)
                    for state in container_attrs.container_state.values)
                failures.append(
                    'expected container {} is not running on node {} , '
                    'its state is {}! : \n\n'.format(
                        container, node.name, container_states))
            elif len(container_running_attrs) > 1:
                failures.append(
                    'only one running container {} was expected on '
                    'node {}, but got {}! : \n\n'.format(
                        container, node.name,
                        len(container_running_attrs)))

    if failures:
        if bool_check:
            return False
        tobiko.fail('container states mismatched:\n{!s}',
                    '\n'.join(failures))

    LOG.info('All specified containers are in running state! ')
    return True
def assert_equal_containers_state(expected_containers_list=None,
                                  timeout=120, interval=2,
                                  recreate_expected=False):
    """Compare the current overcloud container states with expected ones.

    The expected states come from ``expected_containers_list`` when given,
    otherwise from the expected containers file. When ``recreate_expected``
    is set, or when neither a list nor the file is available, the current
    states are saved to the file and nothing is compared.

    The comparison is repeated every ``interval`` seconds, clearing the
    node containers cache in between, until the states are identical or
    ``timeout`` seconds have elapsed, in which case the test is failed.
    """
    # if we have a file or an explicit variable use that , otherwise create
    # and return
    if recreate_expected or (
            not expected_containers_list and
            not os.path.exists(expected_containers_file)):
        save_containers_state_to_file(list_containers())
        return
    elif expected_containers_list:
        expected_states = get_container_states_list(expected_containers_list)
        expected_containers_list_df = pandas.DataFrame(
            expected_states,
            columns=['container_host', 'container_name', 'container_state'])
    elif os.path.exists(expected_containers_file):
        expected_containers_list_df = pandas.read_csv(expected_containers_file)

    failures = []
    start = time.time()
    error_info = ('Output explanation: left_only is the original state, '
                  'right_only is the new state')

    while time.time() - start < timeout:
        failures = []
        actual_containers_list_df = list_containers_df()

        LOG.info('expected_containers_list_df: {} '.format(
            expected_containers_list_df.to_string(index=False)))
        LOG.info('actual_containers_list_df: {} '.format(
            actual_containers_list_df.to_string(index=False)))

        # execute a `dataframe` diff between the expected and actual containers
        changed_df = dataframe_difference(expected_containers_list_df,
                                          actual_containers_list_df)
        if changed_df.empty:
            LOG.info("assert_equal_containers_state :"
                     " OK, all containers are on the same state")
            return

        # some container changed its state: record it and retry
        failures.append(
            'expected containers changed state ! : '
            '\n\n{}\n{}'.format(changed_df.to_string(index=False),
                                error_info))
        LOG.info('container states mismatched:\n{}\n'.format(failures))
        time.sleep(interval)
        # clear cache to obtain new data
        list_node_containers.cache_clear()

    if failures:
        tobiko.fail('container states mismatched:\n{!s}', '\n'.join(
            failures))
def assert_not_transmitted(self):
    """Fail the test when any package has been transmitted."""
    if not self.transmitted:
        return
    tobiko.fail(f"{self.transmitted} package(s) has been "
                f"transmitted to {self.destination}")
def assert_unreachable_hosts(hosts, **params):
    """Fail the test when any of the given hosts is reachable.

    ``params`` are forwarded to ``_ping.list_reachable_hosts``.
    """
    reachable_hosts = _ping.list_reachable_hosts(hosts, **params)
    if not reachable_hosts:
        return
    tobiko.fail("Reached host(s): {!r}", reachable_hosts)
def assert_reachable_hosts(hosts, **params):
    """Fail the test when any of the given hosts is not reachable.

    ``params`` are forwarded to ``_ping.list_unreachable_hosts``.
    """
    unreachable_hosts = _ping.list_unreachable_hosts(hosts, **params)
    if not unreachable_hosts:
        return
    tobiko.fail("Unable to reach host(s): {!r}", unreachable_hosts)
def assert_not_replied(self):
    """Fail the test when any reply package has been received."""
    if not self.received:
        return
    tobiko.fail(f"{self.received} reply package(s) has been received "
                f"from {self.destination}")