def test_keepalived_failover(self):
    ha_router_id = self.ha_stack.network_stack.gateway_id
    self.agents = [self.wait_for_active_ha_l3_agent()]
    keepalived_pids = self.get_cmd_pids('keepalived', ha_router_id,
                                        min_pids_per_host=2)
    ping.ping_until_received(self.ha_stack.ip_address).assert_replied()
    active_agent_host = self.agents[0]['host']

    # Need to make sure that the 'keepalived-state-change' process is UP
    # before we kill the 'keepalived' process, as it can break the agent
    # status otherwise. So check that the keepalived pids are equal for
    # two attempts of listing them
    ka_state_cmd = f'neutron-keepalived-state-change.*{ha_router_id}'
    ka_state_pids = {}
    for _ in tobiko.retry(timeout=120., interval=5.):
        new_ka_state_pids = self.get_cmd_pids('/usr/bin/python',
                                              ka_state_cmd,
                                              min_pids_per_host=1)
        if ka_state_pids == new_ka_state_pids:
            break
        else:
            ka_state_pids = new_ka_state_pids

    self.kill_pids(active_agent_host, keepalived_pids[active_agent_host])
    ping.ping_until_received(self.ha_stack.ip_address).assert_replied()
    # Need to make sure that 'keepalived' is spawned again after it has
    # been killed
    self.assertNotEqual(keepalived_pids,
                        self.get_cmd_pids('keepalived', ha_router_id,
                                          min_pids_per_host=2))
def wait_for_processes(timeout: tobiko.Seconds = None,
                       sleep_interval: tobiko.Seconds = None,
                       ssh_client: ssh.SSHClientType = None,
                       is_cirros: bool = None,
                       **list_params):
    for attempt in tobiko.retry(timeout=timeout,
                                interval=sleep_interval,
                                default_interval=5.):
        processes = list_processes(ssh_client=ssh_client,
                                   is_cirros=is_cirros,
                                   **list_params)
        if not processes:
            break

        hostname = _hostname.get_hostname(ssh_client=ssh_client)
        process_lines = [
            ' {pid} {command}'.format(pid=process.pid,
                                      command=process.command)
            for process in processes]
        if attempt.is_last:
            raise PsWaitTimeout(timeout=timeout, hostname=hostname,
                                processes='\n'.join(process_lines))
        LOG.debug(f"Waiting for process(es) on host {hostname}...\n" +
                  '\n'.join(process_lines))
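# --- Illustrative usage (not part of the original module) ---
# A minimal sketch of how wait_for_processes() above might be called,
# assuming the extra keyword arguments are forwarded to list_processes()
# as process filters (the 'command' filter name is an assumption) and
# that ssh_client=None targets the default host.
def _example_wait_for_ping_to_finish():
    # Poll every 5 seconds for up to 2 minutes until no 'ping' process
    # remains on the target host; PsWaitTimeout is raised otherwise.
    wait_for_processes(ssh_client=None,
                       timeout=120.,
                       sleep_interval=5.,
                       command='ping')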
def test_port_ips(self, ip_version: typing.Optional[int] = None):
    """Checks port IPs have been assigned to the server via the DHCP protocol"""
    port_ips = set(
        neutron.list_device_ip_addresses(
            device_id=self.stack.server_id,
            network_id=self.stack.network_stack.network_id,
            need_dhcp=self.stack.need_dhcp,
            ip_version=ip_version))
    if port_ips:
        # verify neutron port IPs and VM port IPs match
        # when a VM connected to the external network has just been
        # created, it may need some time to receive its IPv6 address
        for attempt in tobiko.retry(timeout=60., interval=4.):
            server_ips = set(
                ip.list_ip_addresses(scope='global',
                                     ssh_client=self.stack.ssh_client))
            server_ips &= port_ips  # ignore other server IPs
            LOG.debug("Neutron IPs and VM IPs should match...")
            try:
                self.assertEqual(
                    port_ips, server_ips,
                    f"Server {self.stack.server_id} is missing port "
                    f"IP(s): {port_ips - server_ips}")
                break
            except self.failureException:
                attempt.check_limits()
    elif ip_version:
        self.skipTest(f"Server has no port IPv{ip_version} address to be"
                      " tested")
    else:
        self.skipTest("Server has no port IP address to be tested")
def wait_for_members_to_be_reachable(self,
                                     interval: tobiko.Seconds = None,
                                     timeout: tobiko.Seconds = None):
    members = [self.server_stack, self.other_server_stack]
    if len(members) < 1:
        return

    # Wait for members to be reachable from localhost
    last_reached_id = 0
    for attempt in tobiko.retry(timeout=timeout,
                                interval=interval,
                                default_interval=5.,
                                default_timeout=members[0].wait_timeout):
        try:
            for member in members[last_reached_id:]:
                octavia.check_members_balanced(
                    members_count=1,
                    ip_address=member.ip_address,
                    protocol=self.lb_protocol,
                    port=self.lb_port,
                    requests_count=1)
                last_reached_id += 1  # prevent retrying same member again
        except sh.ShellCommandFailed:
            if attempt.is_last:
                raise
            LOG.info(
                "Waiting for members to have HTTP service available...")
            continue
        else:
            break
    else:
        raise RuntimeError("Members couldn't be reached!")
def wait_until_stack_deleted(self,
                             check=True,
                             cached=True,
                             timeout: tobiko.Seconds = None,
                             interval: tobiko.Seconds = None):
    # check stack has been completely deleted
    for attempt in tobiko.retry(timeout=timeout,
                                interval=interval,
                                default_timeout=self.wait_timeout,
                                default_interval=self.wait_interval):
        # Ensure to refresh stack status
        stack = self.wait_for_delete_complete(check=check,
                                              cached=cached,
                                              timeout=attempt.time_left,
                                              interval=attempt.interval)
        if stack is None:
            LOG.debug(f"Stack {self.stack_name} disappeared")
            break

        assert stack.stack_status == DELETE_COMPLETE
        if attempt.is_last:
            raise HeatStackDeletionFailed(
                name=self.stack_name,
                observed=stack.stack_status,
                expected={DELETE_COMPLETE},
                status_reason=stack.stack_status_reason)

        cached = False
        LOG.debug("Waiting for deleted stack to disappear: '%s'",
                  self.stack_name)
    else:
        raise RuntimeError("Retry loop broken")
def create_stack(self, retry: tobiko.Retry = None) -> stacks.Stack:
    if config.get_bool_env('TOBIKO_PREVENT_CREATE'):
        stack = self.validate_created_stack()
    else:
        for attempt in tobiko.retry(retry,
                                    count=self.retry_count,
                                    timeout=self.retry_timeout,
                                    interval=0.):
            try:
                stack = self.try_create_stack()
                break
            except InvalidStackError:
                LOG.exception(f"Error creating stack '{self.stack_name}'",
                              exc_info=1)
                if attempt.is_last:
                    raise
                self.delete_stack()
                # Sleep a random amount of time to make conflicting
                # concurrent creations less likely to occur
                sleep_time = random_sleep_time(
                    min_time=self.min_retry_interval,
                    max_time=self.max_retry_interval)
                LOG.debug(
                    f"Failed creating stack '{self.stack_name}' "
                    f"(attempt {attempt.number} of "
                    f"{attempt.count}). It will retry after "
                    f"{sleep_time} seconds", exc_info=1)
                time.sleep(sleep_time)
        else:
            raise RuntimeError('Retry loop broken')
    return stack
def test_retry_with_timeout_and_big_interval(self):
    mock_time = self.patch_time()
    attempts = []
    try:
        for attempt in tobiko.retry(timeout=9., interval=3.):
            attempts.append(attempt)
    except tobiko.RetryTimeLimitError as ex:
        self.assertEqual(
            "Retry time limit exceeded "
            f"({attempt.details})", str(ex))
    else:
        self.fail("RetryTimeLimitError not raised")

    expected = [
        tobiko.retry_attempt(number=i + 1,
                             timeout=9.,
                             interval=3.,
                             start_time=mock_time.start_time,
                             elapsed_time=elapsed_time)
        for i, elapsed_time in enumerate([0., 4., 7., 10.])
    ]
    self.assertEqual(expected, attempts)
    mock_time.sleep.assert_has_calls(
        [mock.call(2.), mock.call(1.), mock.call(1.)])
def setUp(self):
    # pylint: disable=no-member
    super(OctaviaBasicFaultTest, self).setUp()

    # Wait for Octavia objects to be active
    LOG.info('Waiting for member '
             f'{self.listener_stack.server_stack.stack_name} and '
             f'for member '
             f'{self.listener_stack.other_server_stack.stack_name} '
             f'to be created...')
    self.listener_stack.wait_for_active_members()

    self.loadbalancer_stack.wait_for_octavia_service()

    self.listener_stack.wait_for_members_to_be_reachable()

    # For 5 minutes we ignore specific exceptions as we know
    # that Octavia resources are being provisioned
    for attempt in tobiko.retry(timeout=300.):
        try:
            octavia.check_members_balanced(
                pool_id=self.listener_stack.pool_id,
                ip_address=self.loadbalancer_stack.floating_ip_address,
                lb_algorithm=self.listener_stack.lb_algorithm,
                protocol=self.listener_stack.lb_protocol,
                port=self.listener_stack.lb_port)
            break
        except (octavia.RoundRobinException,
                octavia.TrafficTimeoutError,
                sh.ShellCommandFailed):
            LOG.exception(f"Traffic didn't reach all members after "
                          f"#{attempt.number} attempts and "
                          f"{attempt.elapsed_time} seconds")
            if attempt.is_last:
                raise
def run_operation(self):
    self.is_rebooted = False
    self.start_time = None
    for attempt in tobiko.retry(
            timeout=self.timeout,
            default_timeout=self.default_wait_timeout,
            default_count=self.default_wait_count,
            default_interval=self.default_wait_interval):
        try:
            channel = self.ssh_client.connect(
                connection_timeout=attempt.time_left,
                retry_count=1)
            LOG.info("Executing reboot command on host "
                     f"'{self.hostname}' (command='{self.command}')... ")
            self.start_time = tobiko.time()
            channel.exec_command(str(self.command))
        except Exception as ex:
            if attempt.time_left > 0.:
                LOG.debug(f"Unable to reboot remote host "
                          f"(time_left={attempt.time_left}): {ex}")
            else:
                LOG.exception(f"Unable to reboot remote host: {ex}")
                raise RebootHostTimeoutError(
                    hostname=self.hostname or self.ssh_client.host,
                    timeout=attempt.timeout) from ex
        else:
            LOG.info(f"Host '{self.hostname}' is rebooting "
                     f"(command='{self.command}').")
            break
        finally:
            # Ensure we close connection after rebooting command
            self.ssh_client.close()
def wait_processes_destroyed(self, command_filter, pids_per_host,
                             timeout=120, interval=2):
    '''Wait for processes to be terminated on hosts

    Make sure that all processes from the list are terminated or return
    an error otherwise. A tricky situation may happen when a different
    process is spawned with the same PID, so it also needs to be checked
    against `command_filter`.

    :param command_filter: Pattern to be found in process command details
    :type command_filter: string
    :param pids_per_host: Dictionary with hostnames as a key and list of
        PIDs as a value
    :type pids_per_host: dict
    :param timeout: Time to wait till each process will be terminated
    :type timeout: int
    :param interval: Time to sleep between attempts
    :type interval: int
    '''
    LOG.debug(f'Waiting for processes to be finished: {pids_per_host}')
    for host, pids in pids_per_host.items():
        for pid in pids:
            retry = tobiko.retry(timeout=timeout, interval=interval)
            for _ in retry:
                LOG.debug(f'Check if {pid} has been terminated on {host}')
                if self.is_destroyed(pid, command_filter, host):
                    LOG.debug(f'Process {pid} finished on {host}')
                    break
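# --- Illustrative usage (not part of the original module) ---
# A hypothetical call to wait_processes_destroyed() above, following the
# shapes described in its docstring: hostnames map to lists of PIDs, and
# the command filter guards against a recycled PID belonging to an
# unrelated process. Host names and PIDs are made up for illustration.
def _example_wait_keepalived_gone(self):
    pids_per_host = {
        'controller-0': [2304, 2305],
        'controller-1': [1901, 1902],
    }
    self.wait_processes_destroyed(command_filter='keepalived',
                                  pids_per_host=pids_per_host,
                                  timeout=120,
                                  interval=2)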
def test_0_vlan_ip_addresses(self):
    """Check Nova server VLAN port IP addresses"""
    self.stack.ensure_server_status('ACTIVE')
    expected = set(self.stack.list_vlan_fixed_ips())
    for attempt in tobiko.retry():
        actual = set(
            ip.list_ip_addresses(device=self.stack.vlan_device,
                                 ssh_client=self.stack.ssh_client,
                                 scope='global'))
        unexpected = actual - expected
        if unexpected:
            self.fail("Unexpected IP address assigned to VLAN port: "
                      f"{unexpected}")

        missing = expected - actual
        if missing:
            if attempt.is_last:
                self.fail("IP addresses not assigned to VLAN port: "
                          f"{missing}")
            else:
                LOG.debug("IP addresses still not assigned to VLAN port: "
                          f"{missing}")
        else:
            break
    else:
        raise RuntimeError("Broken retry loop")
    self.assertEqual(set(expected), set(actual))
def _stop_octavia_main_services(self, controller: OpenStackTopologyNode,
                                excluded_services: typing.List[str]):
    """Stops the provided octavia services.

    This method stops the provided octavia services, except for the ones
    which are in excluded_services.

    After it runs the "stop command" (e.g. `systemctl stop`), it makes
    sure that the stopped services no longer appear among the running
    Octavia services.

    It then sends traffic to validate Octavia's functionality.
    """

    # Preparing the services to stop
    services_to_stop = octavia.OCTAVIA_SERVICES

    if excluded_services:
        services_to_stop = [
            service for service in services_to_stop
            if (service not in excluded_services)
        ]

    # Stopping the Octavia services
    for service in services_to_stop:
        command = f"systemctl stop {service}"
        sh.execute(command, ssh_client=controller.ssh_client, sudo=True)
        log_msg = f"Stopping {service} on {controller.name}"
        LOG.info(log_msg)

    # Making sure the Octavia services were stopped
    octavia_active_units = self._list_octavia_services(
        controller.ssh_client)
    for service in services_to_stop:
        err_msg = f'{service} was not stopped on {controller.name}'
        self.assertTrue(service not in octavia_active_units, err_msg)

    self.loadbalancer_stack.wait_for_octavia_service()

    # For 30 seconds we ignore the OctaviaClientException as we know
    # that Octavia services are being stopped and restarted
    for attempt in tobiko.retry(timeout=30.):
        try:
            octavia.check_members_balanced(
                pool_id=self.listener_stack.pool_id,
                ip_address=self.loadbalancer_stack.floating_ip_address,
                lb_algorithm=self.listener_stack.lb_algorithm,
                protocol=self.listener_stack.lb_protocol,
                port=self.listener_stack.lb_port)
            break
        except octavia.OctaviaClientException:
            LOG.exception(f"Octavia service was unavailable after "
                          f"#{attempt.number} attempts and "
                          f"{attempt.elapsed_time} seconds")
            if attempt.is_last:
                raise
def create_client(self):  # noqa: C901
    for _ in tobiko.retry(timeout=60., interval=5.):
        try:
            podman_remote_socket = self.discover_podman_socket()
            username = self.ssh_client.connect_parameters['username']
            host = self.ssh_client.connect_parameters["hostname"]
            socket = podman_remote_socket
            podman_remote_socket_uri = f'unix:/tmp/podman.sock_{host}'
            remote_uri = f'ssh://{username}@{host}{socket}'
            if podman_version_3():
                # check if a ssh tunnel exists, if not create one
                psall = str(subprocess.check_output(('ps', '-ef')))
                if f'ssh -L /tmp/podman.sock_{host}' not in psall:
                    if os.path.exists(f"/tmp/podman.sock_{host}"):
                        subprocess.call(
                            ['rm', '-f', f'/tmp/podman.sock_{host}'])
                    # start a background ssh tunnel with the remote host
                    subprocess.call([
                        'ssh', '-L',
                        f'/tmp/podman.sock_{host}:'
                        f'/run/podman/podman.sock',
                        host, '-N', '-f'
                    ])
                    for _ in tobiko.retry(timeout=60., interval=1.):
                        if os.path.exists(f'/tmp/podman.sock_{host}'):
                            break
                client = podman.PodmanClient(
                    base_url=podman_remote_socket_uri)
                if client.ping():
                    LOG.info('container_client is online')
            else:
                client = _podman1.Client(  # pylint: disable=E1101
                    uri=podman_remote_socket_uri,
                    remote_uri=remote_uri,
                    identity_file='~/.ssh/id_rsa')
                if client.system.ping():
                    LOG.info('container_client is online')
            return client
        except (ConnectionRefusedError, ConnectionResetError):
            # let the retry loop try again
            continue
def communicate(self, stdin=None, stdout=True, stderr=True,
                timeout: tobiko.Seconds = None,
                receive_all=False,
                buffer_size=None):
    timeout = tobiko.to_seconds(timeout)

    # Avoid waiting for data in the first loop
    poll_interval = 0.
    streams = _io.select_opened_files([
        stdin and self.stdin,
        stdout and self.stdout,
        stderr and self.stderr
    ])
    for attempt in tobiko.retry(timeout=timeout):
        if not self._is_communicating(
                streams=streams, send=stdin, receive=receive_all):
            break

        # Remove closed streams
        streams = _io.select_opened_files(streams)

        # Select ready streams
        read_ready, write_ready = _io.select_files(files=streams,
                                                   timeout=poll_interval)
        if read_ready or write_ready:
            # Avoid waiting for data the next time
            poll_interval = 0.
            if self.stdin in write_ready:
                # Write data to remote STDIN
                stdin = self._write_to_stdin(stdin)
                if not stdin:
                    streams.remove(self.stdin)
            if self.stdout in read_ready:
                # Read data from remote STDOUT
                stdout = self._read_from_stdout(buffer_size=buffer_size)
                if not stdout:
                    streams.remove(self.stdout)
            if self.stderr in read_ready:
                # Read data from remote STDERR
                stderr = self._read_from_stderr(buffer_size=buffer_size)
                if not stderr:
                    streams.remove(self.stderr)
        else:
            self._check_communicate_timeout(attempt=attempt,
                                            timeout=timeout)
            # Wait for data in the following loops
            poll_interval = self.parameters.poll_interval
            LOG.debug(f"Waiting for process data {poll_interval} "
                      f"seconds... \n"
                      f" command: {self.command}\n"
                      f" attempt: {attempt.details}\n"
                      f" streams: {streams}")
def wait_for_metadata_status(self, count=None, timeout=60., interval=2.,
                             is_reachable: typing.Optional[bool] = None):
    for attempt in tobiko.retry(timeout=timeout,
                                interval=interval,
                                count=count):
        if is_reachable is not None:
            try:
                self.assert_metadata_is_reachable(is_reachable)
            except self.failureException:
                # re-raises failureException when reaching retry limits
                attempt.check_limits()
            else:
                break
def request_number(self, timeout=30.) -> int:
    connection = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    with connection:
        for attempt in tobiko.retry(timeout=timeout):
            try:
                connection.connect(get_sock_file())
                break
            except (ConnectionRefusedError, FileNotFoundError) as ex:
                if attempt.is_last:
                    raise RuntimeError('Server not running') from ex
        message = connection.recv(4096)
        return int(message)
def wait_for_active_ha_l3_agent(self) -> AgentType:
    ha_router_id = self.ha_stack.network_stack.gateway_id
    for attempt in tobiko.retry(timeout=180., interval=5.):
        agents = neutron.list_l3_agent_hosting_routers(ha_router_id)
        try:
            active_agent = agents.with_items(ha_state='active').unique
            break
        except (tobiko.MultipleObjectsFound, tobiko.ObjectNotFound):
            attempt.check_limits()
            continue
    return active_agent
def wait_for_stack_status(
        self,
        expected_status: typing.Container[str],
        check=True,
        cached=True,
        timeout: tobiko.Seconds = None,
        interval: tobiko.Seconds = None) \
        -> typing.Optional[stacks.Stack]:
    """Waits for the stack to reach the given status."""
    for attempt in tobiko.retry(timeout=timeout,
                                interval=interval,
                                default_timeout=self.wait_timeout,
                                default_interval=self.wait_interval):
        if cached:
            cached = False
            stack = self.stack or self.get_stack()
        else:
            stack = self.get_stack()
        stack_status = getattr(stack, 'stack_status', DELETE_COMPLETE)

        if stack_status in expected_status:
            LOG.debug(f"Stack '{self.stack_name}' reached expected "
                      f"status: '{stack_status}'")
            break

        if not stack_status.endswith('_IN_PROGRESS'):
            LOG.warning(f"Stack '{self.stack_name}' reached unexpected "
                        f"status: '{stack_status}'")
            break

        if attempt.is_last:
            LOG.warning(f"Timed out waiting for stack '{self.stack_name}' "
                        f"status to change from '{stack_status}' to "
                        f"'{expected_status}'.")
            break

        LOG.debug(f"Waiting for stack '{self.stack_name}' status to "
                  f"change from '{stack_status}' to "
                  f"'{expected_status}'...")
    else:
        raise RuntimeError('Retry loop broken')

    if stack is not None:
        self._log_stack_status(stack)

    if check:
        if stack is None:
            if DELETE_COMPLETE not in expected_status:
                raise HeatStackNotFound(name=self.stack_name)
        else:
            check_stack_status(stack, expected_status)
    return stack
def ensure_nova_quota_limits(project: keystone.ProjectType = None,
                             user: keystone.UserType = None,
                             client: _client.NovaClientType = None,
                             retry_timeout: tobiko.Seconds = None,
                             retry_interval: tobiko.Seconds = None,
                             **required_quotas: int):
    if not required_quotas:
        return

    client = _client.nova_client(client)
    project = keystone.get_project_id(project=project,
                                      session=client.client.session)
    user = user and keystone.get_user_id(user=user) or None
    if user:
        # Must increase project limits before user ones
        ensure_nova_quota_limits(project=project, client=client,
                                 **required_quotas)

    for attempt in tobiko.retry(timeout=retry_timeout,
                                interval=retry_interval,
                                default_timeout=60.,
                                default_interval=3.):
        actual_limits, expected_limits = get_nova_quota_limits_increase(
            project=project, user=user, client=client,
            extra_increase=10 // attempt.number,
            **required_quotas)
        if expected_limits:
            if attempt.is_last:
                raise EnsureNovaQuotaLimitsError(
                    project=project,
                    actual_limits=actual_limits,
                    expected_limits=expected_limits)
            LOG.info(f"Increase Nova quota limit(s) (project={project}, "
                     f"user={user}): {actual_limits} -> "
                     f"{expected_limits}...")
            try:
                set_nova_quota_set(project=project, user=user,
                                   client=client, **expected_limits)
            except Exception:
                if attempt.is_last:
                    raise
                LOG.exception("Error increasing Nova quota set limits: "
                              f"{expected_limits}")
        else:
            LOG.debug(f"Required Nova quota limits are OK: "
                      f"{required_quotas}")
            break
    else:
        raise RuntimeError("Broken retry loop")
def test_reboot_amphora_compute_node(self):
    amphora_compute_host = octavia.get_amphora_compute_node(
        loadbalancer_id=self.loadbalancer_stack.loadbalancer_id,
        lb_port=self.listener_stack.lb_port,
        lb_protocol=self.listener_stack.lb_protocol,
        ip_address=self.loadbalancer_stack.floating_ip_address)

    LOG.debug('Rebooting compute node...')

    # Rebooting the Amphora's compute node will initiate a failover
    amphora_compute_host.reboot_overcloud_node()

    LOG.debug('Compute node has been rebooted')

    # Wait for the LB to be updated
    try:
        self.loadbalancer_stack.wait_for_update_loadbalancer(timeout=30)
    except tobiko.RetryTimeLimitError:
        LOG.info('The restarted servers reached ACTIVE status after the'
                 ' LB finished its update process, hence no exception is'
                 ' being raised even though the update timeout was'
                 ' reached.')

    self.loadbalancer_stack.wait_for_active_loadbalancer()

    LOG.debug(f'Load Balancer {self.loadbalancer_stack.loadbalancer_id} '
              f'is ACTIVE')

    # Wait for Octavia objects' provisioning status to be ACTIVE
    self.listener_stack.wait_for_active_members()

    # Verify Octavia functionality
    # For 5 minutes we ignore specific exceptions as we know
    # that Octavia resources are being provisioned/migrated
    for attempt in tobiko.retry(timeout=300.):
        try:
            octavia.check_members_balanced(
                pool_id=self.listener_stack.pool_id,
                ip_address=self.loadbalancer_stack.floating_ip_address,
                lb_algorithm=self.listener_stack.lb_algorithm,
                protocol=self.listener_stack.lb_protocol,
                port=self.listener_stack.lb_port)
            break
        except (octavia.RoundRobinException,
                octavia.TrafficTimeoutError,
                sh.ShellCommandFailed):
            LOG.exception(f"Traffic didn't reach all members after "
                          f"#{attempt.number} attempts and "
                          f"{attempt.elapsed_time} seconds")
            if attempt.is_last:
                raise
def check_computes_vms_running_via_virsh():
    """check all vms are running via virsh list command"""
    for compute in topology.list_openstack_nodes(group='compute'):
        hostname = get_fqdn_from_topology_node(compute)
        retry = tobiko.retry(timeout=120, interval=5)
        for vm_id in get_compute_vms_df(hostname)['vm_id'].to_list():
            for _ in retry:
                if check_vm_running_via_virsh(compute, vm_id):
                    LOG.info(f"{vm_id} is running ok on "
                             f"{compute.hostname}")
                    break
                else:
                    LOG.info(f"{vm_id} is not in running state on "
                             f"{compute.hostname}")
def get_client(self, ssh_client):
    for attempt in tobiko.retry(timeout=60.0, interval=5.0):
        try:
            client = self._get_client(ssh_client=ssh_client)
            break
        # TODO: choose a better exception type
        except Exception:
            if attempt.is_last:
                raise
            LOG.debug('Unable to connect to docker server', exc_info=1)
            ssh.reset_default_ssh_port_forward_manager()
    else:
        raise RuntimeError("Broken retry loop")
    return client
def kill_rabbitmq_service():
    """kill a rabbit process on a random controller,
    check in pacemaker it is down"""
    if tripleo_topology.is_composable_roles_env():
        nodes = topology.list_openstack_nodes(group='messaging')
    else:
        nodes = topology.list_openstack_nodes(group='controller')
    node = random.choice(nodes)
    sh.execute(kill_rabbit, ssh_client=node.ssh_client)
    LOG.info('kill rabbit: {} on server: {}'.format(kill_rabbit, node.name))
    retry = tobiko.retry(timeout=30, interval=5)
    for _ in retry:
        if not (pacemaker.PacemakerResourcesStatus()
                .rabbitmq_resource_healthy()):
            return
def test_retry_when_succeed(self):
    mock_time = self.patch_time()
    attempts = []
    for attempt in tobiko.retry():
        attempts.append(attempt)
        break  # this marks a success

    expected = [
        tobiko.retry_attempt(number=1,
                             start_time=mock_time.start_time,
                             elapsed_time=0.)
    ]
    self.assertEqual(expected, attempts)
    mock_time.sleep.assert_not_called()
def create_process(self):
    """Execute command on a remote host using SSH client"""
    command = str(self.command)
    ssh_client = self.ssh_client
    parameters = self.parameters
    tobiko.check_valid_type(ssh_client, ssh.SSHClientFixture)
    tobiko.check_valid_type(parameters, SSHShellProcessParameters)
    environment = parameters.environment
    for attempt in tobiko.retry(
            timeout=self.parameters.timeout,
            default_count=self.parameters.retry_count,
            default_interval=self.parameters.retry_interval,
            default_timeout=self.parameters.retry_timeout):
        timeout = attempt.time_left
        details = (f"command='{command}', "
                   f"login={ssh_client.login}, "
                   f"timeout={timeout}, "
                   f"attempt={attempt}, "
                   f"environment={environment}")
        LOG.debug(f"Create remote process... ({details})")
        try:
            client = ssh_client.connect()
            process = client.get_transport().open_session()
            if environment:
                variables = " ".join(
                    f"{name}={shlex.quote(value)}"
                    for name, value in environment.items())
                command = variables + " " + command
            process.exec_command(command)
            LOG.debug(f"Remote process created. ({details})")
            return process
        except Exception:
            # Before doing anything else cleanup SSH connection
            ssh_client.close()
            LOG.debug(f"Error creating remote process. ({details})",
                      exc_info=1)
            try:
                attempt.check_limits()
            except tobiko.RetryTimeLimitError as ex:
                LOG.debug(f"Timed out creating remote process. "
                          f"({details})")
                raise _exception.ShellTimeoutExpired(command=command,
                                                     stdin=None,
                                                     stdout=None,
                                                     stderr=None,
                                                     timeout=timeout) from ex
def test_2_delete_server(self):
    server = self.ensure_server(status='ACTIVE')
    self.stack.assert_is_reachable()

    nova.delete_server(server.id)
    for _ in tobiko.retry(timeout=60., interval=3.):
        try:
            server = nova.get_server(server_id=server.id)
        except nova.ServerNotFoundError:
            LOG.debug(f"Server '{server.id}' deleted")
            break
        else:
            LOG.debug(f"Waiting for server deletion:\n"
                      f" - server.id='{server.id}'\n"
                      f" - server.status='{server.status}'")
    self.stack.assert_is_unreachable()
def request_galera_sst():
    """remove_one_grastate_galera,
    check that sst is requested by a node with grastate"""
    node, date = remove_one_grastate_galera()
    bootstrapDate = datetime.strptime(date, '%a %b %d %H:%M:%S %Y')
    retry = tobiko.retry(timeout=30, interval=5)
    for _ in retry:
        sst_req = sh.execute(galera_sst_request,
                             ssh_client=node.ssh_client).stdout
        if sst_req:
            break
    sstDate = datetime.strptime(
        re.findall(r"\d{4}-\d{,2}-\d{,2}\s*\d{,2}:\d{,2}:\d{,2}",
                   sst_req)[-1], '%Y-%m-%d %H:%M:%S')
    if bootstrapDate > sstDate:
        raise TimestampException
def kill_all_galera_services():
    """kill all galera processes,
    check in pacemaker it is down"""
    if tripleo_topology.is_composable_roles_env():
        nodes = topology.list_openstack_nodes(group='database')
    else:
        nodes = topology.list_openstack_nodes(group='controller')
    for node in nodes:
        sh.execute(kill_galera, ssh_client=node.ssh_client)
        LOG.info('kill galera: {} on server: {}'.format(
            kill_galera, node.name))
    retry = tobiko.retry(timeout=30, interval=5)
    for _ in retry:
        if not (pacemaker.PacemakerResourcesStatus()
                .galera_resource_healthy()):
            return
def get_console_output(server: typing.Optional[ServerType] = None,
                       server_id: typing.Optional[str] = None,
                       timeout: tobiko.Seconds = None,
                       interval: tobiko.Seconds = None,
                       length: typing.Optional[int] = None,
                       client: NovaClientType = None) -> \
        typing.Optional[str]:
    if length is not None:
        length = min(length, MAX_SERVER_CONSOLE_OUTPUT_LENGTH)
    else:
        length = MAX_SERVER_CONSOLE_OUTPUT_LENGTH

    server_id = get_server_id(server=server, server_id=server_id)
    for attempt in tobiko.retry(timeout=timeout,
                                interval=interval,
                                default_timeout=60.,
                                default_interval=5.):
        try:
            output = nova_client(client).servers.get_console_output(
                server=server_id, length=length)
        except (TypeError, novaclient.exceptions.NotFound):
            # Only active servers have console output
            server = get_server(server_id=server_id)
            if server.status != 'ACTIVE':
                LOG.debug(f"Server '{server_id}' has no console output "
                          f"(status = '{server.status}').")
                break
            else:
                # For some reason it could happen that the resulting body
                # cannot be translated to a JSON object and is converted
                # to None
                LOG.exception(f"Error getting server '{server_id}' console "
                              "output")
        else:
            if output:
                LOG.debug(f"got server '{server_id}' console output "
                          f"(length = {len(output)}).")
                return output

        if attempt.is_last:
            LOG.info(f"No console output produced by server '{server_id}' "
                     f"after {attempt.elapsed_time} seconds")
            break

        LOG.debug(f"Waiting for server '{server_id}' console output...")

    return None
def execute(self,
            retry_count: int = None,
            retry_timeout: tobiko.Seconds = None,
            retry_interval: tobiko.Seconds = None) -> \
        sh.ShellExecuteResult:
    for attempt in tobiko.retry(count=retry_count,
                                timeout=retry_timeout,
                                interval=retry_interval,
                                default_count=1):
        self.start()
        result = self.wait(check=attempt.is_last)
        if result.exit_status == 0:
            break
        self.stop()
    else:
        raise RuntimeError("Retry loop broken")
    return result