def force_prompt_wait(self, remaining=None): """ One of the challenges we face is that kernel log messages can appear half way through a shell prompt. So, if things are taking a while, we send a newline along to maybe provoke a new prompt. We wait for half the timeout period and then wait for one tenth of the timeout 6 times (so we wait for 1.1 times the timeout period overall). :return: the index into the connection.prompt_str list """ logger = logging.getLogger('dispatcher') prompt_wait_count = 0 if not remaining: return self.wait() # connection_prompt_limit partial_timeout = remaining / 2.0 while True: try: return self.raw_connection.expect(self.prompt_str, timeout=partial_timeout) except (pexpect.TIMEOUT, TestError) as exc: if prompt_wait_count < 6: logger.warning( '%s: Sending %s in case of corruption. Connection timeout %s, retry in %s', exc, self.check_char, seconds_to_str(remaining), seconds_to_str(partial_timeout)) logger.debug("pattern: %s", self.prompt_str) prompt_wait_count += 1 partial_timeout = remaining / 10 self.sendline(self.check_char) continue else: # TODO: is someone expecting pexpect.TIMEOUT? raise
def force_prompt_wait(self, remaining=None): """ One of the challenges we face is that kernel log messages can appear half way through a shell prompt. So, if things are taking a while, we send a newline along to maybe provoke a new prompt. We wait for half the timeout period and then wait for one tenth of the timeout 6 times (so we wait for 1.1 times the timeout period overall). :return: the index into the connection.prompt_str list """ logger = logging.getLogger('dispatcher') prompt_wait_count = 0 if not remaining: return self.wait() # connection_prompt_limit partial_timeout = remaining / 2.0 while True: try: return self.raw_connection.expect(self.prompt_str, timeout=partial_timeout) except (pexpect.TIMEOUT, TestError) as exc: if prompt_wait_count < 6: logger.warning( '%s: Sending %s in case of corruption. Connection timeout %s, retry in %s', exc, self.check_char, seconds_to_str(remaining), seconds_to_str(partial_timeout)) logger.debug("pattern: %s", self.prompt_str) prompt_wait_count += 1 partial_timeout = remaining / 10 self.sendline(self.check_char) continue else: # TODO: is someone expecting pexpect.TIMEOUT? raise
def run_actions(self, connection, max_end_time): # pylint: disable=too-many-branches,too-many-statements,too-many-locals for action in self.actions: failed = False namespace = action.parameters.get('namespace', 'common') # Begin the action try: parent = self.parent if self.parent else self.job with action.timeout(parent, max_end_time) as action_max_end_time: # Add action start timestamp to the log message # Log in INFO for root actions and in DEBUG for the other actions timeout = seconds_to_str(action_max_end_time - action.timeout.start) msg = 'start: %s %s (timeout %s) [%s]' % ( action.level, action.name, timeout, namespace) if self.parent is None: action.logger.info(msg) else: action.logger.debug(msg) new_connection = action.run(connection, action_max_end_time) except LAVAError as exc: action.logger.exception(str(exc)) #str_errormsg = "[%s]%s" % (exc.error_type,exc.error_help) #self.repexp2up(str_errormsg) self.repexp2up(str(exc)) # allows retries without setting errors, which make the job incomplete. failed = True action.results = {'fail': str(exc)} self._diagnose(connection) raise except Exception as exc: action.logger.exception(traceback.format_exc()) #self.repexp2up(traceback.format_exc()) # allows retries without setting errors, which make the job incomplete. failed = True action.results = {'fail': str(exc)} # Raise a LAVABug that will be correctly classified later raise LAVABug(str(exc)) finally: # Add action end timestamp to the log message duration = round(action.timeout.elapsed_time) msg = "end: %s %s (duration %s) [%s]" % ( action.level, action.name, seconds_to_str(duration), namespace) if self.parent is None: action.logger.info(msg) else: action.logger.debug(msg) # set results including retries and failed actions action.log_action_results(fail=failed) if new_connection: connection = new_connection return connection
def run(self, connection, max_end_time): self.sleep = self.parameters.get("failure_retry_interval", self.sleep) retries = 0 has_failed = False self.call_protocols() while retries < self.max_retries: retries += 1 try: connection = self.pipeline.run_actions(connection, max_end_time) if "repeat" not in self.parameters: # failure_retry returns on first success. repeat returns only at max_retries. return connection # Do not retry for LAVABug (as it's a bug in LAVA) except (InfrastructureError, JobError, TestError) as exc: has_failed = True # Print the error message self.logger.error( "%s failed: %d of %d attempts. '%s'", self.name, retries, self.max_retries, exc, ) # Cleanup the action to allow for a safe restart self.cleanup(connection) # re-raise if this is the last loop if retries == self.max_retries: self.errors = "%s retries failed for %s" % (retries, self.name) raise # Wait some time before retrying time.sleep(self.sleep) # Restart max_end_time or the retry on a timeout fails with duration < 0 max_end_time += time.time() - self.timeout.start self.timeout.start = time.time() self.logger.warning( "Retrying: %s %s (timeout %s)", self.level, self.name, seconds_to_str(max_end_time - self.timeout.start), ) # If we are repeating, check that all repeat were a success. if has_failed: # tried and failed raise JobError("%s retries failed for %s" % (retries, self.name)) return connection
def wait(self, connection, max_end_time=None): if not connection: return if not connection.connected: self.logger.debug("Already disconnected") return if not max_end_time: max_end_time = self.timeout.duration + self.timeout.start remaining = max_end_time - time.time() # FIXME: connection.prompt_str needs to be always a list # bootloader_prompt is one which does not get set that way # also need functionality to clear the list at times. self.logger.debug("%s: Wait for prompt %s (timeout %s)", self.name, connection.prompt_str, seconds_to_str(remaining)) if self.force_prompt: return connection.force_prompt_wait(remaining) else: return connection.wait(max_end_time)
def test_seconds_to_str(): assert seconds_to_str(0) == "00:00:00" assert seconds_to_str(1) == "00:00:01" assert seconds_to_str(62) == "00:01:02" assert seconds_to_str(147) == "00:02:27" assert seconds_to_str(3641) == "01:00:41"
def run_actions(self, connection, max_end_time): for action in self.actions: failed = False namespace = action.parameters.get("namespace", "common") # Begin the action try: parent = self.parent if self.parent else self.job with action.timeout(parent, max_end_time) as action_max_end_time: # Add action start timestamp to the log message # Log in INFO for root actions and in DEBUG for the other actions timeout = seconds_to_str(action_max_end_time - action.timeout.start) msg = "start: %s %s (timeout %s) [%s]" % ( action.level, action.name, timeout, namespace, ) if self.parent is None: action.logger.info(msg) else: action.logger.debug(msg) new_connection = action.run(connection, action_max_end_time) except LAVATimeoutError as exc: action.logger.exception(str(exc)) # allows retries without setting errors, which make the job incomplete. failed = True action.results = {"fail": str(exc)} if action.timeout.can_skip(action.parameters): if self.parent is None: action.logger.warning( "skip_timeout is set for %s - continuing to next action block." % (action.name)) else: raise new_connection = None else: raise TestError(str(exc)) except LAVAError as exc: action.logger.exception(str(exc)) # allows retries without setting errors, which make the job incomplete. failed = True action.results = {"fail": str(exc)} self._diagnose(connection) raise except Exception as exc: action.logger.exception(traceback.format_exc()) # allows retries without setting errors, which make the job incomplete. failed = True action.results = {"fail": str(exc)} # Raise a LAVABug that will be correctly classified later raise LAVABug(str(exc)) finally: # Add action end timestamp to the log message duration = round(action.timeout.elapsed_time) msg = "end: %s %s (duration %s) [%s]" % ( action.level, action.name, seconds_to_str(duration), namespace, ) if self.parent is None: action.logger.info(msg) else: action.logger.debug(msg) # set results including retries and failed actions action.log_action_results(fail=failed) if new_connection: connection = new_connection return connection
def parse_failures(cls, connection, action=None, max_end_time=None): # pylint: disable=too-many-branches """ Returns a list of dictionaries of matches for failure strings and other kernel messages. If kernel_prompts are in use, a success result is returned containing details of which of KERNEL_FREE_UNUSED_MSG and KERNEL_FREE_INIT_MSG were parsed. If the returned dictionary only contains this success message, then the the kernel-messages action can be deemed as pass. The init prompts exclude these messages, so a successful init parse returns an empty dictionary as init is generally processed by actions which do their own result parsing. If the returned list is not empty, add it to the results of the calling action. When multiple messages are identified, the list contains one dictionary for each message found. Always returns a list, the list may be empty. """ results = [] init = False if not connection: return results if cls.MESSAGE_CHOICES[cls.FREE_UNUSED][1] in connection.prompt_str: if cls.MESSAGE_CHOICES[cls.FREE_INIT][1] in connection.prompt_str: init = True remaining = max_end_time - time.time() while True: if action: action.logger.debug("[%s] Waiting for messages, (timeout %s)", action.name, seconds_to_str(remaining)) try: index = connection.force_prompt_wait(remaining) except (pexpect.EOF, pexpect.TIMEOUT, TestError): if action: msg = "Failed to match - connection timed out handling messages." action.logger.warning(msg) action.errors = msg break if action and index: action.logger.debug("Matched prompt #%s: %s", index, connection.prompt_str[index]) message = connection.raw_connection.after if index == cls.ALERT or index == cls.TRACE: if action: action.logger.warning( "%s: %s" % (action.name, cls.MESSAGE_CHOICES[index][2])) # ALERT or TRACE may need a newline to force a prompt connection.sendline(connection.check_char) # this is allowable behaviour, not a failure. results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': message[:METADATA_MESSAGE_LIMIT] }) continue elif index == cls.PANIC or index == cls.EXCEPTION: if action: action.logger.error( "%s %s" % (action.name, cls.MESSAGE_CHOICES[index][2])) results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': message[:METADATA_MESSAGE_LIMIT] }) continue elif index and index >= cls.FREE_UNUSED: if init and index <= cls.FREE_INIT: results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': 'kernel-messages' }) continue else: results.append({'success': connection.prompt_str[index]}) break else: break # allow calling actions to pick up failures # without overriding their own success result, if any. return results
def parse_failures(cls, connection, action=None, max_end_time=None): # pylint: disable=too-many-branches """ Returns a list of dictionaries of matches for failure strings and other kernel messages. If kernel_prompts are in use, a success result is returned containing details of which of KERNEL_FREE_UNUSED_MSG and KERNEL_FREE_INIT_MSG were parsed. If the returned dictionary only contains this success message, then the the kernel-messages action can be deemed as pass. The init prompts exclude these messages, so a successful init parse returns an empty dictionary as init is generally processed by actions which do their own result parsing. If the returned list is not empty, add it to the results of the calling action. When multiple messages are identified, the list contains one dictionary for each message found. Always returns a list, the list may be empty. """ results = [] # wrap inside a dict to use in results res = "pass" halt = None init = False start = time.time() if not connection: return results if cls.MESSAGE_CHOICES[cls.FREE_UNUSED][1] in connection.prompt_str: if cls.MESSAGE_CHOICES[cls.FREE_INIT][1] in connection.prompt_str: init = True remaining = max_end_time - time.time() while True: if action: action.logger.debug("[%s] Waiting for messages, (timeout %s)", action.name, seconds_to_str(remaining)) try: index = connection.force_prompt_wait(remaining) except (pexpect.EOF, pexpect.TIMEOUT, TestError): if action: msg = "Failed to match - connection timed out handling messages." action.logger.warning(msg) action.errors = msg break if action and index: action.logger.debug("Matched prompt #%s: %s", index, connection.prompt_str[index]) message = connection.raw_connection.after if index == cls.TRACE or index == cls.EXCEPTION: res = "fail" if action: action.logger.warning( "%s: %s" % (action.name, cls.MESSAGE_CHOICES[index][2])) # TRACE may need a newline to force a prompt connection.sendline(connection.check_char) # this is allowable behaviour, not a failure. results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': message[:METADATA_MESSAGE_LIMIT] }) continue elif index == cls.PANIC: res = "fail" if action: action.logger.error( "%s %s" % (action.name, cls.MESSAGE_CHOICES[index][2])) results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': message[:METADATA_MESSAGE_LIMIT] }) halt = message[:METADATA_MESSAGE_LIMIT] break elif index and index >= cls.FREE_UNUSED: if init and index <= cls.FREE_INIT: results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': 'kernel-messages' }) continue else: results.append({'success': connection.prompt_str[index]}) break else: break # record a specific result for the kernel messages for later debugging. if action and isinstance(action.logger, YAMLLogger): action.logger.results({ # pylint: disable=no-member "definition": "lava", "namespace": action.parameters.get('namespace', 'common'), "case": cls.name, "level": action.level, "duration": "%.02f" % (time.time() - start), "result": res, "extra": { 'extra': results } }) if halt: # end this job early, kernel is unable to continue raise JobError(halt) # allow calling actions to also pick up failures # without overriding their own success result, if any. return results
def parse_failures( cls, connection, action, max_end_time, fail_msg, auto_login=False ): """ Returns a list of dictionaries of matches for failure strings and other kernel messages. If kernel_prompts are in use, a success result is returned containing details of which of KERNEL_FREE_UNUSED_MSG and KERNEL_FREE_INIT_MSG were parsed. If the returned dictionary only contains this success message, then the the kernel-messages action can be deemed as pass. The init prompts exclude these messages, so a successful init parse returns an empty dictionary as init is generally processed by actions which do their own result parsing. If the returned list is not empty, add it to the results of the calling action. When multiple messages are identified, the list contains one dictionary for each message found. Always returns a list, the list may be empty. """ results = [] # wrap inside a dict to use in results res = "pass" halt = None init = False start = time.time() if not connection: return results if cls.MESSAGE_CHOICES[cls.FREE_UNUSED][1] in connection.prompt_str: if cls.MESSAGE_CHOICES[cls.FREE_INIT][1] in connection.prompt_str: init = True remaining = max_end_time - start if remaining < 0: raise LAVABug( "Invalid time remaining: max: %s start: %s" % (max_end_time, start) ) while True: action.logger.debug( "[%s] Waiting for messages, (timeout %s)", action.name, seconds_to_str(remaining), ) try: if action.force_prompt: index = connection.force_prompt_wait(remaining) else: index = connection.wait(max_end_time) except (pexpect.EOF, pexpect.TIMEOUT, TestError): msg = "Failed to match - connection timed out handling messages." action.logger.warning(msg) action.errors = msg break if index: action.logger.debug( "Matched prompt #%s: %s", index, connection.prompt_str[index] ) message = connection.raw_connection.after if index in [cls.TRACE, cls.EXCEPTION, cls.WARNING, cls.BUG]: res = "fail" action.logger.warning( "%s: %s" % (action.name, cls.MESSAGE_CHOICES[index][2]) ) # TRACE may need a newline to force a prompt (only when not using auto-login) if not auto_login: connection.sendline(connection.check_char) # this is allowable behaviour, not a failure. results.append( { cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], "message": message[:METADATA_MESSAGE_LIMIT], } ) continue elif index == cls.PANIC: res = "fail" action.logger.error( "%s %s" % (action.name, cls.MESSAGE_CHOICES[index][2]) ) results.append( { cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], "message": message[:METADATA_MESSAGE_LIMIT], } ) halt = message[:METADATA_MESSAGE_LIMIT] break elif fail_msg and index and fail_msg == connection.prompt_str[index]: res = "fail" # user has declared this message to be terminal for this test job. halt = "Matched job-specific failure message: '%s'" % fail_msg action.logger.error("%s %s" % (action.name, halt)) results.append({"message": "kernel-messages"}) elif index and index == cls.FREE_UNUSED or index == cls.FREE_INIT: if init and index <= cls.FREE_INIT: results.append( { cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][ 1 ], "message": "kernel-messages", } ) continue else: results.append({"success": connection.prompt_str[index]}) break else: break # record a specific result for the kernel messages for later debugging. if isinstance(action.logger, YAMLLogger): action.logger.results( { "definition": "lava", "namespace": action.parameters.get("namespace", "common"), "case": cls.name, "level": action.level, "duration": "%.02f" % (time.time() - start), "result": res, "extra": {"extra": results}, } ) if halt: # end this job early, kernel is unable to continue raise JobError(halt) # allow calling actions to also pick up failures # without overriding their own success result, if any. return results
def parse_failures(cls, connection, action=None, max_end_time=None): # pylint: disable=too-many-branches """ Returns a list of dictionaries of matches for failure strings and other kernel messages. If kernel_prompts are in use, a success result is returned containing details of which of KERNEL_FREE_UNUSED_MSG and KERNEL_FREE_INIT_MSG were parsed. If the returned dictionary only contains this success message, then the the kernel-messages action can be deemed as pass. The init prompts exclude these messages, so a successful init parse returns an empty dictionary as init is generally processed by actions which do their own result parsing. If the returned list is not empty, add it to the results of the calling action. When multiple messages are identified, the list contains one dictionary for each message found. Always returns a list, the list may be empty. """ results = [] init = False if not connection: return results if cls.MESSAGE_CHOICES[cls.FREE_UNUSED][1] in connection.prompt_str: if cls.MESSAGE_CHOICES[cls.FREE_INIT][1] in connection.prompt_str: init = True remaining = max_end_time - time.time() while True: if action: action.logger.debug( "[%s] Waiting for messages, (timeout %s)", action.name, seconds_to_str(remaining)) try: index = connection.force_prompt_wait(remaining) except (pexpect.EOF, pexpect.TIMEOUT, TestError): if action: msg = "Failed to match - connection timed out handling messages." action.logger.warning(msg) action.errors = msg break if action and index: action.logger.debug("Matched prompt #%s: %s", index, connection.prompt_str[index]) message = connection.raw_connection.after if index == cls.ALERT or index == cls.TRACE: if action: action.logger.warning("%s: %s" % (action.name, cls.MESSAGE_CHOICES[index][2])) # ALERT or TRACE may need a newline to force a prompt connection.sendline(connection.check_char) # this is allowable behaviour, not a failure. results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': message[:METADATA_MESSAGE_LIMIT] }) continue elif index == cls.PANIC or index == cls.EXCEPTION: if action: action.logger.error("%s %s" % (action.name, cls.MESSAGE_CHOICES[index][2])) results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': message[:METADATA_MESSAGE_LIMIT] }) continue elif index and index >= cls.FREE_UNUSED: if init and index <= cls.FREE_INIT: results.append({ cls.MESSAGE_CHOICES[index][2]: cls.MESSAGE_CHOICES[index][1], 'message': 'kernel-messages' }) continue else: results.append({ 'success': connection.prompt_str[index] }) break else: break # allow calling actions to pick up failures # without overriding their own success result, if any. return results