def _wait_for_reboot(self, old_boot_id):
    logging.info("Client is rebooting")
    logging.info("Waiting for client to halt")
    if not self.host.wait_down(self.host.WAIT_DOWN_REBOOT_TIMEOUT,
                               old_boot_id=old_boot_id):
        err = "%s failed to shutdown after %d"
        err %= (self.host.hostname, self.host.WAIT_DOWN_REBOOT_TIMEOUT)
        raise error.AutotestRunError(err)
    logging.info("Client down, waiting for restart")
    if not self.host.wait_up(self.host.DEFAULT_REBOOT_TIMEOUT):
        # since the reboot failed, hard reset the machine once, if
        # possible, before failing this control file
        warning = "%s did not come back up, hard resetting"
        warning %= self.host.hostname
        logging.warning(warning)
        try:
            self.host.hardreset(wait=False)
        except (AttributeError, error.AutoservUnsupportedError) as detail:
            warning = ("Hard reset unsupported on %s: %s" %
                       (self.host.hostname, detail))
            logging.warning(warning)
        raise error.AutotestRunError(
                "%s failed to boot after %ds" %
                (self.host.hostname, self.host.DEFAULT_REBOOT_TIMEOUT))
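
# Illustrative sketch (not part of the original module): _wait_for_reboot()
# relies on the boot id changing across a reboot so that wait_down() can tell
# "still the old boot" apart from "came back up too quickly".  The host
# object, its reboot() call, and the timeouts below are assumptions for
# illustration only, not the real Host API.
def _example_reboot_handshake(host, down_timeout=1800, up_timeout=3600):
    old_boot_id = host.get_boot_id()     # snapshot before triggering the reboot
    host.reboot(wait=False)              # assumed asynchronous reboot trigger
    # wait_down() is expected to return True once the host is unreachable or
    # reports a boot id different from old_boot_id.
    if not host.wait_down(down_timeout, old_boot_id=old_boot_id):
        raise RuntimeError("%s failed to shut down" % host.hostname)
    if not host.wait_up(up_timeout):
        raise RuntimeError("%s failed to boot" % host.hostname)
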
def execute_section(self, section, timeout, stderr_redirector,
                    client_disconnect_timeout):
    logging.info("Executing %s/bin/autotest %s/control phase %d",
                 self.autodir, self.autodir, section)

    if self.background:
        result = self._execute_in_background(section, timeout)
    else:
        result = self._execute_daemon(section, timeout, stderr_redirector,
                                      client_disconnect_timeout)

    last_line = stderr_redirector.last_line

    # check if we failed hard enough to warrant an exception
    if result.exit_status == 1:
        err = error.AutotestRunError("client job was aborted")
    elif not self.background and not result.stderr:
        err = error.AutotestRunError(
                "execute_section %s failed to return anything\n"
                "stdout:%s\n" % (section, result.stdout))
    else:
        err = None

    # log something if the client failed AND never finished logging
    if err and not self.is_client_job_finished(last_line):
        self.log_unexpected_abort(stderr_redirector)

    if err:
        raise err
    else:
        return stderr_redirector.last_line
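
# Illustrative sketch (not part of the original module): execute_section()
# returns the last status-log line seen from the client, and callers classify
# it with is_client_job_finished()/is_client_job_rebooting().  The checks
# below only show the general idea of inspecting autotest status-log lines;
# the patterns used by the real helpers in this module may differ.
import re

def _example_is_finished(last_line):
    # A completed client job closes its status log with an "END ..." record.
    return bool(re.match(r'^\t*END ', last_line or ''))

def _example_is_rebooting(last_line):
    # A client-initiated reboot surfaces as a reboot.start status record.
    return 'reboot.start' in (last_line or '')
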
def execute_control(self, timeout=None, client_disconnect_timeout=None):
    if not self.background:
        collector = log_collector(self.host, self.tag, self.results_dir)
        hostname = self.host.hostname
        remote_results = collector.client_results_dir
        local_results = collector.server_results_dir
        self.host.job.add_client_log(hostname, remote_results,
                                     local_results)
        job_record_context = self.host.job.get_record_context()

    section = 0
    start_time = time.time()

    logger = client_logger(self.host, self.tag, self.results_dir)
    try:
        while not timeout or time.time() < start_time + timeout:
            if timeout:
                section_timeout = start_time + timeout - time.time()
            else:
                section_timeout = None
            boot_id = self.host.get_boot_id()
            last = self.execute_section(section, section_timeout,
                                        logger, client_disconnect_timeout)
            if self.background:
                return
            section += 1
            if self.is_client_job_finished(last):
                logging.info("Client complete")
                return
            elif self.is_client_job_rebooting(last):
                try:
                    self._wait_for_reboot(boot_id)
                except error.AutotestRunError as e:
                    self.host.job.record("ABORT", None, "reboot", str(e))
                    self.host.job.record("END ABORT", None, None, str(e))
                    raise
                continue

            # if we reach here, something unexpected happened
            self.log_unexpected_abort(logger)

            # give the client machine a chance to recover from a crash
            self.host.wait_up(
                    self.host.HOURS_TO_WAIT_FOR_RECOVERY * 3600)
            msg = ("Aborting - unexpected final status message from "
                   "client on %s: %s\n") % (self.host.hostname, last)
            raise error.AutotestRunError(msg)
    finally:
        logger.close()
        if not self.background:
            collector.collect_client_job_results()
            collector.remove_redundant_client_logs()
            state_file = os.path.basename(self.remote_control_file
                                          + '.state')
            state_path = os.path.join(self.results_dir, state_file)
            self.host.job.postprocess_client_state(state_path)
            self.host.job.remove_client_log(hostname, remote_results,
                                            local_results)
            job_record_context.restore()

    # should only get here if we timed out
    assert timeout
    raise error.AutotestTimeoutError()
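
# Illustrative sketch (not part of the original module): the loop above hands
# each client section whatever remains of the overall time budget.  A
# standalone version of that bookkeeping, with run_section as a hypothetical
# callable returning True when the client reports completion, looks roughly
# like this.
import time

def _example_run_sections(run_section, timeout=None):
    start_time = time.time()
    section = 0
    while not timeout or time.time() < start_time + timeout:
        if timeout:
            # the per-section budget shrinks as sections complete
            section_timeout = start_time + timeout - time.time()
        else:
            section_timeout = None
        if run_section(section, section_timeout):
            return          # client reported completion
        section += 1
    raise RuntimeError("timed out after %r seconds" % timeout)
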
def execute_control(self, timeout=None, client_disconnect_timeout=None):
    collector = log_collector(self.host, self.tag, self.results_dir)
    hostname = self.host.hostname
    remote_results = collector.client_results_dir
    local_results = collector.server_results_dir
    self.host.job.add_client_log(hostname, remote_results, local_results)
    job_record_context = self.host.job.get_record_context()

    logger = client_logger(self.host, self.tag, self.results_dir)
    try:
        boot_id = self.host.get_boot_id()
        last = self._really_execute_control(timeout, logger,
                                            client_disconnect_timeout)
        if self.is_client_job_finished(last):
            logging.info("Client complete")
            return
        elif self.is_client_job_rebooting(last):
            # TODO(crbug.com/684311) This feature is never used. Validate
            # and drop this case.
            m = 'chromeos/autotest/errors/client_test_triggered_reboot'
            metrics.Counter(m).increment()
            self.host.job.record("ABORT", None, "reboot",
                                 'client triggered reboot is unsupported')
            self.host.job.record("END ABORT", None, None,
                                 'client triggered reboot is unsupported')
            return

        # If a test fails without probable cause, we try to bucket its
        # failure into one of two categories.  If we can determine the
        # current state of the device and it is suspicious, we close the
        # status lines indicating a failure.  If we either cannot determine
        # the state of the device, or it appears totally healthy, we give up
        # and abort.
        try:
            self._diagnose_dut(boot_id)
        except AutotestDeviceError as e:
            # The test's status lines are tailed into our log, with
            # indentation, from the client job on the DUT.  So if the DUT
            # goes down unexpectedly we'll end up with a malformed status
            # log unless we manually unwind the status stack.  Ideally we
            # would write a wrapper like the server_job methods run_reboot
            # and run_group, but those expect reboots and we don't.
            self.host.job.record('FAIL', None, None, str(e))
            self.host.job.record('END FAIL', None, None)
            self.host.job.record('END GOOD', None, None)
            self.host.job.failed_with_device_error = True
            return
        except AutotestAbort as e:
            self.host.job.record('ABORT', None, None, str(e))
            self.host.job.record('END ABORT', None, None)

            # give the client machine a chance to recover from a crash
            self.host.wait_up(
                    self.host.HOURS_TO_WAIT_FOR_RECOVERY * 3600)

            logging.debug('Unexpected final status message from '
                          'client %s: %s', self.host.hostname, last)
            # The line 'last' may contain sensitive phrases, like
            # 'END GOOD', which break the tko parser.  So the error message
            # excludes it, since it will be recorded to status.log.
            msg = ("Aborting - unexpected final status message from "
                   "client on %s\n") % self.host.hostname
            raise error.AutotestRunError(msg)
    finally:
        logging.debug('Autotest job finishes running. Below is the '
                      'post-processing operations.')
        logger.close()
        collector.collect_client_job_results()
        collector.remove_redundant_client_logs()
        state_file = os.path.basename(self.remote_control_file + '.state')
        state_path = os.path.join(self.results_dir, state_file)
        self.host.job.postprocess_client_state(state_path)
        self.host.job.remove_client_log(hostname, remote_results,
                                        local_results)
        job_record_context.restore()
        logging.debug('Autotest job finishes.')

    # should only get here if we timed out
    assert timeout
    raise error.AutotestTimeoutError()
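
# Illustrative sketch (not part of the original module): the bucketing
# described in the comment above needs a diagnosis step that decides whether
# the DUT itself looks suspicious.  ExampleDeviceError, ExampleAbort, and the
# reachability/boot-id probes are assumptions for illustration, not the real
# _diagnose_dut() implementation.
class ExampleDeviceError(Exception):
    """DUT state is suspicious - report a device failure."""

class ExampleAbort(Exception):
    """DUT state is unknown or healthy - give up and abort."""

def _example_diagnose_dut(host, old_boot_id):
    if not host.is_up():                      # assumed reachability probe
        raise ExampleDeviceError("DUT went down during the client job")
    if host.get_boot_id() != old_boot_id:     # DUT rebooted unexpectedly
        raise ExampleDeviceError("DUT rebooted during the client job")
    # The DUT looks healthy, so the failure has no device-level explanation.
    raise ExampleAbort("client job ended abnormally on a healthy DUT")
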
def execute_control(self, timeout=None, client_disconnect_timeout=None):
    if not self.background:
        collector = log_collector(self.host, self.tag, self.results_dir)
        hostname = self.host.hostname
        remote_results = collector.client_results_dir
        local_results = collector.server_results_dir
        self.host.job.add_client_log(hostname, remote_results,
                                     local_results)
        job_record_context = self.host.job.get_record_context()

    section = 0
    start_time = time.time()

    logger = client_logger(self.host, self.tag, self.results_dir)
    try:
        while not timeout or time.time() < start_time + timeout:
            if timeout:
                section_timeout = start_time + timeout - time.time()
            else:
                section_timeout = None
            boot_id = self.host.get_boot_id()
            last = self.execute_section(section, section_timeout,
                                        logger, client_disconnect_timeout)
            if self.background:
                return
            section += 1
            if self.is_client_job_finished(last):
                logging.info("Client complete")
                return
            elif self.is_client_job_rebooting(last):
                try:
                    self._wait_for_reboot(boot_id)
                except error.AutotestRunError as e:
                    self.host.job.record("ABORT", None, "reboot", str(e))
                    self.host.job.record("END ABORT", None, None, str(e))
                    raise
                continue

            # If a test fails without probable cause, we try to bucket its
            # failure into one of two categories.  If we can determine the
            # current state of the device and it is suspicious, we close
            # the status lines indicating a failure.  If we either cannot
            # determine the state of the device, or it appears totally
            # healthy, we give up and abort.
            try:
                self._diagnose_dut(boot_id)
            except AutotestDeviceError as e:
                # The test's status lines are tailed into our log, with
                # indentation, from the client job on the DUT.  So if the
                # DUT goes down unexpectedly we'll end up with a malformed
                # status log unless we manually unwind the status stack.
                # Ideally we would write a wrapper like the server_job
                # methods run_reboot and run_group, but those expect
                # reboots and we don't.
                self.host.job.record('FAIL', None, None, str(e))
                self.host.job.record('END FAIL', None, None)
                self.host.job.record('END GOOD', None, None)
                self.host.job.failed_with_device_error = True
                return
            except AutotestAbort as e:
                self.host.job.record('ABORT', None, None, str(e))
                self.host.job.record('END ABORT', None, None)

                # give the client machine a chance to recover from a crash
                self.host.wait_up(
                        self.host.HOURS_TO_WAIT_FOR_RECOVERY * 3600)
                msg = ("Aborting - unexpected final status message from "
                       "client on %s: %s\n") % (self.host.hostname, last)
                raise error.AutotestRunError(msg)
    finally:
        logger.close()
        if not self.background:
            collector.collect_client_job_results()
            collector.remove_redundant_client_logs()
            state_file = os.path.basename(self.remote_control_file
                                          + '.state')
            state_path = os.path.join(self.results_dir, state_file)
            self.host.job.postprocess_client_state(state_path)
            self.host.job.remove_client_log(hostname, remote_results,
                                            local_results)
            job_record_context.restore()

    # should only get here if we timed out
    assert timeout
    raise error.AutotestTimeoutError()
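
# Illustrative sketch (not part of the original module): a caller would
# typically run execute_control() with an overall budget and translate the
# exceptions it raises into job records.  runner and record are hypothetical
# stand-ins; error refers to the same error module used throughout this file.
def _example_run_client_job(runner, record, timeout=None):
    try:
        runner.execute_control(timeout=timeout,
                               client_disconnect_timeout=1800)
    except error.AutotestTimeoutError:
        record("ABORT", None, None, "client job timed out")
        raise
    except error.AutotestRunError as e:
        record("ABORT", None, None, str(e))
        raise
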