class FastbootRun:
    """Power a board up, wait for its fastboot prompt, then push a boot image.

    Return values visible in this class: a non-zero status from one of the
    helper commands is handed straight back to the caller, and 2 accompanies
    every "restarting run..." message (presumably the caller retries on 2 --
    confirm against the caller).
    """

    def __init__(self, args, test_timeout):
        # args must provide .powerup (a shell command), .dev (handed to
        # SerialBuffer) and .fbserial (interpolated into the fastboot command).
        self.powerup = args.powerup
        self.ser = SerialBuffer(
            args.dev, "results/serial-output.txt", "R SERIAL> ")
        self.fastboot = (
            f"fastboot boot -s {args.fbserial} artifacts/fastboot.img")
        # Stored for later use; nothing in the visible code reads it.
        self.test_timeout = test_timeout

    def close(self):
        """Close the serial buffer."""
        self.ser.close()

    def print_error(self, message):
        """Print *message* wrapped in ANSI red / reset escape codes."""
        red, reset = '\033[0;31m', '\033[0m'
        print(red + message + reset)

    def logged_system(self, cmd, timeout=60):
        """Echo *cmd*, run it through the shell and return its exit status.

        If the command has not finished after *timeout* seconds an error is
        printed and 2 is returned instead.
        """
        print(f"Running '{cmd}'")
        try:
            status = subprocess.call(cmd, shell=True, timeout=timeout)
        except subprocess.TimeoutExpired:
            self.print_error("timeout, restarting run...")
            return 2
        return status

    def run(self):
        """Power up the board and boot it over fastboot.

        Returns the non-zero status of the power-up or fastboot command if
        either fails, or 2 if the bootloader crashes or never reaches the
        fastboot prompt.
        """
        status = self.logged_system(self.powerup)
        if status:
            return status

        prompt_patterns = (
            "fastboot: processing commands",
            "Listening for fastboot command on",
        )
        for text in self.ser.lines(timeout=2 * 60, phase="bootloader"):
            if any(re.search(pattern, text) for pattern in prompt_patterns):
                break
            if re.search("data abort", text):
                self.print_error(
                    "Detected crash during boot, restarting run...")
                return 2
        else:
            # The serial stream ended without ever showing a fastboot prompt.
            self.print_error(
                "Failed to get to fastboot prompt, restarting run...")
            return 2

        status = self.logged_system(self.fastboot)
        if status:
            return status
        # The visible definition stops here, so a successful fastboot command
        # leaves run() returning None.
        return None
class CrosServoRun:
    """Drive a test run on a board through two serial ports (CPU and EC).

    Lines from both serial buffers are merged into one queue by two feeder
    threads. run() reboots the board via the EC console, requests a network
    boot and then scans the merged output for a result or a known failure.

    run() returns 0 when the "hwci: mesa:" result line reports "pass", 1 for
    any other result, a kernel panic or a log that ends without a result, and
    2 alongside every "restarting run..." message (presumably the caller
    retries on 2 -- confirm against the caller).
    """

    # (pattern, message) pairs checked in order against every line of test
    # output; the first match prints the message and makes run() return 2.
    _RESTART_PATTERNS = (
        # There are very infrequent bus errors during power management
        # transitions on cheza, which we don't expect to be the case on
        # future boards.
        (r"Kernel panic - not syncing: Asynchronous SError Interrupt",
         "Detected cheza power management bus error, restarting run..."),
        # If the network device dies, it's probably not graphics's fault,
        # just try again.
        (r"NETDEV WATCHDOG",
         "Detected network device failure, restarting run..."),
        # These HFI response errors started appearing with the introduction
        # of piglit runs.  CosmicPenguin says:
        #
        # "message ID 106 isn't a thing, so likely what happened is that we
        # got confused when parsing the HFI queue.  If it happened on only
        # one run, then memory corruption could be a possible clue"
        #
        # Given that it seems to trigger randomly near a GPU fault and then
        # break many tests after that, just restart the whole run.
        # NOTE(review): the message below is the same text as the SError
        # case above; kept unchanged in case anything matches on it.
        (r"a6xx_hfi_send_msg.*Unexpected message id .* on the response queue",
         "Detected cheza power management bus error, restarting run..."),
        (r"coreboot.*bootblock starting",
         "Detected spontaneous reboot, restarting run..."),
        (r"arm-smmu 5040000.iommu: TLB sync timed out -- SMMU may be deadlocked",
         "Detected cheza MMU fail, restarting run..."),
    )

    def __init__(self, cpu, ec):
        # Merged FIFO for the two serial buffers, fed by threads.
        self.serial_queue = queue.Queue()
        # Unique marker each feeder thread enqueues once its input is done.
        self.sentinel = object()
        self.threads_done = 0

        self.ec_ser = SerialBuffer(
            ec, "results/serial-ec.txt", "R SERIAL-EC> ")
        self.cpu_ser = SerialBuffer(
            cpu, "results/serial.txt", "R SERIAL-CPU> ")

        self.iter_feed_ec = threading.Thread(
            target=self.iter_feed_queue,
            daemon=True,
            args=(self.ec_ser.lines(), ))
        self.iter_feed_ec.start()

        self.iter_feed_cpu = threading.Thread(
            target=self.iter_feed_queue,
            daemon=True,
            args=(self.cpu_ser.lines(), ))
        self.iter_feed_cpu.start()

    def close(self):
        """Close both serial buffers, then wait for the feeder threads."""
        self.ec_ser.close()
        self.cpu_ser.close()
        self.iter_feed_ec.join()
        self.iter_feed_cpu.join()

    # Feed lines from our serial queues into the merged queue, marking when our
    # input is done.
    def iter_feed_queue(self, it):
        for i in it:
            self.serial_queue.put(i)
        self.serial_queue.put(self.sentinel)

    # Return the next line from the queue, counting how many threads have
    # terminated and joining when done
    def get_serial_queue_line(self):
        line = self.serial_queue.get()
        # The sentinel is a unique object, so test identity rather than
        # equality.
        if line is self.sentinel:
            self.threads_done = self.threads_done + 1
            if self.threads_done == 2:
                self.iter_feed_cpu.join()
                self.iter_feed_ec.join()
        return line

    # Returns an iterator for getting the next line.  Iteration stops as soon
    # as a sentinel is dequeued, i.e. when either feeder thread finishes.
    def serial_queue_lines(self):
        return iter(self.get_serial_queue_line, self.sentinel)

    def ec_write(self, s):
        """Log *s* and write it, encoded, to the EC serial port."""
        print("W SERIAL-EC> %s" % s)
        self.ec_ser.serial.write(s.encode())

    def cpu_write(self, s):
        """Log *s* and write it, encoded, to the CPU serial port."""
        print("W SERIAL-CPU> %s" % s)
        self.cpu_ser.serial.write(s.encode())

    def print_error(self, message):
        """Print *message* wrapped in ANSI red / reset escape codes."""
        RED = '\033[0;31m'
        NO_COLOR = '\033[0m'
        print(RED + message + NO_COLOR)

    def run(self):
        """Reboot the board, request netboot and wait for the test result.

        See the class docstring for the meaning of the returned 0 / 1 / 2.
        """
        # Flush any partial commands in the EC's prompt, then ask for a reboot.
        self.ec_write("\n")
        self.ec_write("reboot\n")

        # This is emitted right when the bootloader pauses to check for input.
        # Emit a ^N character to request network boot, because we don't have a
        # direct-to-netboot firmware on cheza.
        for line in self.serial_queue_lines():
            if re.search(r"load_archive: loading locale_en.bin", line):
                self.cpu_write("\016")
                break

            # The Cheza boards have issues with failing to bring up power to
            # the system sometimes, possibly dependent on ambient temperature
            # in the farm.
            if re.search(r"POWER_GOOD not seen in time", line):
                self.print_error(
                    "Detected intermittent poweron failure, restarting run...")
                return 2

        tftp_failures = 0
        for line in self.serial_queue_lines():
            if re.search(r"---. end Kernel panic", line):
                return 1

            # The Cheza firmware seems to occasionally get stuck looping in
            # this error state during TFTP booting, possibly based on amount of
            # network traffic around it, but it'll usually recover after a
            # reboot.
            if re.search(r"R8152: Bulk read error 0xffffffbf", line):
                tftp_failures += 1
                if tftp_failures >= 100:
                    self.print_error(
                        "Detected intermittent tftp failure, restarting run..."
                    )
                    return 2

            for pattern, message in self._RESTART_PATTERNS:
                if re.search(pattern, line):
                    self.print_error(message)
                    return 2

            result = re.search(r"hwci: mesa: (\S*)", line)
            if result:
                return 0 if result.group(1) == "pass" else 1

        self.print_error(
            "Reached the end of the CPU serial log without finding a result")
        return 1
class CrosServoRun:
    """Drive a test run on a board through two serial ports (CPU and EC).

    The EC serial buffer shares the CPU buffer's line queue, so iterating
    cpu_ser.lines() yields output from both ports. run() reboots the board
    via the EC console, gets it through the bootloader and then scans the
    output for a result or a known failure.

    run() returns 0 when the "hwci: mesa:" result line reports "pass", 1 for
    any other result or a kernel panic, and 2 alongside every "restarting
    run..." message and when the log ends without a result (presumably the
    caller retries on 2 -- confirm against the caller).
    """

    # (pattern, message) pairs checked in order against every line of test
    # output; the first match prints the message and makes run() return 2.
    _RESTART_PATTERNS = (
        # There are very infrequent bus errors during power management
        # transitions on cheza, which we don't expect to be the case on
        # future boards.
        (r"Kernel panic - not syncing: Asynchronous SError Interrupt",
         "Detected cheza power management bus error, restarting run..."),
        # If the network device dies, it's probably not graphics's fault,
        # just try again.
        (r"NETDEV WATCHDOG",
         "Detected network device failure, restarting run..."),
        # These HFI response errors started appearing with the introduction
        # of piglit runs.  CosmicPenguin says:
        #
        # "message ID 106 isn't a thing, so likely what happened is that we
        # got confused when parsing the HFI queue.  If it happened on only
        # one run, then memory corruption could be a possible clue"
        #
        # Given that it seems to trigger randomly near a GPU fault and then
        # break many tests after that, just restart the whole run.
        # NOTE(review): the message below is the same text as the SError
        # case above; kept unchanged in case anything matches on it.
        (r"a6xx_hfi_send_msg.*Unexpected message id .* on the response queue",
         "Detected cheza power management bus error, restarting run..."),
        (r"coreboot.*bootblock starting",
         "Detected spontaneous reboot, restarting run..."),
        (r"arm-smmu 5040000.iommu: TLB sync timed out -- SMMU may be deadlocked",
         "Detected cheza MMU fail, restarting run..."),
    )

    def __init__(self, cpu, ec, test_timeout):
        self.cpu_ser = SerialBuffer(
            cpu, "results/serial.txt", "R SERIAL-CPU> ")
        # Merge the EC serial into the cpu_ser's line stream so that we can
        # effectively poll on both at the same time and not have to worry about
        # servicing the two ports separately.
        self.ec_ser = SerialBuffer(
            ec, "results/serial-ec.txt", "R SERIAL-EC> ",
            line_queue=self.cpu_ser.line_queue)
        # Passed as the timeout when reading the "test" phase output.
        self.test_timeout = test_timeout

    def close(self):
        """Close both serial buffers."""
        self.ec_ser.close()
        self.cpu_ser.close()

    def ec_write(self, s):
        """Log *s* and write it, encoded, to the EC serial port."""
        print("W SERIAL-EC> %s" % s)
        self.ec_ser.serial.write(s.encode())

    def cpu_write(self, s):
        """Log *s* and write it, encoded, to the CPU serial port."""
        print("W SERIAL-CPU> %s" % s)
        self.cpu_ser.serial.write(s.encode())

    def print_error(self, message):
        """Print *message* wrapped in ANSI red / reset escape codes."""
        RED = '\033[0;31m'
        NO_COLOR = '\033[0m'
        print(RED + message + NO_COLOR)

    def run(self):
        """Reboot the board, get through the bootloader and await the result.

        See the class docstring for the meaning of the returned 0 / 1 / 2.
        """
        # Flush any partial commands in the EC's prompt, then ask for a reboot.
        self.ec_write("\n")
        self.ec_write("reboot\n")

        bootloader_done = False
        # This is emitted right when the bootloader pauses to check for input.
        # Emit a ^N character to request network boot, because we don't have a
        # direct-to-netboot firmware on cheza.
        for line in self.cpu_ser.lines(timeout=120, phase="bootloader"):
            if re.search(r"load_archive: loading locale_en.bin", line):
                self.cpu_write("\016")
                bootloader_done = True
                break

            # If the board has a netboot firmware and we made it to booting the
            # kernel, proceed to processing of the test run.
            if re.search(r"Booting Linux", line):
                bootloader_done = True
                break

            # The Cheza boards have issues with failing to bring up power to
            # the system sometimes, possibly dependent on ambient temperature
            # in the farm.
            if re.search(r"POWER_GOOD not seen in time", line):
                self.print_error(
                    "Detected intermittent poweron failure, restarting run...")
                return 2

        if not bootloader_done:
            # Reported through print_error like every other failure path.
            self.print_error(
                "Failed to make it through bootloader, restarting run...")
            return 2

        tftp_failures = 0
        for line in self.cpu_ser.lines(timeout=self.test_timeout, phase="test"):
            if re.search(r"---. end Kernel panic", line):
                return 1

            # The Cheza firmware seems to occasionally get stuck looping in
            # this error state during TFTP booting, possibly based on amount of
            # network traffic around it, but it'll usually recover after a
            # reboot.
            if re.search(r"R8152: Bulk read error 0xffffffbf", line):
                tftp_failures += 1
                if tftp_failures >= 100:
                    self.print_error(
                        "Detected intermittent tftp failure, restarting run..."
                    )
                    return 2

            for pattern, message in self._RESTART_PATTERNS:
                if re.search(pattern, line):
                    self.print_error(message)
                    return 2

            result = re.search(r"hwci: mesa: (\S*)", line)
            if result:
                return 0 if result.group(1) == "pass" else 1

        self.print_error(
            "Reached the end of the CPU serial log without finding a result")
        return 2
class FastbootRun:
    """Power a board up, boot it over fastboot and watch serial for a result.

    run() returns 0 when the "hwci: mesa:" result line reports "pass", 1 for
    any other result, a kernel panic or a failing helper command, and 2
    alongside every "restarting run..." message (presumably the caller
    retries on 2 -- confirm against the caller).
    """

    def __init__(self, args):
        # args must provide .powerup (a shell command), .dev (handed to
        # SerialBuffer) and .fbserial (interpolated into the fastboot command).
        self.powerup = args.powerup
        # We would like something like a 1 minute timeout, but the piglit traces
        # jobs stall out for long periods of time.
        self.ser = SerialBuffer(
            args.dev, "results/serial-output.txt", "R SERIAL> ", timeout=600)
        self.fastboot = "fastboot boot -s {ser} artifacts/fastboot.img".format(
            ser=args.fbserial)

    def close(self):
        """Close the serial buffer."""
        self.ser.close()

    def print_error(self, message):
        """Print *message* wrapped in ANSI red / reset escape codes."""
        RED = '\033[0;31m'
        NO_COLOR = '\033[0m'
        print(RED + message + NO_COLOR)

    def logged_system(self, cmd):
        """Echo *cmd*, run it with os.system and return os.system's result."""
        print("Running '{}'".format(cmd))
        return os.system(cmd)

    def run(self):
        """Power up, boot over fastboot and scan serial output for a result.

        See the class docstring for the meaning of the returned 0 / 1 / 2.
        """
        if self.logged_system(self.powerup) != 0:
            return 1

        fastboot_ready = False
        for line in self.ser.lines():
            if re.search(r"fastboot: processing commands", line) or \
                    re.search(r"Listening for fastboot command on", line):
                fastboot_ready = True
                break

            if re.search(r"data abort", line):
                self.print_error(
                    "Detected crash during boot, restarting run...")
                return 2

        if not fastboot_ready:
            self.print_error(
                "Failed to get to fastboot prompt, restarting run...")
            return 2

        if self.logged_system(self.fastboot) != 0:
            return 1

        # -1 means no GPU hang seen yet; once a hang is seen this counts down
        # the remaining lines to let through before giving up with 2.
        print_more_lines = -1
        for line in self.ser.lines():
            if print_more_lines == 0:
                return 2
            if print_more_lines > 0:
                print_more_lines -= 1

            if re.search(r"---. end Kernel panic", line):
                return 1

            # The db820c boards intermittently reboot.  Just restart the run
            # if we see a reboot after we got past fastboot.
            if re.search(r"PON REASON", line):
                self.print_error(
                    "Detected spontaneous reboot, restarting run...")
                return 2

            # db820c sometimes wedges around iommu fault recovery
            if re.search(r"watchdog: BUG: soft lockup - CPU.* stuck", line):
                self.print_error(
                    "Detected kernel soft lockup, restarting run...")
                return 2

            # If the network device dies, it's probably not graphics's fault,
            # just try again.
            if re.search(r"NETDEV WATCHDOG", line):
                self.print_error(
                    "Detected network device failure, restarting run...")
                return 2

            # A3xx recovery doesn't quite work. Sometimes the GPU will get
            # wedged and recovery will fail (because power can't be reset?)
            # This assumes that the jobs are sufficiently well-tested that GPU
            # hangs aren't always triggered, so just try again. But print some
            # more lines first so that we get better information on the cause
            # of the hang. Once a hang happens, it's pretty chatty.
            if "[drm:adreno_recover] *ERROR* gpu hw init failed: -22" in line:
                self.print_error("Detected GPU hang, restarting run...")
                if print_more_lines == -1:
                    print_more_lines = 30

            result = re.search(r"hwci: mesa: (\S*)", line)
            if result:
                return 0 if result.group(1) == "pass" else 1

        self.print_error(
            "Reached the end of the CPU serial log without finding a result, restarting run..."
        )
        return 2