def fetch_logs(job, max_idle_time, log_follower) -> None:
    """Poll the job once for new log lines and print the parsed result.

    A prolonged period of silence is taken to mean that the device has died
    and that the job should be tried again.

    Args:
        job: Job being followed. Its ``last_log_time`` and ``job_id`` are
            read, and ``get_logs()``, ``heartbeat()`` and
            ``parse_job_result_from_log()`` are called on it.
        max_idle_time: ``timedelta`` of silence tolerated before giving up.
        log_follower: Receives the raw lines through ``feed()`` and hands
            back the processed ones through ``flush()``.

    Raises:
        MesaCITimeoutError: The time elapsed since ``job.last_log_time``
            exceeds ``max_idle_time``.
        MesaCIParseException: All five log fetching attempts failed with
            that same exception.
    """
    silence = datetime.now() - job.last_log_time
    if silence > max_idle_time:
        max_idle_time_min = max_idle_time.total_seconds() / 60
        timeout_message = (
            f"{CONSOLE_LOG['BOLD']}"
            f"{CONSOLE_LOG['FG_YELLOW']}"
            f"LAVA job {job.job_id} does not respond for {max_idle_time_min} "
            "minutes. Retry."
            f"{CONSOLE_LOG['RESET']}"
        )
        raise MesaCITimeoutError(
            timeout_message,
            timeout_duration=max_idle_time,
        )

    time.sleep(LOG_POLLING_TIME_SEC)

    # The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
    # Retry the log fetching several times before exposing the error.
    for _attempt in range(5):
        try:
            fresh_lines = job.get_logs()
        except MesaCIParseException:
            continue
        break
    else:
        # Every attempt failed to parse; stop hiding the problem.
        raise MesaCIParseException

    received_data = log_follower.feed(fresh_lines)
    if received_data:
        # Non-empty log data is proof that the device is still alive.
        job.heartbeat()

    flushed_lines = log_follower.flush()
    for parsed_line in job.parse_job_result_from_log(flushed_lines):
        print_log(parsed_line)
def print_job_final_status(job):
    """Print the final status of ``job`` using the colour mapped to it.

    A job whose status is still ``"running"`` at this point is relabelled
    ``"hung"`` on the job object itself before the status is printed.
    Statuses missing from ``LAVAJob.COLOR_STATUS_MAP`` are printed with
    ``CONSOLE_LOG["FG_RED"]``.

    Args:
        job: Job whose ``status`` attribute is read and possibly rewritten.
    """
    still_running = job.status == "running"
    if still_running:
        job.status = "hung"

    color = LAVAJob.COLOR_STATUS_MAP.get(job.status, CONSOLE_LOG["FG_RED"])
    status_message = (
        f"{color}"
        f"LAVA Job finished with status: {job.status}"
        f"{CONSOLE_LOG['RESET']}"
    )
    print_log(status_message)
def retriable_follow_job(proxy, job_definition) -> LAVAJob:
    """Run the LAVA job, creating a fresh job for every new attempt.

    One initial attempt plus ``NUMBER_OF_RETRIES_TIMEOUT_DETECTION`` retries
    are made. Each attempt builds a new ``LAVAJob`` from the same definition
    and follows it with ``follow_job_execution``.

    Args:
        proxy: Passed through to the ``LAVAJob`` constructor.
        job_definition: Job definition passed to the ``LAVAJob`` constructor.

    Returns:
        The job of the first attempt that ``follow_job_execution`` completed
        without raising.

    Raises:
        MesaCIRetryError: Every attempt ended in a handled exception.
        KeyboardInterrupt: Re-raised after the current job is cancelled.
    """
    retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
    total_attempts = retry_count + 1

    for attempt_no in range(1, total_attempts + 1):
        job = LAVAJob(proxy, job_definition)
        try:
            follow_job_execution(job)
            return job
        except MesaCIKnownIssueException as known_issue:
            # Only the local status is changed here; job.cancel() is not
            # called for known issues.
            print_log(known_issue)
            job.status = "canceled"
        except MesaCIException as failure:
            print_log(failure)
            job.cancel()
        except KeyboardInterrupt as interrupt:
            print_log("LAVA job submitter was interrupted. Cancelling the job.")
            job.cancel()
            raise interrupt
        finally:
            # Runs for every attempt: success, handled failure or interrupt.
            attempt_summary = (
                f"{CONSOLE_LOG['BOLD']}"
                f"Finished executing LAVA job in the attempt #{attempt_no}"
                f"{CONSOLE_LOG['RESET']}"
            )
            print_log(attempt_summary)
            print_job_final_status(job)

    exhausted_message = (
        f"{CONSOLE_LOG['BOLD']}"
        f"{CONSOLE_LOG['FG_RED']}"
        "Job failed after it exceeded the number of "
        f"{retry_count} retries."
        f"{CONSOLE_LOG['RESET']}"
    )
    raise MesaCIRetryError(exhausted_message, retry_count=retry_count)
def setup_lava_proxy():
    """Build an XML-RPC proxy from the ``"default"`` lavacli configuration.

    The ``username`` and ``token`` configuration entries are embedded in the
    server URL as ``scheme://username:token@netloc/path``. The transport is
    created with the configured ``proxy``, a ``timeout`` defaulting to 120.0
    and ``verify_ssl_cert`` defaulting to ``True``.

    Returns:
        The ``xmlrpc.client.ServerProxy`` bound to the credentialed URL.
    """
    config = lavacli.load_config("default")

    server_uri = config.get("uri")
    username = config.get("username")
    token = config.get("token")

    parsed_uri = urllib.parse.urlparse(server_uri)
    url_template = "{}://{}:{}@{}{}"
    credentialed_url = url_template.format(
        parsed_uri.scheme,
        username,
        token,
        parsed_uri.netloc,
        parsed_uri.path,
    )

    http_proxy = config.get("proxy")
    timeout = config.get("timeout", 120.0)
    verify_ssl_cert = config.get("verify_ssl_cert", True)
    transport = lavacli.RequestsTransport(
        parsed_uri.scheme,
        http_proxy,
        timeout,
        verify_ssl_cert,
    )

    proxy = xmlrpc.client.ServerProxy(
        credentialed_url,
        allow_none=True,
        transport=transport,
    )

    # Log the URI as configured, i.e. without the embedded credentials.
    print_log("Proxy for {} created.".format(config['uri']))

    return proxy
def follow_job_execution(job):
    """Submit ``job``, wait for it to start and follow its logs until done.

    The steps are, in order: submit the job, poll ``job.is_started()`` until
    it is true, open the "LAVA boot" log section, poll the logs with
    ``fetch_logs`` while ``job.is_finished`` is false, print the job data and
    finally look for a LAVA error when the status is neither ``"pass"`` nor
    ``"fail"``.

    Args:
        job: Job to run. ``submit()``, ``is_started()`` and ``heartbeat()``
            are called on it; ``job_id``, ``is_finished`` and ``status`` are
            read.

    Raises:
        MesaCIException: ``job.submit()`` raised any ``Exception``; the
            original exception is chained as the cause.
    """
    try:
        job.submit()
    except Exception as mesa_ci_err:
        raise MesaCIException(
            f"Could not submit LAVA job. Reason: {mesa_ci_err}"
        ) from mesa_ci_err

    print_log(f"Waiting for job {job.job_id} to start.")
    while not job.is_started():
        time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
    print_log(f"Job {job.job_id} started.")

    gl = GitlabSection(
        id="lava_boot",
        header="LAVA boot",
        type=LogSectionType.LAVA_BOOT,
        start_collapsed=True,
    )
    print(gl.start())

    # Computed once; it was previously assigned a second time, with the very
    # same value, inside the ``with`` block below.
    max_idle_time = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
    with LogFollower(current_section=gl) as lf:
        # Start to check job's health
        job.heartbeat()
        while not job.is_finished:
            fetch_logs(job, max_idle_time, lf)

    show_job_data(job)

    # Mesa Developers expect to have a simple pass/fail job result.
    # If this does not happen, it probably means a LAVA infrastructure error
    # happened.
    if job.status not in ["pass", "fail"]:
        find_lava_error(job)
job_definition = generate_lava_yaml(args) if args.dump_yaml: with GitlabSection( "yaml_dump", "LAVA job definition (YAML)", type=LogSectionType.LAVA_BOOT, start_collapsed=True, ): print(hide_sensitive_data(job_definition)) job = LAVAJob(proxy, job_definition) if errors := job.validate(): fatal_err(f"Error in LAVA job definition: {errors}") print_log("LAVA job definition validated successfully") if args.validate_only: return finished_job = retriable_follow_job(proxy, job_definition) exit_code = 0 if finished_job.status == "pass" else 1 sys.exit(exit_code) def create_parser(): parser = argparse.ArgumentParser("LAVA job submitter") parser.add_argument("--pipeline-info") parser.add_argument("--rootfs-url-prefix") parser.add_argument("--kernel-url-prefix")