def wait(self) -> None:
    if self.wait_for_running:
        wait(self.is_running)
        self.logger.info("tasks started")

    if self.to_monitor:
        wait(self.has_files)
        self.logger.info("new files found")
def check_repro(self) -> bool:
    self.logger.info("checking repros")
    return wait(self.check_repro_impl)
def check_jobs(self) -> bool:
    """ Check all of the integration jobs """
    self.logger.info("checking jobs")
    return wait(self.check_jobs_impl)
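# The two wrappers above hand a checker to the module-level `wait`
# helper defined elsewhere in this file. The contract assumed throughout
# is that the checker returns (done, status_message, success) and `wait`
# polls it until `done` is True, then returns `success`. A minimal
# sketch under that assumption (not the actual helper):
#
#     import time
#     from typing import Callable, Tuple
#
#     def wait(
#         func: Callable[[], Tuple[bool, str, bool]], frequency: float = 1.0
#     ) -> bool:
#         while True:
#             done, message, success = func()  # one polling pass
#             if done:
#                 return success
#             print(message, end="\r")  # overwrite the status line
#             time.sleep(frequency)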
def check_logs_for_errors(self) -> None:
    # only check for errors that exist between the start and stop markers.
    # also, only check the most recent 100,000 entries within the last 3
    # hours. The records are scanned through in reverse chronological
    # order.

    self.inject_log(self.stop_log_marker)
    wait(self.check_log_end_marker, frequency=5.0)
    self.logger.info("application insights log flushed")

    logs = self.of.debug.logs.keyword("error", limit=100000, timespan="PT3H")
    seen_errors = False
    seen_stop = False
    for entry in logs:
        message = entry.get("message", "")
        if not seen_stop:
            if self.stop_log_marker in message:
                seen_stop = True
            continue

        if self.start_log_marker in message:
            break

        # ignore logging.info coming from Azure Functions
        if entry.get("customDimensions", {}).get("LogLevel") == "Information":
            continue

        # ignore warnings coming from the rust code, only be concerned
        # about errors
        if (
            entry.get("severityLevel") == 2
            and entry.get("sdkVersion") == "rust:0.1.5"
        ):
            continue

        # ignore resource not found warnings from azure-functions layer,
        # which relate to azure-retry issues
        if (
            message.startswith("Client-Request-ID=")
            and "ResourceNotFound" in message
            and entry.get("sdkVersion", "").startswith("azurefunctions")
        ):
            continue

        # ignore analyzer output, as we can't control what applications
        # being fuzzed send to stdout or stderr. (most importantly, cdb
        # prints "Symbol Loading Error Summary")
        if message.startswith("process (stdout) analyzer:") or message.startswith(
            "process (stderr) analyzer:"
        ):
            continue

        # TODO: ignore queue errors until tasks are shut down before
        # deleting queues https://github.com/microsoft/onefuzz/issues/141
        if (
            "storage queue pop failed" in message
            or "storage queue delete failed" in message
        ) and entry.get("sdkVersion") == "rust:0.1.5":
            continue

        if message is None:
            self.logger.error("error log: %s", entry)
        else:
            self.logger.error("error log: %s", message)
        seen_errors = True

    if seen_errors:
        raise Exception("logs included errors")
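# A worked illustration of the marker-window scan above, using
# hypothetical messages: entries arrive newest-first, so everything is
# skipped until the stop marker appears, and the scan ends once the
# start marker is reached.
#
#     entries = ["post-stop noise", "STOP", "real error", "START", "stale"]
#     seen, seen_stop = [], False
#     for message in entries:
#         if not seen_stop:
#             if "STOP" in message:
#                 seen_stop = True
#             continue
#         if "START" in message:
#             break
#         seen.append(message)
#     assert seen == ["real error"]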
def check_repro(self, repros: Dict[UUID, Tuple[Job, Repro]]) -> bool:
    self.logger.info("checking repros")
    self.success = True

    def check_repro_impl() -> Tuple[bool, str, bool]:
        # check all of the launched repros

        self.cleared = False

        def clear() -> None:
            if not self.cleared:
                self.cleared = True
                print("")

        commands: Dict[OS, Tuple[str, str]] = {
            OS.windows: ("r rip", r"^rip=[a-f0-9]{16}"),
            OS.linux: ("info reg rip", r"^rip\s+0x[a-f0-9]+\s+0x[a-f0-9]+"),
        }

        for (job, repro) in list(repros.values()):
            repros[job.job_id] = (job, self.of.repro.get(repro.vm_id))

        for (job, repro) in list(repros.values()):
            if repro.error:
                clear()
                self.logger.error(
                    "repro failed: %s: %s",
                    job.config.name,
                    repro.error,
                )
                self.of.repro.delete(repro.vm_id)
                del repros[job.job_id]
            elif repro.state == VmState.running:
                try:
                    result = self.of.repro.connect(
                        repro.vm_id,
                        delete_after_use=True,
                        debug_command=commands[repro.os][0],
                    )
                    if result is not None and re.search(
                        commands[repro.os][1], result, re.MULTILINE
                    ):
                        clear()
                        self.logger.info("repro succeeded: %s", job.config.name)
                    else:
                        clear()
                        self.logger.error(
                            "repro failed: %s - %s", job.config.name, result
                        )
                except Exception as err:
                    clear()
                    self.logger.error(
                        "repro failed: %s - %s", job.config.name, err
                    )
                del repros[job.job_id]
            elif repro.state not in [VmState.init, VmState.extensions_launch]:
                self.logger.error(
                    "repro failed: %s - bad state: %s",
                    job.config.name,
                    repro.state,
                )
                del repros[job.job_id]

        repro_states: Dict[str, List[str]] = {}
        for (job, repro) in repros.values():
            if repro.state.name not in repro_states:
                repro_states[repro.state.name] = []
            repro_states[repro.state.name].append(job.config.name)

        logline = []
        for state in repro_states:
            logline.append("%s:%s" % (state, ",".join(repro_states[state])))

        msg = "waiting repro: %s" % " ".join(logline)
        if len(msg) > 80:
            msg = "waiting on %d repros" % len(repros)
        return (not bool(repros), msg, self.success)

    return wait(check_repro_impl)
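# The running-VM branch above decides success by grepping the debug
# command's output for the instruction pointer. A hypothetical cdb
# transcript shows why re.MULTILINE matters: the `rip=` line sits in
# the middle of multi-line output, so `^` must anchor at line starts.
#
#     import re
#     sample = "0:000> r rip\nrip=00007ff65a2b1000\n0:000> quit\n"
#     assert re.search(r"^rip=[a-f0-9]{16}", sample, re.MULTILINE)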
def check_jobs(
    self, poll: bool = False, stop_on_complete_check: bool = False
) -> bool:
    """ Check all of the integration jobs """

    jobs: Dict[UUID, Job] = {x.job_id: x for x in self.get_jobs()}
    job_tasks: Dict[UUID, List[Task]] = {}
    check_containers: Dict[UUID, Dict[Container, Tuple[ContainerWrapper, int]]] = {}

    for job in jobs.values():
        if job.config.name not in TARGETS:
            self.logger.error("unknown job target: %s", job.config.name)
            continue

        tasks = self.of.jobs.tasks.list(job.job_id)
        job_tasks[job.job_id] = tasks
        check_containers[job.job_id] = {}
        for task in tasks:
            for container in task.config.containers:
                if container.type in TARGETS[job.config.name].wait_for_files:
                    count = TARGETS[job.config.name].wait_for_files[container.type]
                    check_containers[job.job_id][container.name] = (
                        ContainerWrapper(
                            self.of.containers.get(container.name).sas_url
                        ),
                        count,
                    )

    self.success = True
    self.logger.info("checking %d jobs", len(jobs))

    self.cleared = False

    def clear() -> None:
        if not self.cleared:
            self.cleared = True
            if poll:
                print("")

    def check_jobs_impl() -> Tuple[bool, str, bool]:
        self.cleared = False
        failed_jobs: Set[UUID] = set()
        job_task_states: Dict[UUID, Set[TaskTestState]] = {}

        for job_id in check_containers:
            finished_containers: Set[Container] = set()
            for (container_name, container_impl) in check_containers[
                job_id
            ].items():
                container_client, count = container_impl
                if len(container_client.list_blobs()) >= count:
                    clear()
                    self.logger.info(
                        "found files for %s - %s",
                        jobs[job_id].config.name,
                        container_name,
                    )
                    finished_containers.add(container_name)

            for container_name in finished_containers:
                del check_containers[job_id][container_name]

        scalesets = self.of.scalesets.list()
        for job_id in job_tasks:
            finished_tasks: Set[UUID] = set()
            job_task_states[job_id] = set()

            for task in job_tasks[job_id]:
                if job_id not in jobs:
                    continue

                task_result = self.check_task(jobs[job_id], task, scalesets)
                if task_result == TaskTestState.failed:
                    self.success = False
                    failed_jobs.add(job_id)
                elif task_result == TaskTestState.stopped:
                    finished_tasks.add(task.task_id)
                else:
                    job_task_states[job_id].add(task_result)
            job_tasks[job_id] = [
                x for x in job_tasks[job_id] if x.task_id not in finished_tasks
            ]

        to_remove: Set[UUID] = set()
        for job in jobs.values():
            # stop tracking failed jobs
            if job.job_id in failed_jobs:
                if job.job_id in check_containers:
                    del check_containers[job.job_id]
                if job.job_id in job_tasks:
                    del job_tasks[job.job_id]
                continue

            # stop checking containers once all the containers for the job
            # have checked out.
            if job.job_id in check_containers:
                if not check_containers[job.job_id]:
                    clear()
                    self.logger.info(
                        "found files in all containers for %s", job.config.name
                    )
                    del check_containers[job.job_id]

            if job.job_id not in check_containers:
                if job.job_id in job_task_states:
                    if set([TaskTestState.running]).issuperset(
                        job_task_states[job.job_id]
                    ):
                        del job_tasks[job.job_id]

            if job.job_id not in job_tasks and job.job_id not in check_containers:
                clear()
                self.logger.info("%s completed", job.config.name)
                to_remove.add(job.job_id)

        for job_id in to_remove:
            if stop_on_complete_check:
                self.stop_job(jobs[job_id])
            del jobs[job_id]

        msg = "waiting on: %s" % ",".join(
            sorted(x.config.name for x in jobs.values())
        )
        if poll and len(msg) > 80:
            msg = "waiting on %d jobs" % len(jobs)

        if not jobs:
            msg = "done all tasks"

        return (not bool(jobs), msg, self.success)

    if poll:
        return wait(check_jobs_impl)
    else:
        _, msg, result = check_jobs_impl()
        self.logger.info(msg)
        return result
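# Hypothetical call sites for the two modes of check_jobs: with
# poll=True the call blocks in `wait` until every job completes or
# fails (optionally stopping each job as it finishes); with the
# default poll=False it performs a single pass and logs the status.
#
#     checker.check_jobs(poll=True, stop_on_complete_check=True)
#     checker.check_jobs()  # one-shot status check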