async def count_running_kernels(self):
    """
    Http-probe this container's jupyter server and update:

    * self.nb_kernels - the number of currently running kernels
    * self.last_activity - an epoch (seconds); 0 when no kernel is
      attached, or None when the server could not be queried
      (e.g. an old jupyter, or a container still warming up)
    """
    port = self.port_number()
    if not port:
        # no port, nothing we can probe
        return
    url = f"http://localhost:{port}/{port}/api/kernels?token={self.name}"
    self.last_activity = None
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                raw_answer = await response.text()
                kernels_info = json.loads(raw_answer)
                self.nb_kernels = len(kernels_info)
                # an empty kernel list amounts to no activity at all
                self.last_activity = max(
                    (self.last_time(kernel_info)
                     for kernel_info in kernels_info),
                    default=0)
    except ClientConnectionError as _exc:
        # this somehow tends to happen a lot sometimes
        # until we figure it out, let's make it less conspicuous
        logger.info(f"could not reach warming up {url} for last activity")
    except Exception:
        logger.exception(f"Cannot probe number of kernels with {self} - unhandled exception")
def run_forever(self):
    """
    Main monitor loop: run one cycle (run_once) every self.period seconds.

    One cycle can take some time as all the jupyters need to be
    http-probed, so the time to sleep is computed against an absolute
    schedule (tick) rather than a fixed delay, to avoid drifting.
    Never returns.
    """
    tick = time.time()
    logger.info("nbh-monitor is starting up")
    # record initial stats for all known courses right away
    for c in CourseDir.objects.all():
        Stats(c.coursename).record_monitor_known_counts_line()
    while True:
        try:
            self.run_once()
        # just be extra sure a glitch in one cycle
        # doesn't crash the whole monitor
        except Exception:
            # was an f-string with no placeholder; plain string is the idiom
            logger.exception("Unexpected error")
        tick += self.period
        # if the cycle overran the period, don't sleep at all
        duration = max(0, int(tick - time.time()))
        logger.info(f"monitor is waiting for {duration}s")
        time.sleep(duration)
def _scan_containers(self, figures_by_course):
    """
    Scan all podman containers; for each nbhosting container
    (named <coursename>-x-<student>) attach it to its course's
    figures, then run one monitoring pass (co_run) over all of
    them in a single event loop.

    Side effects: fills figures_by_course (dict coursename ->
    CourseFigures) and updates self.system_containers /
    self.system_kernels totals.
    """
    # initialize all known courses - we want data on all courses
    # even if they don't run any container yet
    logger.info(f"monitor cycle with period={self.period//60}' "
                f"idle={self.idle//60}' "
                f"lingering={self.lingering//3600}h")
    # coursename -> image hash (may be None if misconfigured)
    hash_by_course = {c.coursename : c.image_hash()
                      for c in CourseDir.objects.all()}
    with podman.ApiConnection(podman_url) as podman_api:
        # returns None when no container is found !
        containers = podman.containers.list_containers(podman_api, all=True) or []
    logger.info(f"found {len(hash_by_course)} courses "
                f"and {len(containers)} containers")
    monitoreds = []
    for container in containers:
        try:
            name = container['Names'][0]
            # nbhosting containers are named <coursename>-x-<student>;
            # anything else raises ValueError on unpacking
            coursename, student = name.split('-x-')
            figures_by_course.setdefault(coursename, CourseFigures())
            figures = figures_by_course[coursename]
            # may be None if s/t is misconfigured
            image_hash = hash_by_course[coursename] \
                or f"hash not found for course {coursename}"
            monitoreds.append(MonitoredJupyter(
                container, coursename, student, figures, image_hash))
        # typically non-nbhosting containers
        except ValueError:
            # ignore this container as we don't even know
            # in what course it belongs
            logger.info(f"ignoring non-nbhosting {container}")
        except KeyError:
            # typically hash_by_course[coursename] is failing
            # this may happen when a course gets outdated
            logger.info(f"ignoring container {container} - "
                        f"can't find image hash for {coursename}")
        except Exception:
            logger.exception(f"monitor has to ignore {container}")
    # run the whole stuff - probe all containers concurrently
    futures = [mon.co_run(self.idle, self.lingering)
               for mon in monitoreds]
    #asyncio.run(asyncio.gather(*futures))
    # NOTE(review): get_event_loop().run_until_complete is deprecated
    # in recent Pythons - candidate for asyncio.run() with a wrapper coro
    asyncio.get_event_loop().run_until_complete(
        asyncio.gather(*futures))
    self.system_containers = len(monitoreds)
    # nb_kernels may be None when a container could not be probed
    self.system_kernels = sum((mon.nb_kernels or 0) for mon in monitoreds)
async def _co_run(self, idle, lingering):
    """
    Monitor one container and decide whether to spare, kill or
    remove it; both timeouts in seconds:

    * idle: kill a container whose last kernel activity is older
    * lingering: kill a container created longer ago, regardless
      of activity
    """
    now = time.time()
    self.reload()
    # inspection remains None on InternalServerError
    if self.inspection is None:
        logger.info(f"BLIP weirdo (0) {self.name} - cannot inspect - ignored")
        return
    state = self.inspection['State']['Status']
    # a container left in these states is just garbage - remove it
    if state in ('stopped', 'configured'):
        logger.info(f"BLIP weirdo (1) {self.name} - removing")
        logger.info(f"BLIP weirdo (1) detailed state was {self.inspection['State']}")
        self.remove_container()
        return
    # ignore non running containers
    if state != 'running':
        logger.info(f"BLIP weirdo (2) {self.name} - ignoring")
        logger.info(f"BLIP weirdo (2) detailed state was {self.inspection['State']}")
        return
    # count number of kernels and last activity
    await self.count_running_kernels()
    # last_activity may be 0 if no kernel is running inside that container
    # or None if we could not determine it properly
    if self.last_activity is None:
        # an unreachable container may be one that is just taking off
        # as unlikely as that sounds, it actually tends to happen much more
        # often than I at least had foreseen at first
        logger.info(f"unreachable (1) {self} - will try again in {GRACE}s")
        await asyncio.sleep(GRACE)
        # second chance - probe again after the grace delay
        await self.count_running_kernels()
        if self.last_activity is None:
            logger.info(f"Killing unreachable (2) {self}")
            self.kill_container()
            return
    # check there has been activity in the last grace_idle_in_minutes
    # NOTE(review): 'now' predates the GRACE sleep above, so the idle
    # computation can be off by up to GRACE seconds - presumably harmless
    idle_minutes = (int)((now - self.last_activity) // 60)
    if (now - self.last_activity) < idle:
        # recent activity: spare it and account for it in the figures
        logger.debug(
            f"Sparing running {self} that had activity {idle_minutes} mn ago")
        self.figures.count_container(True, self.nb_kernels)
    elif self.last_activity == 0:
        # no kernel at all: give it one grace period before killing,
        # as it may be a user session that is still starting up
        logger.info(f"running and empty (1) {self} - will try again in {GRACE}s")
        await asyncio.sleep(GRACE)
        await self.count_running_kernels()
        if self.last_activity == 0:
            logger.info(
                f"Killing (running and empty) (2) {self} "
                f"that has no kernel attached")
            self.kill_container()
            return
    else:
        # some activity, but older than the idle timeout
        logger.info(
            f"Killing (running & idle) {self} "
            f"that has been idle for {idle_minutes} mn")
        self.kill_container()
        return
    # if students accidentally leave stuff running in the background
    # last_activity may be misleading
    # so we kill all containers older than <lingering>
    # the unit here is seconds but the front CLI has it in hours
    created_time = self.creation_time()
    ellapsed = int(now - created_time)
    if ellapsed > lingering:
        created_days = (int)(ellapsed // (24 * 3600))
        created_hours = (int)((ellapsed // 3600) % 24)
        logger.warning(
            f"Removing lingering {self} "
            f"that was created {created_days} days "
            f"{created_hours} hours ago (idle_minutes={idle_minutes})")
        self.kill_container()
        return