async def count_running_kernels(self):
    """
    probe the jupyter server behind this container
    through its /api/kernels endpoint

    updates:
    * self.nb_kernels with the number of kernels reported
    * self.last_activity - a epoch/timestamp/nb of seconds
      it is reset to None before probing and stays None
      if the probe fails; it is 0 when no kernel is reported

    returns None; does nothing at all (not even the reset)
    when no port number can be found for the container
    """
    port = self.port_number()
    if not port:
        return

    url = f"http://localhost:{port}/{port}/api/kernels?token={self.name}"
    self.last_activity = None
    try:
        async with aiohttp.ClientSession() as session, \
                session.get(url) as response:
            payload = await response.text()
            kernels = json.loads(payload)
            self.nb_kernels = len(kernels)
            # an empty sequence (no kernel) means no activity, hence default=0
            self.last_activity = max(
                (self.last_time(kernel) for kernel in kernels),
                default=0)
    except ClientConnectionError:
        # this somehow tends to happen a lot sometimes
        # until we figure it out, let's make it less conspicuous
        logger.info(f"could not reach warming up {url} for last activity")
    except Exception:
        logger.exception(f"Cannot probe number of kernels with {self} - unhandled exception")
def last_time(kernel_data):
    """
    compute a comparable time (suitable for max) at which
    one kernel was last seen doing something

    kernel_data is what /api/kernels returns for one kernel, e.g.:
      {'connections': 1,
       'execution_state': 'idle',
       'id': '15be5b4c-b5f2-46f0-9a9b-ff54f4495cb4',
       'last_activity': '2018-02-19T12:58:25.204761Z',
       'name': 'python3'}

    Notes:
    * kernels with connections = 0 must not be disregarded:
      it is important to keep those alive, as this does not
      indicate a lack of activity
    * last_activity format: some items were found where the
      milliseconds part was simply not present (at all,
      i.e. not exposed as .0 or anything)
    * if anything goes wrong, the current time is returned,
      as a timestamp that means 'now' is safer than the epoch
    """
    try:
        return MonitoredJupyter.parse_time(kernel_data['last_activity'])
    except Exception:
        logger.exception(f"last_time failed with kernel_data = {kernel_data}")
        # to stay on the safe side, pretend the kernel was active just now
        return time.time()
async def co_run(self, idle, lingering):
    """
    exception-proof wrapper around self._co_run(idle, lingering)

    any Exception raised by _co_run is logged, with its traceback,
    and then ignored; this coroutine always returns None
    """
    try:
        await self._co_run(idle, lingering)
    except Exception as exc:
        # xx used to be a simple error but until pip podman 3.x is settled
        # it's probably best like this
        kind = type(exc)
        logger.exception(
            f"unexpected error {kind} "
            f"when dealing with {self.name} - ignored\n...exception={exc}")
def run_once(self):
    """
    run one monitor cycle through self._run_once() and return its result

    no exception escapes: failures are logged and None is returned
    * a podman InternalServerError is reported with a traceback
      only when sitesettings.DEBUG is set, as a plain error otherwise
    * any other Exception is always logged with its traceback
    """
    try:
        return self._run_once()
    except podman.errors.InternalServerError as exc:
        message = f"{exc} - skipping rest of monitor cycle"
        if sitesettings.DEBUG:
            logger.exception(message)
        else:
            logger.error(message)
    except Exception:
        logger.exception(
            "Something wrong happened during monitor cycle - skipping")
    return None
def _scan_containers(self, figures_by_course):
    """
    list all podman containers, wrap the nbhosting ones in
    MonitoredJupyter objects, and run co_run() on all of them

    figures_by_course: dict coursename -> CourseFigures, filled in place;
      a CourseFigures entry is created for each course for which
      a container is found and that has no entry yet

    side effects:
    * self.system_containers is set to the number of monitored containers
    * self.system_kernels is set to the sum of their nb_kernels
      (a None count is taken as 0)

    a container is skipped, with a log line, when
    * its name is not of the form <coursename>-x-<student>
    * its course has no entry in the image hashes
    * anything else goes wrong while setting it up
    """
    logger.info(f"monitor cycle with period={self.period//60}' "
                f"idle={self.idle//60}' "
                f"lingering={self.lingering//3600}h")

    hash_by_course = {c.coursename : c.image_hash()
                      for c in CourseDir.objects.all()}

    with podman.ApiConnection(podman_url) as podman_api:
        # returns None when no container is found !
        containers = podman.containers.list_containers(podman_api, all=True) or []
    logger.info(f"found {len(hash_by_course)} courses "
                f"and {len(containers)} containers")

    monitoreds = []
    for container in containers:
        # step 1: figure out course and student from the container name
        # this is done in its own try block so that the handlers further
        # down can safely refer to coursename
        try:
            name = container['Names'][0]
            coursename, student = name.split('-x-')
        except ValueError:
            # typically non-nbhosting containers
            # ignore this container as we don't even know
            # in what course it belongs
            logger.info(f"ignoring non-nbhosting {container}")
            continue
        except Exception:
            # e.g. no usable 'Names' entry at all
            logger.exception(f"monitor has to ignore {container}")
            continue

        # step 2: attach figures and image hash, and create the monitored object
        try:
            figures = figures_by_course.setdefault(coursename, CourseFigures())
            if coursename not in hash_by_course:
                # this may happen when a course gets outdated
                logger.info(f"ignoring container {container} - "
                            f"can't find image hash for {coursename}")
                continue
            # may be None if s/t is misconfigured
            image_hash = hash_by_course[coursename] \
                or f"hash not found for course {coursename}"
            monitoreds.append(MonitoredJupyter(
                container, coursename, student, figures, image_hash))
        except Exception:
            logger.exception(f"monitor has to ignore {container}")

    # run the whole stuff
    # NOTE: asyncio.run() cannot be used directly on asyncio.gather(...)
    # as it expects a coroutine, and gather() returns a future
    coroutines = [mon.co_run(self.idle, self.lingering) for mon in monitoreds]
    asyncio.get_event_loop().run_until_complete(
        asyncio.gather(*coroutines))

    self.system_containers = len(monitoreds)
    self.system_kernels = sum((mon.nb_kernels or 0) for mon in monitoreds)
def run_forever(self):
    """
    main loop of the monitor - never returns

    at startup, records a known-counts line in the Stats of each course
    then runs one cycle every self.period seconds; any exception escaping
    run_once() is logged and the loop carries on
    """
    # one cycle can take some time as all the jupyters need to be http-probed
    # so deadlines are computed from the start time, and the actual time
    # to wait is whatever remains until the next deadline
    deadline = time.time()

    logger.info("nbh-monitor is starting up")
    for course in CourseDir.objects.all():
        Stats(course.coursename).record_monitor_known_counts_line()

    while True:
        try:
            self.run_once()
        except Exception:
            # just be extra sure it doesn't crash
            logger.exception("Unexpected error")
        deadline += self.period
        # never negative: when a cycle overruns its period, do not wait at all
        remaining = int(deadline - time.time())
        duration = max(0, remaining)
        logger.info(f"monitor is waiting for {duration}s")
        time.sleep(duration)
def _gather_system_facts(self, figures_by_course):
    """
    probe host-level facts: disk space, cpu loads and memory

    figures_by_course is not used in this method

    returns a tuple (disk_spaces, loads, memory)
    * disk_spaces: a dict keyed on 'container', 'nbhosting' and 'system'
      each value is a dict with
      - 'percent': free blocks as a rounded percentage of all blocks
      - 'free': free space, rounded, in MiB
    * loads: a dict with keys load1, load5, load15
      each one being the load average times 100, rounded
    * memory: a dict with keys memory_total, memory_free, memory_available
      expressed in bytes, as read from /proc/meminfo

    each probe is best-effort: a failure gets logged
    and the corresponding values are reported as 0

    side effect: the podman graphroot is looked up on the first call
    and cached in self._graphroot
    """
    # ds stands for disk_space
    if self._graphroot is None:
        with podman.ApiConnection(podman_url) as podman_api:
            self._graphroot = podman.system.info(podman_api)['store']['graphRoot']
    roots = (
        ('container', self._graphroot),
        ('nbhosting', sitesettings.nbhroot),
        ('system', "/"),
    )
    disk_spaces = {}
    for name, root in roots:
        try:
            stat = os.statvfs(root)
            disk_spaces[name] = {
                'percent': round(100 * stat.f_bfree / stat.f_blocks),
                # unit is MiB
                'free': round((stat.f_bfree * stat.f_bsize) / (1024**2)),
            }
        except Exception:
            disk_spaces[name] = {'free': 0, 'percent': 0}
            logger.exception(
                f"monitor cannot compute disk space {name} on {root}")

    # loads
    # os.getloadavg() avoids forking uptime and parsing
    # its output, whose layout is locale-dependent
    try:
        load1, load5, load15 = (round(100 * load) for load in os.getloadavg())
    except Exception:
        load1, load5, load15 = 0, 0, 0
        logger.exception("monitor cannot compute cpu loads")
    loads = dict(load1=load1, load5=load5, load15=load15)

    # memory from /proc/meminfo
    # lines are located through their label rather than their position,
    # so that a missing MemAvailable line cannot shift the values around
    keys_by_label = {
        'MemTotal:': 'memory_total',
        'MemFree:': 'memory_free',
        'MemAvailable:': 'memory_available',
    }
    memory = dict.fromkeys(keys_by_label.values(), 0)
    try:
        with open("/proc/meminfo") as feed:
            for line in feed:
                fields = line.split()
                key = keys_by_label.get(fields[0]) if fields else None
                if key is None:
                    continue
                _label, value, unit = fields
                if unit == 'kB':
                    memory[key] = int(value) * 1024
                else:
                    logger.warning(f"unexpected unit {unit} in meminfo")
    # not a bare except, so that KeyboardInterrupt
    # and SystemExit can still go through
    except Exception:
        logger.exception("failed to probe memory")
        memory = dict.fromkeys(keys_by_label.values(), 0)

    return disk_spaces, loads, memory
def port_number(self):
    """
    returns the 'hostPort' of the first entry in the container's 'Ports'

    if it cannot be found for any reason, the failure is logged
    and 0 is returned
    """
    try:
        first_port = self.container['Ports'][0]
        return first_port['hostPort']
    except Exception:
        logger.exception(f"Cannot locate port number for {self}")
    return 0