async def count_running_kernels(self):
    """
    updates:
    * self.figures with the number of running kernels
    * self.last_activity - an epoch/timestamp/number of seconds;
      may be None if using an old jupyter
    """
    port = self.port_number()
    if not port:
        return
    url = "http://localhost:{}/api/kernels?token={}"\
          .format(port, self.name)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                json_str = await response.text()
        api_kernels = json.loads(json_str)
        self.nb_kernels = len(api_kernels)

        last_times = [
            self.last_time(api_kernel)
            for api_kernel in api_kernels
        ]
        # if last_times is empty (no kernel): no activity
        self.last_activity = max(last_times, default=0)

    except Exception as e:
        logger.exception(
            "Cannot probe number of kernels in {} - {}: {}"
            .format(self, type(e), e))
        self.last_activity = None
def port_number(self):
    try:
        return int(self.container.attrs['NetworkSettings']
                   ['Ports']['8888/tcp'][0]['HostPort'])
    except Exception:
        logger.exception(f"Cannot locate port number for {self}")
        return 0
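# For reference, container.attrs['NetworkSettings']['Ports'] as exposed
# by docker-py mirrors the output of docker inspect, i.e. something like
# (the values here are just an example):
#
#     {'8888/tcp': [{'HostIp': '0.0.0.0', 'HostPort': '32768'}]}
#
# hence the ['8888/tcp'][0]['HostPort'] lookup above.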
@staticmethod
def last_time(kernel_data):
    """
    expects as input the data returned by /api/kernels
    for one kernel, that is to say e.g.:
    {'connections': 1,
     'execution_state': 'idle',
     'id': '15be5b4c-b5f2-46f0-9a9b-ff54f4495cb4',
     'last_activity': '2018-02-19T12:58:25.204761Z',
     'name': 'python3'}

    returns a comparable time (suitable for use with max)
    for when this kernel last did something

    Notes:
    * cases where connections == 0 should not be disregarded;
      it is important to keep those kernels alive, as this
      does not indicate a lack of activity
    * last_activity format: we found some items where the
      milliseconds part was simply not present (at all, i.e.
      not exposed as .0 or anything)
    * if anything goes wrong, it's best to return a timestamp
      that means 'now' rather than the epoch
    """
    try:
        last_activity = kernel_data['last_activity']
        return MonitoredJupyter.parse_docker_time(last_activity)
    except Exception:
        logger.exception(
            f"last_time failed with kernel_data = {kernel_data}")
        # to stay on the safe side, return current time
        return time.time()
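# Illustration only: parse_docker_time is defined elsewhere on
# MonitoredJupyter; a minimal sketch of the logic it needs, assuming it
# maps an ISO-8601 UTC string - with or without the fractional-seconds
# part mentioned above - onto an epoch timestamp comparable with
# time.time(); this is an assumption, not the actual implementation:
#
#     import calendar
#
#     @staticmethod
#     def parse_docker_time(time_string):
#         # try the full format first, then the one without microseconds
#         for time_format in ("%Y-%m-%dT%H:%M:%S.%fZ",
#                             "%Y-%m-%dT%H:%M:%SZ"):
#             try:
#                 # timegm() interprets the struct_time as UTC,
#                 # which matches the trailing 'Z'
#                 return calendar.timegm(
#                     time.strptime(time_string, time_format))
#             except ValueError:
#                 pass
#         raise ValueError(f"unparsable docker time {time_string}")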
def run_forever(self):
    tick = time.time()

    # one cycle can take some time, as all the jupyters need
    # to be http-probed, so let us compute the actual time to wait
    logger.info("nbh-monitor is starting up")
    coursenames = CoursesDir().coursenames()
    for coursename in coursenames:
        Stats(coursename).record_monitor_known_counts_line()
    while True:
        try:
            self.run_once()
        # just be extra sure it doesn't crash
        except Exception:
            logger.exception("Unexpected error")
        tick += self.period
        duration = max(0, int(tick - time.time()))
        logger.info(f"monitor is waiting for {duration}s")
        time.sleep(duration)
def run_once(self):

    # initialize all known courses - we want data on courses
    # even if they don't run any container yet
    logger.debug("scanning courses")
    coursesdir = CoursesDir()
    coursenames = coursesdir.coursenames()
    figures_by_course = {coursename: CourseFigures()
                         for coursename in coursenames}

    try:
        proxy = docker.from_env(version='auto')
        logger.debug("scanning containers")
        containers = proxy.containers.list(all=True)
        hash_by_course = {coursename: CourseDir(coursename).image_hash(proxy)
                          for coursename in coursenames}
    except Exception:
        logger.exception(
            "Cannot gather containers list at the docker daemon - skipping")
        return

    # a list of async futures
    futures = []
    for container in containers:
        try:
            name = container.name
            # too much spam even in debug mode
            # logger.debug("dealing with container {}".format(name))
            coursename, student = name.split('-x-')
            figures_by_course.setdefault(coursename, CourseFigures())
            figures = figures_by_course[coursename]
            # may be None if something is misconfigured
            image_hash = hash_by_course[coursename] \
                or "hash not found for course {}".format(coursename)
            monitored_jupyter = MonitoredJupyter(
                container, coursename, student, figures, image_hash)
            futures.append(monitored_jupyter.co_run(self.grace))
        # typically non-nbhosting containers
        except ValueError:
            # ignore this container, as we don't even know
            # what course it belongs to
            logger.info("ignoring non-nbhosting {}".format(container))
        except Exception:
            logger.exception(
                "ignoring {} in monitor - unexpected exception"
                .format(container))

    # ds stands for disk_space
    docker_root = proxy.info()['DockerRootDir']
    nbhroot = sitesettings.nbhroot
    system_root = "/"
    ds = {}
    for name, root in (
            ('docker', docker_root),
            ('nbhosting', nbhroot),
            ('system', system_root),
    ):
        ds[name] = {}
        try:
            stat = os.statvfs(root)
            # percentage of free space
            ds[name]['percent'] = round(100 * stat.f_bfree / stat.f_blocks)
            # free space, in MiB
            ds[name]['free'] = round(
                (stat.f_bfree * stat.f_bsize) / (1024**2))
        except Exception:
            ds[name]['free'] = 0
            ds[name]['percent'] = 0
            logger.exception(
                "monitor cannot compute disk space with name {} on {}"
                .format(name, root))

    # loads
    try:
        # uptime output ends in e.g. "load average: 0.28, 0.20, 0.17"
        # so grab everything after the last ':' and split on ', '
        uptime_output = subprocess.check_output('uptime').decode().strip()
        end_of_line = uptime_output.split(':')[-1]
        floads = end_of_line.split(', ')
        load1, load5, load15 = [round(100 * float(x)) for x in floads]
    except Exception:
        load1, load5, load15 = 0, 0, 0
        logger.exception("monitor cannot compute cpu loads")

    # run the whole stuff
    asyncio.get_event_loop().run_until_complete(asyncio.gather(*futures))

    # write results
    for coursename, figures in figures_by_course.items():
        student_homes = CourseDir(coursename).student_homes()
        Stats(coursename).record_monitor_counts(
            figures.running_containers, figures.frozen_containers,
            figures.running_kernels,
            student_homes,
            load1, load5, load15,
            ds['docker']['percent'], ds['docker']['free'],
            ds['nbhosting']['percent'], ds['nbhosting']['free'],
            ds['system']['percent'], ds['system']['free'],
        )
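# Illustration only - a plausible entry point, assuming Monitor is
# built from a probing period and a grace delay, both in seconds
# (names and signature here are assumptions, not the actual API):
#
#     monitor = Monitor(period=60, grace=600)
#     monitor.run_forever()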