def _write_results(self, figures_by_course, disk_spaces, loads, memory):
    """
    Record the counters gathered during one monitoring cycle
    into each course's stats file.
    """
    # index the course dirs once, so each lookup in the loop is O(1)
    course_index = {}
    for coursedir in CourseDir.objects.all():
        course_index[coursedir.coursename] = coursedir
    # one record_monitor_counts line per course
    for name, figs in figures_by_course.items():
        homes = course_index[name].nb_student_homes()
        Stats(name).record_monitor_counts(
            figs.running_containers,
            figs.frozen_containers,
            figs.running_kernels,
            homes,
            loads['load1'],
            loads['load5'],
            loads['load15'],
            disk_spaces['container']['percent'],
            disk_spaces['container']['free'],
            disk_spaces['nbhosting']['percent'],
            disk_spaces['nbhosting']['free'],
            disk_spaces['system']['percent'],
            disk_spaces['system']['free'],
            memory['memory_total'],
            memory['memory_free'],
            memory['memory_available'],
            self.system_containers,
            self.system_kernels,
        )
async def co_run(self, grace):
    """
    Inspect this jupyter container: account for it in the figures,
    and kill it (and possibly remove it) when it has shown no
    activity within the last <grace> seconds.

    Fix: removed the unused local `nbhroot = Path(sitesettings.nbhroot)`
    that was computed and never read.
    """
    # stopped containers are useful only for statistics
    if self.container.status != 'running':
        self.figures.count_container(False)
        return
    # count number of kernels and last activity
    await self.count_running_kernels()
    # last_activity may be 0 if no kernel is running inside that container
    # or None if we could not determine it properly
    if self.last_activity is None:
        logger.error(
            "skipping container {} with no known last_activity".format(
                self.name))
        return
    # check there has been activity in the last <grace> seconds
    now = time.time()
    grace_past = now - grace
    idle_minutes = (now - self.last_activity) // 60
    if self.last_activity > grace_past:
        # recent activity: spare it, and count its kernels
        logger.debug("sparing {} that had activity {}' ago".format(
            self, idle_minutes))
        self.figures.count_container(True, self.nb_kernels)
    else:
        if self.last_activity:
            logger.info("{} has been idle for {} mn - killing".format(
                self, idle_minutes))
        else:
            logger.info("{} has no kernel attached - killing".format(self))
        # kill it
        self.container.kill()
        # if that container does not run the expected image hash
        # it is because the course image was upgraded in the meanwhile
        # then we even remove the container so it will get re-created
        # next time with the right image this time
        actual_hash = self.container.image.id
        if actual_hash != self.hash:
            logger.info(
                "removing container {} - has hash {} instead of expected {}"
                .format(self.name, actual_hash[:15], self.hash[:15]))
            self.container.remove(v=True)
        # this counts for one dead container
        self.figures.count_container(False)
        # keep track of that removal in events.raw
        Stats(self.course).record_kill_jupyter(self.student)
def run_forever(self):
    """
    Endless monitoring loop: prime the stats files for all known
    courses, then call run_once() every self.period seconds.
    """
    tick = time.time()
    # one cycle can take some time as all the jupyters need to be http-probed
    # so let us compute the actual time to wait
    logger.info("nbh-monitor is starting up")
    for c in CourseDir.objects.all():
        Stats(c.coursename).record_monitor_known_counts_line()
    while True:
        try:
            self.run_once()
        # just be extra sure it doesn't crash
        except Exception:
            # fix: this was f"Unexpected error" - an f-string with no
            # placeholder (ruff F541); a plain literal is correct
            logger.exception("Unexpected error")
        tick += self.period
        duration = max(0, int(tick - time.time()))
        logger.info(f"monitor is waiting for {duration}s")
        time.sleep(duration)
def run_forever(self):
    """
    Main monitoring loop: write one known-counts line per course,
    then scan the containers once per period, forever.
    """
    tick = time.time()
    # one cycle can take some time as all the jupyters need to be http-probed
    # so let us compute the actual time to wait
    logger.info("nbh-monitor is starting up")
    for coursename in CoursesDir().coursenames():
        Stats(coursename).record_monitor_known_counts_line()
    while True:
        # just be extra sure it doesn't crash
        try:
            self.run_once()
        except Exception as e:
            logger.exception(
                "protecting against unexpected exception {}".format(e))
        # schedule relative to the theoretical tick, not to "now",
        # so the period does not drift with cycle duration
        tick += self.period
        remaining = max(0, int(tick - time.time()))
        logger.info("monitor is waiting for {}s".format(remaining))
        time.sleep(remaining)
def _open_notebook(request, coursename, student, notebook,
                   *, forcecopy, init_student_git):  # pylint: disable=r0914
    """
    implement both edx_request and classroom_request
    that behave almost exactly the same

    Locates the notebook (course git repo first, then the student's own
    space), serializes concurrent startups of the same container through
    a redis token, spawns/reuses the container via the `nbh` subcommand,
    and redirects the browser to the jupyter URL.

    Parameters:
        request: incoming django HttpRequest
        coursename, student, notebook: parsed from the URL
        forcecopy: (keyword-only) force a fresh copy of the notebook
            into the student space (reset_from_origin)
        init_student_git: (keyword-only) clone a student git repo
            instead; makes forcecopy a no-op

    Returns:
        an HttpResponseRedirect to the container, or an error page /
        HttpResponseForbidden.
    """
    ok, explanation = authorized(request)
    if not ok:
        return HttpResponseForbidden(
            f"Access denied: {explanation}")
    coursedir = CourseDir.objects.get(coursename=coursename)
    if not coursedir.is_valid():
        return error_page(
            request, coursename, student, notebook,
            f"no such course `{coursename}'", header=True,
        )
    # the ipynb extension is removed from the notebook name in urls.py
    exists, notebook_with_ext, _, is_genuine_notebook = \
        locate_notebook(coursedir.git_dir, notebook)
    # second attempt from the student's space
    # in case the student has created it locally...
    if not exists:
        exists, notebook_with_ext, _, is_genuine_notebook = \
            locate_notebook(coursedir.student_dir(student), notebook)
    if not exists:
        msg = f"notebook `{notebook}' not known in this course or student"
        return error_page(request, coursename, student, notebook,
                          msg, header="notebook not found")
    # deal with concurrent requests on the same container
    # by using a shared memory (a redis cache)
    # starting_containers is the cache name
    # as configured in nbhosting.ini(.in)
    # in devel mode we don't have redis
    redis_cache = None
    try:
        import redis
        # polling interval while another request owns the token
        idling = 0.5
        # just a safety in case our code would not release stuff properly
        expire_in_s = 15

        # NOTE(review): this helper appears unused in this function - confirm
        def my_repr(timedelta):
            return f"{timedelta.seconds}s {timedelta.microseconds}µs"

        redis_cache = redis.Redis()
        container = f'{coursename}-x-{student}'
        # spin until we own the token for that container
        for attempt in itertools.count(1):
            already = redis_cache.get(container)
            # good to go
            if not already:
                logger.info(f"{attempt=} going ahead with {container=} and {notebook=}")
                redis_cache.set(container, b'1')
                redis_cache.expire(container, expire_in_s)
                break
            # has the stored token expired ?
            # NOTE(review): missing space between "started" and "with"
            # in this log message - confirm intended wording
            logger.info(f"{attempt=} waiting for {idling=} because {container} is being started"
                        f"with {container=} and {notebook=}")
            time.sleep(idling)
    except ModuleNotFoundError:
        # make sure this error does not go unnoticed in production
        if not DEBUG:
            raise
        else:
            pass
    subcommand = 'container-view-student-course-notebook'
    # build command
    command = ['nbh', '-d', sitesettings.nbhroot]
    if DEBUG:
        command.append('-x')
    command.append(subcommand)
    # propagate the forcecopy flag for reset_from_origin
    if forcecopy:
        command.append('-f')
    # propagate that a git initialization was requested
    # forcecopy has no effect in this case
    if init_student_git:
        command.append('-g')
        # a student repo gets cloned from local course git
        # for lower delays when updating, and removing issues
        # like accessing private repos from the students space
        ref_giturl = str(coursedir.git_dir)
    else:
        ref_giturl = coursedir.giturl
    # add arguments to the subcommand
    command += [student, coursename, notebook_with_ext,
                coursedir.image, ref_giturl]
    command_str = " ".join(command)
    logger.info(f'edxfront is running (DEBUG={DEBUG}): {command_str}')
    completed = subprocess.run(
        command, universal_newlines=True,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    log_completed_process(completed, subcommand)
    try:
        # expected nbh output: action, container name, port, jupyter token
        action, _container_name, actual_port, jupyter_token = completed.stdout.split()
        if completed.returncode != 0 or action.startswith("failed"):
            message = failed_command_message(
                command_str, completed,
                prefix="failed to spawn notebook container")
            header = failed_command_header(action)
            return error_page(
                request, coursename, student, notebook, message, header)
        # remember that in events file for statistics
        Stats(coursename).record_open_notebook(student, notebook, action, actual_port)
        # redirect with same proto (http or https) as incoming
        scheme = request.scheme
        # get the host part of the incoming URL
        host = request.get_host()
        # remove initial port if present in URL
        if ':' in host:
            host, _ = host.split(':', 1)
        ########## forge a URL that nginx will intercept
        # passing along course and student is for 'reset_from_origin'
        if is_genuine_notebook:
            url = (f"{scheme}://{host}/{actual_port}/notebooks/"
                   f"{notebook_with_ext}?token={jupyter_token}&"
                   f"course={coursename}&student={student}")
        else:
            # not a real .ipynb: open it through jupyterlab instead
            url = (f"{scheme}://{host}/{actual_port}/lab/tree/{notebook_with_ext}")
        logger.info(f"edxfront: redirecting to {url}")
        return HttpResponseRedirect(url)
    except Exception as exc:
        prefix = (f"exception when parsing output of nbh {subcommand}\n"
                  f"{type(exc)}: {exc}")
        message = failed_command_message(command_str, completed, prefix=prefix)
        return error_page(
            request, coursename, student, notebook, message)
    finally:
        # always release the redis token so other requests can proceed
        if redis_cache:
            redis_cache.delete(container)
def send_material_usage(request, course):
    """
    Serve the material-usage figures of *course* as a JSON response,
    with CORS enabled so external pages may fetch it.
    """
    payload = json.dumps(Stats(course).material_usage())
    response = HttpResponse(payload, content_type="application/json")
    # allow cross-origin access (e.g. dashboards hosted elsewhere)
    response['Access-Control-Allow-Origin'] = '*'
    return response
def run_once(self):
    """
    One monitoring pass: scan all docker containers, probe each
    nbhosting jupyter asynchronously, gather disk-space and cpu-load
    figures, and record one stats line per course.
    """
    # initialize all known courses - we want data on courses
    # even if they don't run any container yet
    logger.debug("scanning courses")
    coursesdir = CoursesDir()
    coursenames = coursesdir.coursenames()
    figures_by_course = {coursename: CourseFigures()
                         for coursename in coursenames}
    try:
        proxy = docker.from_env(version='auto')
        logger.debug("scanning containers")
        # all=True: include stopped containers as well
        containers = proxy.containers.list(all=True)
        hash_by_course = {coursename: CourseDir(coursename).image_hash(proxy)
                          for coursename in coursenames}
    except Exception as e:
        # docker daemon unreachable: skip this whole cycle
        logger.exception(
            "Cannot gather containers list at the docker daemon - skipping")
        return
    # a list of async futures
    futures = []
    for container in containers:
        try:
            name = container.name
            # too much spam ven in debug mode
            # logger.debug("dealing with container {}".format(name))
            # nbhosting containers are named <course>-x-<student>;
            # anything else raises ValueError on the unpack below
            coursename, student = name.split('-x-')
            figures_by_course.setdefault(coursename, CourseFigures())
            figures = figures_by_course[coursename]
            # may be None if s/t is misconfigured
            hash = hash_by_course[coursename] \
                or "hash not found for course {}".format(coursename)
            monitored_jupyter = MonitoredJupyter(container, coursename,
                                                 student, figures, hash)
            futures.append(monitored_jupyter.co_run(self.grace))
        # typically non-nbhosting containers
        except ValueError as e:
            # ignore this container as we don't even know
            # in what course it
            logger.info("ignoring non-nbhosting {}".format(container))
        except Exception as e:
            logger.exception(
                "ignoring {} in monitor - unexpected exception".format(
                    container))
    # ds stands for disk_space
    docker_root = proxy.info()['DockerRootDir']
    nbhroot = sitesettings.nbhroot
    system_root = "/"
    ds = {}
    for name, root in (
        ('docker', docker_root),
        ('nbhosting', nbhroot),
        ('system', system_root),
    ):
        ds[name] = {}
        try:
            stat = os.statvfs(root)
            # NOTE(review): computed from f_bfree, so this looks like the
            # percentage of FREE blocks, not used - confirm downstream use
            ds[name]['percent'] = round(100 * stat.f_bfree / stat.f_blocks)
            # unit is MiB
            ds[name]['free'] = round(
                (stat.f_bfree * stat.f_bsize) / (1024**2))
        except Exception as e:
            # best-effort: report zeros rather than failing the cycle
            ds[name]['free'] = 0
            ds[name]['percent'] = 0
            logger.exception(
                "monitor cannot compute disk space with name {} on {}".
                format(name, root))
    # loads
    try:
        # parse the trailing "load average: x, y, z" part of uptime(1)
        uptime_output = subprocess.check_output('uptime').decode().strip()
        end_of_line = uptime_output.split(':')[-1]
        floads = end_of_line.split(', ')
        # stored as integer percentages
        load1, load5, load15 = [round(100 * float(x)) for x in floads]
    except Exception as e:
        load1, load5, load15 = 0, 0, 0
        logger.exception("monitor cannot compute cpu loads")
    # run the whole stuff
    asyncio.get_event_loop().run_until_complete(asyncio.gather(*futures))
    # write results
    for coursename, figures in figures_by_course.items():
        student_homes = CourseDir(coursename).student_homes()
        Stats(coursename).record_monitor_counts(
            figures.running_containers, figures.frozen_containers,
            figures.running_kernels,
            student_homes,
            load1, load5, load15,
            ds['docker']['percent'], ds['docker']['free'],
            ds['nbhosting']['percent'], ds['nbhosting']['free'],
            ds['system']['percent'], ds['system']['free'],
        )
async def co_run(self, idle, unused):
    """
    both timeouts in seconds

    Inspect this container and decide its fate:
    * stopped containers are removed when outdated (wrong image hash)
      or unused for more than <unused> seconds, else just counted;
    * running containers are probed for kernel activity, spared if
      active within <idle> seconds, otherwise killed (and removed
      when their image hash is outdated).

    Fixes: C-style `(int)(x)` casts replaced with `int(x)` calls,
    and a missing space restored in the "running and empty" log message.
    """
    now = time.time()
    actual_hash = self.container.image.id
    # stopped containers need to be handled a bit differently
    if self.container.status != 'running':
        if actual_hash != self.image_hash:
            logger.info(f"Removing (stopped & outdated) {self} "
                        f"that has outdated hash {actual_hash[:15]} "
                        f"vs expected {self.image_hash[:15]}")
            self.container.remove(v=True)
        else:
            exited_time = self.exited_time()
            unused_days = int((now - exited_time) // (24 * 3600))
            unused_hours = int((now - exited_time) // (3600) % 24)
            if (now - exited_time) > unused:
                logger.info(f"Removing (stopped & unused) {self} "
                            f"that has been unused for {unused_days} days "
                            f"{unused_hours} hours")
                self.container.remove(v=True)
            else:
                logger.debug(
                    f"Ignoring stopped {self} that "
                    f"exited {unused_days} days {unused_hours} hours ago")
        # either way, a stopped container counts as one dead container
        self.figures.count_container(False)
        return
    # count number of kernels and last activity
    await self.count_running_kernels()
    # last_activity may be 0 if no kernel is running inside that container
    # or None if we could not determine it properly
    if self.last_activity is None:
        logger.error(
            f"Skipping running {self} with no known last_activity")
        return
    # check there has been activity in the last grace_idle_in_minutes
    idle_minutes = int((now - self.last_activity) // 60)
    if (now - self.last_activity) < idle:
        logger.debug(
            f"Sparing running {self} that had activity {idle_minutes} mn ago"
        )
        self.figures.count_container(True, self.nb_kernels)
    else:
        if self.last_activity:
            logger.info(f"Killing (running & idle) {self} "
                        f"that has been idle for {idle_minutes} mn")
        else:
            logger.info(f"Killing (running and empty) {self} "
                        f"that has no kernel attached")
        # kill it
        self.container.kill()
        # keep track of that removal in events.raw
        Stats(self.course).record_kill_jupyter(self.student)
        # if that container does not run the expected image hash
        # it is because the course image was upgraded in the meanwhile
        # then we even remove the container so it will get re-created
        # next time with the right image this time
        if actual_hash != self.image_hash:
            logger.info(f"Removing (just killed & outdated) {self} "
                        f"that has outdated hash {actual_hash[:15]} "
                        f"vs expected {self.image_hash[:15]}")
            self.container.remove(v=True)
        else:
            # this counts for one dead container
            self.figures.count_container(False)
def send_daily_metrics(request, course):
    """Serve the daily metrics of *course* as a JSON response."""
    payload = Stats(course).daily_metrics()
    return HttpResponse(json.dumps(payload),
                        content_type="application/json")
def send_material_usage(request, course):
    """Serve the material-usage figures of *course* as a JSON response."""
    payload = Stats(course).material_usage()
    return HttpResponse(json.dumps(payload),
                        content_type="application/json")
def send_monitor_counts(request, course):
    """Serve the monitor counts of *course* as a JSON response."""
    payload = Stats(course).monitor_counts()
    return HttpResponse(json.dumps(payload),
                        content_type="application/json")
def edx_request(request, course, student, notebook):
    """
    the main edxfront entry point; it
    * creates a student if needed
    * copies the notebook if needed
    * makes sure the student container is ready to answer http requests
    and then returns a http redirect to /port/<notebook_path>
    """
    # reject unauthorized callers outright
    if not authorized(request):
        return HttpResponseForbidden()
    # the ipynb extension is removed from the notebook name in urls.py
    notebook_withext = notebook + ".ipynb"
    # have we received a request to force the copy (for reset_from_origin)
    forcecopy = request.GET.get('forcecopy', False)
    subcommand = 'docker-view-student-course-notebook'
    # build command
    command = ['nbh', '-d', sitesettings.nbhroot]
    if DEBUG:
        command.append('-x')
    command.append(subcommand)
    # propagate the forcecopy flag for reset_from_origin
    if forcecopy:
        command.append('-f')
    # add arguments to the subcommand
    command += [student, course, notebook_withext]
    logger.info("In {}\n-> Running command {}".format(Path.cwd(), " ".join(command)))
    completed_process = subprocess.run(command, universal_newlines=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    log_completed_process(completed_process, subcommand)
    # non-zero retcod: show the stderr in an error page
    if completed_process.returncode != 0:
        message = "command {} returned {}\nstderr:{}"\
            .format(" ".join(command),
                    completed_process.returncode,
                    completed_process.stderr)
        return error_page(request, course, student, notebook, message)
    try:
        # expected nbh output: action, container name, port, jupyter token
        action, docker_name, actual_port, jupyter_token = completed_process.stdout.split()
        if action.startswith("failed"):
            message = ("failed to spawn notebook container\n"
                       "command {}\nreturned with retcod={} action={}\n"
                       "stdout:{}\n"
                       "stderr:{}").format(" ".join(command),
                                           completed_process.returncode,
                                           action,
                                           completed_process.stdout,
                                           completed_process.stderr)
            return error_page(request, course, student, notebook, message)
        # remember that in events file for statistics
        Stats(course).record_open_notebook(student, notebook, action, actual_port)
        # redirect with same proto (http or https) as incoming
        scheme = request.scheme
        # get the host part of the incoming URL
        host = request.get_host()
        # remove initial port if present in URL
        if ':' in host:
            host, _ = host.split(':', 1)
        ########## forge a URL that nginx will intercept
        # port depends on scheme - we do not specify it
        # passing along course and student is for 'reset_from_origin'
        url = "{scheme}://{host}/{port}/notebooks/{path}?token={token}&course={course}&student={student}"\
            .format(scheme=scheme, host=host, port=actual_port,
                    path=notebook_withext, token=jupyter_token,
                    course=course, student=student)
        logger.info("edxfront: redirecting to {}".format(url))
        # return HttpResponse('<a href="{}">click to be redirected</h1>'.format(url))
        return HttpResponseRedirect(url)
    except Exception as e:
        # nbh output did not have the expected 4-token shape
        message = "exception when parsing output of nbh {}\n{}\n{}"\
            .format(subcommand, completed_process.stdout, e)
        return error_page(request, course, student, notebook, message)
def _open_notebook(request, coursename, student, notebook,
                   *, forcecopy, init_student_git):  # pylint: disable=r0914
    """
    implement both edx_request and classroom_request
    that behave almost exactly the same

    Locates the notebook (course git repo first, then the student's own
    space), spawns/reuses the student's container via the `nbh`
    subcommand, and redirects the browser to the jupyter URL.

    Parameters:
        request: incoming django HttpRequest
        coursename, student, notebook: parsed from the URL
        forcecopy: (keyword-only) force a fresh copy of the notebook
            into the student space (reset_from_origin)
        init_student_git: (keyword-only) clone a student git repo
            instead; makes forcecopy a no-op

    Returns:
        an HttpResponseRedirect to the container, or an error page /
        HttpResponseForbidden.
    """
    ok, explanation = authorized(request)
    if not ok:
        return HttpResponseForbidden(
            f"Access denied: {explanation}")
    coursedir = CourseDir.objects.get(coursename=coursename)
    if not coursedir.is_valid():
        return error_page(
            request, coursename, student, notebook,
            f"no such course `{coursename}'", header=True,
        )
    # the ipynb extension is removed from the notebook name in urls.py
    exists, notebook_with_ext, _, is_genuine_notebook = \
        locate_notebook(coursedir.git_dir, notebook)
    # second attempt from the student's space
    # in case the student has created it locally...
    if not exists:
        exists, notebook_with_ext, _, is_genuine_notebook = \
            locate_notebook(coursedir.student_dir(student), notebook)
    if not exists:
        msg = f"notebook `{notebook}' not known in this course or student"
        return error_page(request, coursename, student, notebook,
                          msg, header="notebook not found")
    subcommand = 'container-view-student-course-notebook'
    # build command
    command = ['nbh', '-d', sitesettings.nbhroot]
    if DEBUG:
        command.append('-x')
    command.append(subcommand)
    # propagate the forcecopy flag for reset_from_origin
    if forcecopy:
        command.append('-f')
    # propagate that a git initialization was requested
    # forcecopy has no effect in this case
    if init_student_git:
        command.append('-g')
        # a student repo gets cloned from local course git
        # for lower delays when updating, and removing issues
        # like accessing private repos from the students space
        ref_giturl = str(coursedir.git_dir)
    else:
        ref_giturl = coursedir.giturl
    # add arguments to the subcommand
    command += [student, coursename, notebook_with_ext,
                coursedir.image, ref_giturl]
    command_str = " ".join(command)
    logger.info(f'edxfront is running: {command_str} DEBUG={DEBUG}')
    completed = subprocess.run(
        command, universal_newlines=True,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    log_completed_process(completed, subcommand)
    try:
        # expected nbh output: action, container name, port, jupyter token
        action, _container_name, actual_port, jupyter_token = completed.stdout.split()
        if completed.returncode != 0 or action.startswith("failed"):
            message = failed_command_message(
                command_str, completed,
                prefix="failed to spawn notebook container")
            header = failed_command_header(action)
            return error_page(
                request, coursename, student, notebook, message, header)
        # remember that in events file for statistics
        Stats(coursename).record_open_notebook(student, notebook, action, actual_port)
        # redirect with same proto (http or https) as incoming
        scheme = request.scheme
        # get the host part of the incoming URL
        host = request.get_host()
        # remove initial port if present in URL
        if ':' in host:
            host, _ = host.split(':', 1)
        ########## forge a URL that nginx will intercept
        # passing along course and student is for 'reset_from_origin'
        if is_genuine_notebook:
            url = (f"{scheme}://{host}/{actual_port}/notebooks/"
                   f"{notebook_with_ext}?token={jupyter_token}&"
                   f"course={coursename}&student={student}")
        else:
            # not a real .ipynb: open it through jupyterlab instead
            url = (f"{scheme}://{host}/{actual_port}/lab/tree/{notebook_with_ext}")
        logger.info(f"edxfront: redirecting to {url}")
        return HttpResponseRedirect(url)
    except Exception as exc:
        # nbh output did not have the expected 4-token shape
        prefix = (f"exception when parsing output of nbh {subcommand}\n"
                  f"{type(exc)}: {exc}")
        message = failed_command_message(command_str, completed, prefix=prefix)
        return error_page(
            request, coursename, student, notebook, message)