예제 #1
0
    async def count_running_kernels(self):
        """
        Probe the kernels API of the jupyter server attached to this object.

        sets:
        * self.nb_kernels - the number of entries returned by the API
        * self.last_activity - the max of self.last_time(kernel) over all
          returned kernels; 0 when the API lists no kernel, and None
          whenever the probe fails (the original note also mentions None
          with an old jupyter - presumably through self.last_time())

        nothing is touched when self.port_number() returns a falsy value
        """
        port = self.port_number()
        if not port:
            return
        url = f"http://localhost:{port}/{port}/api/kernels?token={self.name}"
        # pessimistic default: this is what callers see if anything below fails
        self.last_activity = None
        try:
            async with aiohttp.ClientSession() as session, \
                       session.get(url) as response:
                payload = await response.text()
            kernels = json.loads(payload)
            self.nb_kernels = len(kernels)
            # default=0 covers the case where no kernel is listed: no activity
            self.last_activity = max(
                (self.last_time(kernel) for kernel in kernels),
                default=0)

        # this somehow tends to happen a lot sometimes
        # until we figure it out, let's make it less conspicuous
        except ClientConnectionError:
            logger.info(f"could not reach warming up {url} for last activity")

        # anything else is unexpected: keep the traceback in the logs
        except Exception:
            logger.exception(f"Cannot probe number of kernels with {self} - unhandled exception")
예제 #2
0
    def run_forever(self):
        """
        Run self.run_once() in an endless loop, one cycle every
        self.period seconds.

        Before the first cycle, one Stats 'known counts' line is recorded
        for every CourseDir. Exceptions raised by a cycle are logged and
        swallowed so that the loop keeps going. This method never returns.
        """
        # reference point for the schedule; deadlines are absolute
        # so the time spent inside a cycle does not accumulate as drift
        deadline = time.time()

        logger.info("nbh-monitor is starting up")
        for course in CourseDir.objects.all():
            Stats(course.coursename).record_monitor_known_counts_line()

        while True:
            try:
                self.run_once()
            # just be extra sure it doesn't crash
            except Exception:
                logger.exception("Unexpected error")
            # one cycle can take some time as all the jupyters need to be
            # http-probed, so wait only for what is left until the next deadline
            deadline += self.period
            remaining = deadline - time.time()
            duration = int(remaining) if remaining > 0 else 0
            logger.info(f"monitor is waiting for {duration}s")
            time.sleep(duration)
예제 #3
0
    def _scan_containers(self, figures_by_course):
        """
        List all podman containers, and run the monitoring coroutine
        on each one that belongs to a known course.

        figures_by_course: a dictionary coursename -> CourseFigures;
          it is updated in place, a fresh CourseFigures() being inserted
          for any course name found in a container name and not yet there

        Container names are expected to read <coursename>-x-<student>;
        containers that do not match, or whose course has no entry in the
        image-hash table, are logged and skipped.

        sets:
        * self.system_containers - number of monitored containers
        * self.system_kernels - sum of their nb_kernels (None counts as 0)
        """
        logger.info(f"monitor cycle with period={self.period//60}' "
                    f"idle={self.idle//60}' "
                    f"lingering={self.lingering//3600}h")
        hash_by_course = {c.coursename : c.image_hash()
                          for c in CourseDir.objects.all()}

        with podman.ApiConnection(podman_url) as podman_api:
            # returns None when no container is found !
            containers = podman.containers.list_containers(podman_api, all=True) or []
        logger.info(f"found {len(hash_by_course)} courses "
                    f"and {len(containers)} containers")

        monitoreds = []
        for container in containers:
            # reset on every iteration, so that the KeyError handler below
            # never sees an unbound name, nor one left over from the
            # previous container
            coursename = None
            try:
                name = container['Names'][0]
                coursename, student = name.split('-x-')
                figures = figures_by_course.setdefault(
                    coursename, CourseFigures())
                # may be None if s/t is misconfigured
                image_hash = (hash_by_course[coursename]
                              or f"hash not found for course {coursename}")
                monitoreds.append(MonitoredJupyter(
                    container, coursename, student,
                    figures, image_hash))
            # typically non-nbhosting containers
            except ValueError:
                # ignore this container as we don't even know
                # in what course it belongs
                logger.info(f"ignoring non-nbhosting {container}")
            except KeyError:
                if coursename is None:
                    # the container has no 'Names' entry at all: we cannot
                    # tell its course either, treat it like the case above
                    logger.info(f"ignoring non-nbhosting {container}")
                else:
                    # typically hash_by_course[coursename] is failing
                    # this may happen when a course gets outdated
                    logger.info(f"ignoring container {container} - "
                                f"can't find image hash for {coursename}")
            except Exception:
                logger.exception(f"monitor has to ignore {container}")

        # run the whole stuff
        futures = [mon.co_run(self.idle, self.lingering)
                   for mon in monitoreds]

        # asyncio.run() cannot be fed the gather() result directly, as it
        # expects a coroutine and gather() returns a future
        # NOTE(review): calling get_event_loop() without a running loop is
        # deprecated in recent Python releases - to be revisited
        asyncio.get_event_loop().run_until_complete(
            asyncio.gather(*futures))

        self.system_containers = len(monitoreds)
        self.system_kernels = sum((mon.nb_kernels or 0) for mon in monitoreds)
예제 #4
0
    async def _co_run(self, idle, lingering):
        """
        Inspect this container and dispose of it when it is not useful.

        both timeouts are in seconds:
        * idle: a running container whose last activity is at least
          that old gets killed
        * lingering: a container created more than that long ago gets
          killed, whatever its recent activity
        """
        now = time.time()
        self.reload()
        # inspection remains None on InternalServerError
        if self.inspection is None:
            logger.info(f"BLIP weirdo (0) {self.name} - cannot inspect - ignored")
            return

        status = self.inspection['State']['Status']

        if status != 'running':
            if status in ('stopped', 'configured'):
                # leftovers: these get removed
                logger.info(f"BLIP weirdo (1) {self.name} - removing")
                logger.info(f"BLIP weirdo (1) detailed state was {self.inspection['State']}")
                self.remove_container()
                return
            # any other non-running state is just ignored
            logger.info(f"BLIP weirdo (2) {self.name} - ignoring")
            logger.info(f"BLIP weirdo (2) detailed state was {self.inspection['State']}")
            return

        # count number of kernels and last activity
        # last_activity may be 0 if no kernel is running inside that container
        # or None if we could not determine it properly
        await self.count_running_kernels()
        if self.last_activity is None:
            # an unreachable container may be one that is just taking off
            # as unlikely as that sounds, it actually tends to happen much more
            # often than I at least had foreseen at first
            logger.info(f"unreachable (1) {self} - will try again in {GRACE}s")
            await asyncio.sleep(GRACE)
            await self.count_running_kernels()
            if self.last_activity is None:
                logger.info(f"Killing unreachable (2) {self}")
                self.kill_container()
                return

        # how long ago the last activity was; computed once, before any
        # retry below, and reused as-is in the lingering message
        inactive_for = now - self.last_activity
        idle_minutes = int(inactive_for // 60)
        if inactive_for >= idle:
            if self.last_activity != 0:
                logger.info(
                    f"Killing (running & idle) {self} "
                    f"that has been idle for {idle_minutes} mn")
                self.kill_container()
                return
            # no kernel at all: give the container one more chance
            logger.info(f"running and empty (1) {self} - will try again in {GRACE}s")
            await asyncio.sleep(GRACE)
            await self.count_running_kernels()
            if self.last_activity == 0:
                logger.info(
                    f"Killing (running and empty) (2) {self} "
                    f"that has no kernel attached")
                self.kill_container()
                return
        else:
            logger.debug(
                f"Sparing running {self} that had activity {idle_minutes} mn ago")
            self.figures.count_container(True, self.nb_kernels)

        # if students accidentally leave stuff running in the background
        # last_activity may be misleading
        # so we kill all containers older than <lingering>
        # the unit here is seconds but the front CLI has it in hours
        age = int(now - self.creation_time())
        if age > lingering:
            created_days, created_hours = divmod(age // 3600, 24)
            logger.warning(
                f"Removing lingering {self} "
                f"that was created {created_days} days "
                f"{created_hours} hours ago (idle_minutes={idle_minutes})")
            self.kill_container()