예제 #1
0
    async def _handle_new_job(self, message: BackendNewJob):
        """
        Handle a new job request received from the backend.

        The job is first recorded in the internal running-job table (with the flag set to
        ``False``: no ssh info sent yet) and the backend is told that the job has started. The
        requested environment and the task files are then checked, and the actual work is
        delegated to ``new_job``.

        Failures are reported to the backend as a "crash" result through ``send_job_result``,
        except ``JobNotRunningException`` which is only logged. Exceptions that do not derive
        from ``Exception`` (task cancellation on recent Python versions, ``KeyboardInterrupt``,
        ``SystemExit``) are deliberately left to propagate.

        :param message: the BackendNewJob message describing the job to run
        """
        # Shared by the two handlers below that hide the real cause from the user
        generic_error = ("An unknown error occured in the agent. Please contact your course "
                         "administrator.")

        self._logger.info("Received request for jobid %s", message.job_id)

        # For send_job_result internal checks
        self.__running_job[message.job_id] = False  # no ssh info sent

        # Tell the backend we started running the job
        await ZMQUtils.send(self.__backend_socket, AgentJobStarted(message.job_id))

        try:
            if message.environment not in self.environments:
                self._logger.warning("Task %s/%s ask for an unknown environment %s (not in aliases)", message.course_id, message.task_id,
                                     message.environment)
                raise CannotCreateJobException('This environment is not available in this agent. Please contact your course administrator.')

            task_fs = self._tasks_filesystem.from_subfolder(message.course_id).from_subfolder(message.task_id)
            if not task_fs.exists():
                self._logger.warning("Task %s/%s unavailable on this agent", message.course_id, message.task_id)
                raise CannotCreateJobException('Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                                               'persists, please contact your course administrator.')

            # Let the subclass run the job
            await self.new_job(message)
        except CannotCreateJobException as e:
            # Expected failure: forward the explanation carried by the exception
            await self.send_job_result(message.job_id, "crash", e.message)
        except TooManyCallsException:
            self._logger.exception("TooManyCallsException in new_job")
            await self.send_job_result(message.job_id, "crash", generic_error)
        except JobNotRunningException:
            # Only logged: no result is sent for this case
            self._logger.exception("JobNotRunningException in new_job")
        except Exception:
            # Not a bare "except:": cancellation and interpreter-exit requests must not be
            # swallowed and turned into a job crash.
            self._logger.exception("Unknown exception in new_job")
            await self.send_job_result(message.job_id, "crash", generic_error)
예제 #2
0
    async def handle_new_job(self, message: BackendNewJob):
        """
        Handles a new job: starts the grading container.

        The request is validated (task files present, memory limit not above the per-slot
        maximum, known environment, free port when ssh debugging is requested). A temporary
        directory is then created and filled with a copy of the task, and the container is
        created and started. Once it is started, ``handle_running_container`` is scheduled, the
        container is registered through the killer/watcher push socket and ``AgentJobStarted``
        is sent to the backend.

        Every anticipated failure is reported to the backend as a "crash" result and releases
        the temporary directory and the ssh port acquired so far. Any other ``Exception`` is
        logged and not re-raised.

        :param message: the BackendNewJob message describing the job to start
        """
        # Resources acquired along the way; read by _abort_job to know what must be released
        ssh_port = None
        container_path = None

        async def _abort_job(reason):
            """Report a crash to the backend, then release the temp directory and the ssh port."""
            await self.send_job_result(message.job_id, "crash", reason)
            if container_path is not None:
                await self._loop.run_in_executor(
                    None, lambda: rmtree(container_path))
            if ssh_port is not None:
                self.ssh_ports.add(ssh_port)

        try:
            self._logger.info("Received request for jobid %s", message.job_id)

            course_id = message.course_id
            task_id = message.task_id

            debug = message.debug
            environment_name = message.environment
            enable_network = message.enable_network
            time_limit = message.time_limit
            # Falls back to three times the soft limit when no hard limit is given
            hard_time_limit = message.hard_time_limit or time_limit * 3
            mem_limit = message.mem_limit

            task_fs = self.tasks_fs.from_subfolder(course_id).from_subfolder(
                task_id)
            if not task_fs.exists():
                self._logger.warning("Task %s/%s unavailable on this agent",
                                     course_id, task_id)
                await _abort_job(
                    'Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                    'persists, please contact your course administrator.')
                return

            # Check for realistic memory limit value
            if mem_limit < 20:
                mem_limit = 20
            elif mem_limit > self._max_memory_per_slot:
                self._logger.warning(
                    "Task %s/%s ask for too much memory (%dMB)! Available: %dMB",
                    course_id, task_id, mem_limit, self._max_memory_per_slot)
                await _abort_job(
                    'Not enough memory on agent (available: %dMB). Please contact your course administrator.'
                    % self._max_memory_per_slot)
                return

            if environment_name not in self._containers:
                self._logger.warning(
                    "Task %s/%s ask for an unknown environment %s (not in aliases)",
                    course_id, task_id, environment_name)
                await _abort_job(
                    'Unknown container. Please contact your course administrator.'
                )
                return

            environment = self._containers[environment_name]["id"]

            # Handle ssh debugging
            if debug == "ssh":
                # allow 30 minutes of real time.
                time_limit = 30 * 60
                hard_time_limit = 30 * 60

                # select a port
                if not self.ssh_ports:
                    self._logger.warning(
                        "User asked for an ssh debug but no ports are available"
                    )
                    await _abort_job(
                        'No ports are available for SSH debug right now. Please retry later.'
                    )
                    return
                ssh_port = self.ssh_ports.pop()

            # Create directories for storing all the data for the job
            try:
                container_path = tempfile.mkdtemp(dir=self.tmp_dir)
            except Exception as e:
                self._logger.error("Cannot make container temp directory! %s",
                                   str(e),
                                   exc_info=True)
                await _abort_job('Cannot make container temp directory.')
                return

            task_path = os.path.join(container_path,
                                     'task')  # tmp_dir/id/task/
            sockets_path = os.path.join(container_path,
                                        'sockets')  # tmp_dir/id/socket/
            student_path = os.path.join(task_path,
                                        'student')  # tmp_dir/id/task/student/
            systemfiles_path = os.path.join(
                task_path, 'systemfiles')  # tmp_dir/id/task/systemfiles/

            # Create the needed directories. A failure here (disk full, unreadable task file,
            # ...) must not leak the temp directory or the ssh port, and must be reported.
            try:
                os.mkdir(sockets_path)
                os.chmod(container_path, 0o777)
                os.chmod(sockets_path, 0o777)

                # TODO: avoid copy
                await self._loop.run_in_executor(
                    None, lambda: task_fs.copy_from(None, task_path))
                os.chmod(task_path, 0o777)

                if not os.path.exists(student_path):
                    os.mkdir(student_path)
                    os.chmod(student_path, 0o777)
            except Exception as e:
                self._logger.error(
                    "Cannot prepare container directories! %s",
                    str(e),
                    exc_info=True)
                await _abort_job('Cannot prepare container directories.')
                return

            # Run the container
            try:
                container_id = await self._loop.run_in_executor(
                    None, lambda: self._docker.create_container(
                        environment, enable_network, mem_limit, task_path,
                        sockets_path, ssh_port))
            except Exception as e:
                self._logger.warning("Cannot create container! %s",
                                     str(e),
                                     exc_info=True)
                await _abort_job('Cannot create container.')
                return

            # Store info (done before starting the container, so the entries exist as soon as
            # the container runs)
            future_results = asyncio.Future()
            self._containers_running[
                container_id] = message, container_path, future_results
            self._container_for_job[message.job_id] = container_id
            self._student_containers_for_job[message.job_id] = set()
            if ssh_port is not None:
                self.running_ssh_debug[container_id] = ssh_port

            try:
                # Start the container
                await self._loop.run_in_executor(
                    None, lambda: self._docker.start_container(container_id))
            except Exception as e:
                self._logger.warning("Cannot start container! %s",
                                     str(e),
                                     exc_info=True)
                try:
                    await _abort_job('Cannot start container')
                finally:
                    # Undo the bookkeeping registered above so that no stale entry outlives
                    # the failed job.
                    # NOTE(review): the created (never started) container itself is not
                    # removed here - confirm whether it is cleaned up elsewhere.
                    self._containers_running.pop(container_id, None)
                    self._container_for_job.pop(message.job_id, None)
                    self._student_containers_for_job.pop(message.job_id, None)
                    self.running_ssh_debug.pop(container_id, None)
                return

            # Talk to the container
            self._loop.create_task(
                self.handle_running_container(
                    message.job_id, container_id, message.inputdata, debug,
                    ssh_port, environment_name, mem_limit, time_limit,
                    hard_time_limit, sockets_path, student_path,
                    systemfiles_path, future_results))

            # Ask the "cgroup" thread to verify the timeout/memory limit
            await ZMQUtils.send(
                self._killer_watcher_push.get_push_socket(),
                KWPRegisterContainer(container_id, mem_limit, time_limit,
                                     hard_time_limit))

            # Tell the backend/client the job has started
            await ZMQUtils.send(self._backend_socket,
                                AgentJobStarted(message.job_id))
        except Exception:
            # Not a bare "except:": cancellation and interpreter-exit requests must propagate
            # instead of being logged and dropped.
            self._logger.exception("Exception in handle_new_job")