def __init__(self, context, backend_addr, friendly_name, nb_sub_agents, tasks_fs: FileSystemProvider,
             ssh_host=None, ssh_ports=None, tmp_dir="./agent_tmp"):
    """
    Initialize the docker agent: per-job bookkeeping dicts, temp dir, docker
    auto-discovery, SSH-debug configuration, ZMQ sockets and the killer-watcher
    pipeline.

    :param context: ZeroMQ context for this process
    :param backend_addr: address of the backend (for example, "tcp://127.0.0.1:2222")
    :param friendly_name: a string containing a friendly name to identify agent
    :param nb_sub_agents: nb of slots available for this agent
    :param tasks_fs: FileSystemProvider for the course/tasks
    :param ssh_host: hostname/ip/... to which external client should connect to access to an ssh remote debug session
    :param ssh_ports: iterable containing ports to which the docker instance can assign ssh servers (for remote debugging)
    :param tmp_dir: temp dir that is used by the agent to start new containers
    """
    self._logger = logging.getLogger("inginious.agent.docker")
    self._logger.info("Starting agent")
    self._backend_addr = backend_addr
    self._context = context
    self._loop = asyncio.get_event_loop()
    self._friendly_name = friendly_name
    self._nb_sub_agents = nb_sub_agents
    # Per-slot memory budget in MB: total RAM divided evenly over the slots.
    self._max_memory_per_slot = int(psutil.virtual_memory().total / nb_sub_agents / 1024 / 1024)

    # data about running containers
    # container_id -> (job message, container path, future for results)
    self._containers_running = {}
    # student container_id -> (job_id, parent container_id, socket_id, write_stream)
    self._student_containers_running = {}
    # containers that died and are being closed (phase 2 of the closing handlers)
    self._containers_ending = {}
    self._student_containers_ending = {}
    # job_id -> container_id (reverse lookup, used by kill requests)
    self._container_for_job = {}
    # job_id -> set of student container_ids spawned for that job
    self._student_containers_for_job = {}

    self.tmp_dir = tmp_dir
    self.tasks_fs = tasks_fs

    # Delete tmp_dir, and recreate-it again
    try:
        rmtree(tmp_dir)
    except:
        pass

    try:
        os.mkdir(tmp_dir)
    except OSError:
        pass

    # Docker
    self._docker = DockerInterface()

    # Auto discover containers
    self._logger.info("Discovering containers")
    self._containers = self._docker.get_containers()

    # SSH remote debug: if no host was configured, try to guess it from the
    # first discovered container's host IP; without a host, SSH debug is off.
    self.ssh_host = ssh_host
    if self.ssh_host is None and len(self._containers) != 0:
        self._logger.info("Guessing external host IP")
        self.ssh_host = self._docker.get_host_ip(next(iter(self._containers.values()))["id"])
    if self.ssh_host is None:
        self._logger.warning("Cannot find external host IP. Please indicate it in the configuration. Remote SSH debug has been deactivated.")
        ssh_ports = None
    else:
        self._logger.info("External address for SSH remote debug is %s", self.ssh_host)

    # Pool of ports currently free for SSH debug sessions.
    self.ssh_ports = set(ssh_ports) if ssh_ports is not None else set()
    self.running_ssh_debug = {}  # container_id : ssh_port

    # Sockets
    self._backend_socket = self._context.socket(zmq.DEALER)
    self._backend_socket.ipv6 = True
    # Internal pub/sub pair used to forward docker events into the poll loop.
    self._docker_events_publisher = self._context.socket(zmq.PUB)
    self._docker_events_subscriber = self._context.socket(zmq.SUB)

    # Watchers: the "killer pipeline" that enforces time/memory limits.
    self._killer_watcher_push = PipelinePush(context, "agentpush")
    self._killer_watcher_pull = PipelinePull(context, "agentpull")
    self._timeout_watcher = TimeoutWatcher(context, self._docker)

    # container_id -> reason ("killed"/"overflow") for containers we killed.
    self._containers_killed = dict()

    # Poller
    self._poller = Poller()
    self._poller.register(self._backend_socket, zmq.POLLIN)
    self._poller.register(self._docker_events_subscriber, zmq.POLLIN)
    self._poller.register(self._killer_watcher_pull.get_pull_socket(), zmq.POLLIN)
def __init__(self, context, backend_addr, friendly_name, concurrency, tasks_fs: FileSystemProvider,
             ssh_host=None, ssh_ports=None, tmp_dir="./agent_tmp"):
    """
    Initialize the docker agent on top of the generic Agent base class:
    per-job bookkeeping dicts, temp dir, docker auto-discovery, SSH-debug
    configuration and the timeout watcher.

    :param context: ZeroMQ context for this process
    :param backend_addr: address of the backend (for example, "tcp://127.0.0.1:2222")
    :param friendly_name: a string containing a friendly name to identify agent
    :param concurrency: number of simultaneous jobs that can be run by this agent
    :param tasks_fs: FileSystemProvider for the course / tasks
    :param ssh_host: hostname/ip/... to which external client should connect to access to an ssh remote debug session
    :param ssh_ports: iterable containing ports to which the docker instance can assign ssh servers (for remote debugging)
    :param tmp_dir: temp dir that is used by the agent to start new containers
    """
    super(DockerAgent, self).__init__(context, backend_addr, friendly_name, concurrency, tasks_fs)

    self._logger = logging.getLogger("inginious.agent.docker")
    # Per-slot memory budget in MB: total RAM divided evenly over the job slots.
    self._max_memory_per_slot = int(psutil.virtual_memory().total/concurrency/1024/1024)

    # Data about running containers
    # container_id -> running-job data; job_id -> container_id reverse lookup.
    self._containers_running = {}
    self._container_for_job = {}
    # Same bookkeeping for student (sub) containers.
    self._student_containers_running = {}
    self._student_containers_for_job = {}

    self.tasks_fs = tasks_fs

    # container_id -> reason ("killed"/"overflow") for containers we killed.
    self._containers_killed = dict()

    # Temp dir
    self._tmp_dir = tmp_dir

    # Delete tmp_dir, and recreate-it again
    try:
        rmtree(tmp_dir)
    except:
        pass

    try:
        os.mkdir(tmp_dir)
    except OSError:
        pass

    # Docker
    self._docker = DockerInterface()

    # Auto discover containers
    self._logger.info("Discovering containers")
    self._containers = self._docker.get_containers()

    # SSH remote debug: if no host was configured, try to guess it from the
    # first discovered container's host IP; without a host, SSH debug is off.
    self._ssh_host = ssh_host
    if self._ssh_host is None and len(self._containers) != 0:
        self._logger.info("Guessing external host IP")
        self._ssh_host = self._docker.get_host_ip(next(iter(self._containers.values()))["id"])
    if self._ssh_host is None:
        self._logger.warning("Cannot find external host IP. Please indicate it in the configuration. Remote SSH debug has been deactivated.")
        ssh_ports = None
    else:
        self._logger.info("External address for SSH remote debug is %s", self._ssh_host)

    # Pool of ports currently free for SSH debug sessions.
    self._ssh_ports = set(ssh_ports) if ssh_ports is not None else set()
    self._running_ssh_debug = {}  # container_id : ssh_port

    # Watchers
    self._timeout_watcher = TimeoutWatcher(self._docker)
class DockerAgent(object):
    """
    An agent that receives grading jobs from an INGInious backend over ZeroMQ
    and runs them inside Docker containers.

    Lifecycle of a job: ``handle_new_job`` creates and starts a grading
    container; ``handle_running_container`` talks to it over the docker attach
    socket (msgpack messages, length-prefixed); a "killer pipeline"
    (``TimeoutWatcher``) enforces time/memory limits; when the container dies,
    the two-phase ``handle_job_closing_p1``/``p2`` handlers gather results and
    send them back to the backend. Grading containers may spawn student
    sub-containers via ``create_student_container``.
    """

    def __init__(self, context, backend_addr, friendly_name, nb_sub_agents, tasks_fs: FileSystemProvider,
                 ssh_host=None, ssh_ports=None, tmp_dir="./agent_tmp"):
        """
        :param context: ZeroMQ context for this process
        :param backend_addr: address of the backend (for example, "tcp://127.0.0.1:2222")
        :param friendly_name: a string containing a friendly name to identify agent
        :param nb_sub_agents: nb of slots available for this agent
        :param tasks_fs: FileSystemProvider for the course/tasks
        :param ssh_host: hostname/ip/... to which external client should connect to access to an ssh remote debug session
        :param ssh_ports: iterable containing ports to which the docker instance can assign ssh servers (for remote debugging)
        :param tmp_dir: temp dir that is used by the agent to start new containers
        """
        self._logger = logging.getLogger("inginious.agent.docker")
        self._logger.info("Starting agent")
        self._backend_addr = backend_addr
        self._context = context
        self._loop = asyncio.get_event_loop()
        self._friendly_name = friendly_name
        self._nb_sub_agents = nb_sub_agents
        # Per-slot memory budget in MB: total RAM divided evenly over the slots.
        self._max_memory_per_slot = int(psutil.virtual_memory().total / nb_sub_agents / 1024 / 1024)

        # data about running containers
        # container_id -> (job message, container path, future for results)
        self._containers_running = {}
        # student container_id -> (job_id, parent container_id, socket_id, write_stream)
        self._student_containers_running = {}
        # containers that died and are being closed (phase 2 of the closing handlers)
        self._containers_ending = {}
        self._student_containers_ending = {}
        # job_id -> container_id (reverse lookup, used by kill requests)
        self._container_for_job = {}
        # job_id -> set of student container_ids spawned for that job
        self._student_containers_for_job = {}

        self.tmp_dir = tmp_dir
        self.tasks_fs = tasks_fs

        # Delete tmp_dir, and recreate-it again
        try:
            rmtree(tmp_dir)
        except:
            pass

        try:
            os.mkdir(tmp_dir)
        except OSError:
            pass

        # Docker
        self._docker = DockerInterface()

        # Auto discover containers
        self._logger.info("Discovering containers")
        self._containers = self._docker.get_containers()

        # SSH remote debug: if no host was configured, try to guess it from the
        # first discovered container's host IP; without a host, SSH debug is off.
        self.ssh_host = ssh_host
        if self.ssh_host is None and len(self._containers) != 0:
            self._logger.info("Guessing external host IP")
            self.ssh_host = self._docker.get_host_ip(next(iter(self._containers.values()))["id"])
        if self.ssh_host is None:
            self._logger.warning("Cannot find external host IP. Please indicate it in the configuration. Remote SSH debug has been deactivated.")
            ssh_ports = None
        else:
            self._logger.info("External address for SSH remote debug is %s", self.ssh_host)

        # Pool of ports currently free for SSH debug sessions.
        self.ssh_ports = set(ssh_ports) if ssh_ports is not None else set()
        self.running_ssh_debug = {}  # container_id : ssh_port

        # Sockets
        self._backend_socket = self._context.socket(zmq.DEALER)
        self._backend_socket.ipv6 = True
        # Internal pub/sub pair used to forward docker events into the poll loop.
        self._docker_events_publisher = self._context.socket(zmq.PUB)
        self._docker_events_subscriber = self._context.socket(zmq.SUB)

        # Watchers: the "killer pipeline" that enforces time/memory limits.
        self._killer_watcher_push = PipelinePush(context, "agentpush")
        self._killer_watcher_pull = PipelinePull(context, "agentpull")
        self._timeout_watcher = TimeoutWatcher(context, self._docker)

        # container_id -> reason ("killed"/"overflow") for containers we killed.
        self._containers_killed = dict()

        # Poller
        self._poller = Poller()
        self._poller.register(self._backend_socket, zmq.POLLIN)
        self._poller.register(self._docker_events_subscriber, zmq.POLLIN)
        self._poller.register(self._killer_watcher_pull.get_pull_socket(), zmq.POLLIN)

    async def init_watch_docker_events(self):
        """ Init everything needed to watch docker events """
        # Bind the pub/sub pair over inproc so the blocking docker event stream
        # (read in a task) can feed the main poll loop.
        url = "inproc://docker_events"
        self._docker_events_publisher.bind(url)
        self._docker_events_subscriber.connect(url)
        self._docker_events_subscriber.setsockopt(zmq.SUBSCRIBE, b'')
        self._loop.create_task(self._watch_docker_events())

    async def init_watcher_pipe(self):
        """ Init the killer pipeline """
        # Start elements in the pipeline
        self._loop.create_task(self._timeout_watcher.run_pipeline())

        # Link the pipeline
        self._timeout_watcher.link(self._killer_watcher_push)
        # [ if one day we have more watchers, add them here in the pipeline ]
        self._killer_watcher_pull.link(self._timeout_watcher)

    async def _watch_docker_events(self):
        """ Get raw docker events and convert them to more readable objects, and then give them to self._docker_events_subscriber """
        try:
            # Only "die" and "oom" events are relevant to job lifecycle.
            source = AsyncIteratorWrapper(self._docker.event_stream(filters={"event": ["die", "oom"]}))
            async for i in source:
                if i["Type"] == "container" and i["status"] == "die":
                    container_id = i["id"]
                    try:
                        retval = int(i["Actor"]["Attributes"]["exitCode"])
                    except:
                        self._logger.exception("Cannot parse exitCode for container %s", container_id)
                        retval = -1
                    await ZMQUtils.send(self._docker_events_publisher, EventContainerDied(container_id, retval))
                elif i["Type"] == "container" and i["status"] == "oom":
                    await ZMQUtils.send(self._docker_events_publisher, EventContainerOOM(i["id"]))
                else:
                    # The filter above should make this unreachable.
                    raise TypeError(str(i))
        except:
            self._logger.exception("Exception in _watch_docker_events")

    async def handle_backend_message(self, message):
        """Dispatch messages received from clients to the right handlers"""
        message_handlers = {
            BackendNewJob: self.handle_new_job,
            BackendKillJob: self.handle_kill_job,
            Ping: self.handle_ping
        }
        try:
            func = message_handlers[message.__class__]
        except:
            raise TypeError("Unknown message type %s" % message.__class__)
        # Handlers run as tasks so the poll loop is never blocked.
        self._loop.create_task(func(message))

    async def handle_watcher_pipe_message(self, message):
        """Dispatch messages received from the watcher pipe to the right handlers"""
        message_handlers = {
            KWPKilledStatus: self.handle_kwp_killed_status,
            KWPRegisterContainer: self.handle_kwp_register_container
        }
        try:
            func = message_handlers[message.__class__]
        except:
            raise TypeError("Unknown message type %s" % message.__class__)
        self._loop.create_task(func(message))

    async def handle_kwp_killed_status(self, message: KWPKilledStatus):
        """
        Handles the messages returned by the "killer pipeline", that indicates
        if a particular container was killed by an element of the pipeline.
        Gives the message to the right handler.
        """
        # Route to the phase-2 closing handler of whichever kind of container
        # (grading or student) is waiting for this killed-status answer.
        if message.container_id in self._containers_ending:
            self._loop.create_task(self.handle_job_closing_p2(message))
        elif message.container_id in self._student_containers_ending:
            self._loop.create_task(self.handle_student_job_closing_p2(message))

    async def handle_kwp_register_container(self, message: KWPRegisterContainer):
        # ignore
        pass

    async def handle_ping(self, _: Ping):
        """ Handle an Ping message. Pong the backend """
        await ZMQUtils.send(self._backend_socket, Pong())

    async def handle_new_job(self, message: BackendNewJob):
        """
        Handles a new job: starts the grading container
        """
        try:
            self._logger.info("Received request for jobid %s", message.job_id)

            course_id = message.course_id
            task_id = message.task_id

            debug = message.debug
            environment_name = message.environment
            enable_network = message.enable_network
            time_limit = message.time_limit
            # Default hard limit: three times the soft limit.
            hard_time_limit = message.hard_time_limit or time_limit * 3
            mem_limit = message.mem_limit

            task_fs = self.tasks_fs.from_subfolder(course_id).from_subfolder(task_id)

            if not task_fs.exists():
                self._logger.warning("Task %s/%s unavailable on this agent", course_id, task_id)
                await self.send_job_result(message.job_id, "crash",
                                           'Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                                           'persists, please contact your course administrator.')
                return

            # Check for realistic memory limit value
            if mem_limit < 20:
                mem_limit = 20
            elif mem_limit > self._max_memory_per_slot:
                self._logger.warning("Task %s/%s ask for too much memory (%dMB)! Available: %dMB", course_id, task_id,
                                     mem_limit, self._max_memory_per_slot)
                await self.send_job_result(message.job_id, "crash",
                                           'Not enough memory on agent (available: %dMB). Please contact your course administrator.' % self._max_memory_per_slot)
                return

            if environment_name not in self._containers:
                self._logger.warning("Task %s/%s ask for an unknown environment %s (not in aliases)", course_id,
                                     task_id, environment_name)
                await self.send_job_result(message.job_id, "crash",
                                           'Unknown container. Please contact your course administrator.')
                return

            environment = self._containers[environment_name]["id"]

            # Handle ssh debugging
            ssh_port = None
            if debug == "ssh":
                # allow 30 minutes of real time.
                time_limit = 30 * 60
                hard_time_limit = 30 * 60

                # select a port
                if len(self.ssh_ports) == 0:
                    self._logger.warning("User asked for an ssh debug but no ports are available")
                    await self.send_job_result(message.job_id, "crash",
                                               'No ports are available for SSH debug right now. Please retry later.')
                    return
                ssh_port = self.ssh_ports.pop()

            # Create directories for storing all the data for the job
            try:
                container_path = tempfile.mkdtemp(dir=self.tmp_dir)
            except Exception as e:
                self._logger.error("Cannot make container temp directory! %s", str(e), exc_info=True)
                await self.send_job_result(message.job_id, "crash", 'Cannot make container temp directory.')
                # Give the reserved SSH port back to the pool on failure.
                if ssh_port is not None:
                    self.ssh_ports.add(ssh_port)
                return

            task_path = os.path.join(container_path, 'task')  # tmp_dir/id/task/
            sockets_path = os.path.join(container_path, 'sockets')  # tmp_dir/id/socket/
            student_path = os.path.join(task_path, 'student')  # tmp_dir/id/task/student/
            systemfiles_path = os.path.join(task_path, 'systemfiles')  # tmp_dir/id/task/systemfiles/

            # Create the needed directories
            os.mkdir(sockets_path)
            os.chmod(container_path, 0o777)
            os.chmod(sockets_path, 0o777)

            # TODO: avoid copy
            await self._loop.run_in_executor(None, lambda: task_fs.copy_from(None, task_path))
            os.chmod(task_path, 0o777)

            if not os.path.exists(student_path):
                os.mkdir(student_path)
                os.chmod(student_path, 0o777)

            # Run the container
            try:
                container_id = await self._loop.run_in_executor(None, lambda: self._docker.create_container(
                    environment, enable_network, mem_limit, task_path, sockets_path, ssh_port))
            except Exception as e:
                self._logger.warning("Cannot create container! %s", str(e), exc_info=True)
                await self.send_job_result(message.job_id, "crash", 'Cannot create container.')
                await self._loop.run_in_executor(None, lambda: rmtree(container_path))
                if ssh_port is not None:
                    self.ssh_ports.add(ssh_port)
                return

            # Store info
            future_results = asyncio.Future()
            self._containers_running[container_id] = message, container_path, future_results
            self._container_for_job[message.job_id] = container_id
            self._student_containers_for_job[message.job_id] = set()
            if ssh_port is not None:
                self.running_ssh_debug[container_id] = ssh_port

            try:
                # Start the container
                await self._loop.run_in_executor(None, lambda: self._docker.start_container(container_id))
            except Exception as e:
                self._logger.warning("Cannot start container! %s", str(e), exc_info=True)
                await self.send_job_result(message.job_id, "crash", 'Cannot start container')
                await self._loop.run_in_executor(None, lambda: rmtree(container_path))
                if ssh_port is not None:
                    self.ssh_ports.add(ssh_port)
                return

            # Talk to the container
            self._loop.create_task(self.handle_running_container(message.job_id, container_id, message.inputdata,
                                                                 debug, ssh_port, environment_name, mem_limit,
                                                                 time_limit, hard_time_limit, sockets_path,
                                                                 student_path, systemfiles_path, future_results))

            # Ask the "cgroup" thread to verify the timeout/memory limit
            await ZMQUtils.send(self._killer_watcher_push.get_push_socket(),
                                KWPRegisterContainer(container_id, mem_limit, time_limit, hard_time_limit))

            # Tell the backend/client the job has started
            await ZMQUtils.send(self._backend_socket, AgentJobStarted(message.job_id))
        except:
            self._logger.exception("Exception in handle_new_job")

    async def create_student_container(self, job_id, parent_container_id, sockets_path, student_path,
                                       systemfiles_path, socket_id, environment_name, memory_limit, time_limit,
                                       hard_time_limit, share_network, write_stream):
        """
        Creates a new student container.
        :param write_stream: stream on which to write the return value of the container (with a correctly formatted msgpack message)
        """
        try:
            self._logger.debug("Starting new student container... %s %s %s %s", environment_name, memory_limit,
                               time_limit, hard_time_limit)

            if environment_name not in self._containers:
                self._logger.warning("Student container asked for an unknown environment %s (not in aliases)",
                                     environment_name)
                # retval 254 signals failure to the grading container.
                await self._write_to_container_stdin(write_stream, {
                    "type": "run_student_retval",
                    "retval": 254,
                    "socket_id": socket_id
                })
                return

            environment = self._containers[environment_name]["id"]

            try:
                socket_path = os.path.join(sockets_path, str(socket_id) + ".sock")
                container_id = await self._loop.run_in_executor(None, lambda: self._docker.create_container_student(
                    parent_container_id, environment, share_network, memory_limit, student_path, socket_path,
                    systemfiles_path))
            except:
                self._logger.exception("Cannot create student container!")
                await self._write_to_container_stdin(write_stream, {
                    "type": "run_student_retval",
                    "retval": 254,
                    "socket_id": socket_id
                })
                return

            self._student_containers_for_job[job_id].add(container_id)
            self._student_containers_running[container_id] = job_id, parent_container_id, socket_id, write_stream

            # send to the container that the sibling has started
            await self._write_to_container_stdin(write_stream, {
                "type": "run_student_started",
                "socket_id": socket_id
            })

            try:
                await self._loop.run_in_executor(None, lambda: self._docker.start_container(container_id))
            except:
                self._logger.exception("Cannot start student container!")
                await self._write_to_container_stdin(write_stream, {
                    "type": "run_student_retval",
                    "retval": 254,
                    "socket_id": socket_id
                })
                return

            # Ask the "cgroup" thread to verify the timeout/memory limit
            await ZMQUtils.send(self._killer_watcher_push.get_push_socket(),
                                KWPRegisterContainer(container_id, memory_limit, time_limit, hard_time_limit))
        except:
            self._logger.exception("Exception in create_student_container")

    async def _write_to_container_stdin(self, write_stream, message):
        """
        Send a message to the stdin of a container, with the right data
        :param write_stream: asyncio write stream to the stdin of the container
        :param message: dict to be msgpacked and sent
        """
        msg = msgpack.dumps(message, encoding="utf8", use_bin_type=True)
        self._logger.debug("Sending %i bytes to container", len(msg))
        # Length-prefixed framing: 4-byte native-endian length, then payload.
        write_stream.write(struct.pack('I', len(msg)))
        write_stream.write(msg)
        await write_stream.drain()

    async def handle_running_container(self, job_id, container_id, inputdata, debug, ssh_port, orig_env,
                                       orig_memory_limit, orig_time_limit, orig_hard_time_limit, sockets_path,
                                       student_path, systemfiles_path, future_results):
        """ Talk with a container. Sends the initial input. Allows to start student containers """
        sock = await self._loop.run_in_executor(None, lambda: self._docker.attach_to_container(container_id))
        try:
            read_stream, write_stream = await asyncio.open_connection(sock=sock.get_socket())
        except:
            self._logger.exception("Exception occurred while creating read/write stream to container")
            return None

        # Send hello msg
        await self._write_to_container_stdin(write_stream, {"type": "start", "input": inputdata, "debug": debug})

        buffer = bytearray()
        try:
            while not read_stream.at_eof():
                msg_header = await read_stream.readexactly(8)
                # format imposed by docker in the attach endpoint
                outtype, length = struct.unpack_from('>BxxxL', msg_header)
                if length != 0:
                    content = await read_stream.readexactly(length)
                if outtype == 1:  # stdout
                    buffer += content

                if outtype == 2:  # stderr
                    self._logger.debug("Received stderr from containers:\n%s", content)

                # 4 first bytes are the length of the message. If we have a complete message...
                while len(buffer) > 4 and len(buffer) >= 4 + struct.unpack('I', buffer[0:4])[0]:
                    msg_encoded = buffer[4:4 + struct.unpack('I', buffer[0:4])[0]]  # ... get it
                    buffer = buffer[4 + struct.unpack('I', buffer[0:4])[0]:]  # ... withdraw it from the buffer
                    try:
                        msg = msgpack.unpackb(msg_encoded, encoding="utf8", use_list=False)
                        self._logger.debug("Received msg %s from container %s", msg["type"], container_id)
                        if msg["type"] == "run_student":
                            # start a new student container
                            environment = msg["environment"] or orig_env
                            # Student containers may not exceed the parent's limits.
                            memory_limit = min(msg["memory_limit"] or orig_memory_limit, orig_memory_limit)
                            time_limit = min(msg["time_limit"] or orig_time_limit, orig_time_limit)
                            hard_time_limit = min(msg["hard_time_limit"] or orig_hard_time_limit,
                                                  orig_hard_time_limit)
                            share_network = msg["share_network"]
                            socket_id = msg["socket_id"]
                            assert "/" not in socket_id  # ensure task creator do not try to break the agent :-(
                            self._loop.create_task(self.create_student_container(job_id, container_id, sockets_path,
                                                                                 student_path, systemfiles_path,
                                                                                 socket_id, environment, memory_limit,
                                                                                 time_limit, hard_time_limit,
                                                                                 share_network, write_stream))
                        elif msg["type"] == "ssh_key":
                            # send the data to the backend (and client)
                            self._logger.info("%s %s", self.running_ssh_debug[container_id], str(msg))
                            await ZMQUtils.send(self._backend_socket,
                                                AgentJobSSHDebug(job_id, self.ssh_host, ssh_port, msg["ssh_key"]))
                        elif msg["type"] == "result":
                            # last message containing the results of the container
                            future_results.set_result(msg["result"])
                            write_stream.close()
                            sock.close_socket()
                            return  # this is the last message
                    except:
                        self._logger.exception("Received incorrect message from container %s (job id %s)",
                                               container_id, job_id)
                        future_results.set_result(None)
                        write_stream.close()
                        sock.close_socket()
                        return
        except asyncio.IncompleteReadError:
            self._logger.debug("Container output ended with an IncompleteReadError; It was probably killed.")
        except:
            self._logger.exception("Exception while reading container %s output", container_id)

        # EOF without result :-(
        self._logger.warning("Container %s has not given any result", container_id)
        write_stream.close()
        sock.close_socket()
        future_results.set_result(None)

    async def handle_student_job_closing_p1(self, container_id, retval):
        """
        First part of the student container ending handler. Ask the killer pipeline if they killed the container
        that recently died. Do some cleaning.
        """
        try:
            self._logger.debug("Closing student (p1) for %s", container_id)
            try:
                job_id, parent_container_id, socket_id, write_stream = self._student_containers_running[container_id]
                del self._student_containers_running[container_id]
            except:
                self._logger.warning("Student container %s that has finished(p1) was not launched by this agent",
                                     str(container_id), exc_info=True)
                return

            # Delete remaining student containers
            if job_id in self._student_containers_for_job:  # if it does not exists, then the parent container has closed
                self._student_containers_for_job[job_id].remove(container_id)

            self._student_containers_ending[container_id] = (job_id, parent_container_id, socket_id, write_stream,
                                                             retval)

            # Ask the killer pipeline whether it killed this container; p2 runs
            # when the KWPKilledStatus answer comes back.
            await ZMQUtils.send(self._killer_watcher_push.get_push_socket(),
                                KWPKilledStatus(container_id,
                                                self._containers_killed[container_id]
                                                if container_id in self._containers_killed else None))
        except:
            self._logger.exception("Exception in handle_student_job_closing_p1")

    async def handle_student_job_closing_p2(self, killed_msg: KWPKilledStatus):
        """
        Second part of the student container ending handler. Gather results and send them to the grading container
        associated with the job.
        """
        try:
            container_id = killed_msg.container_id
            self._logger.debug("Closing student (p2) for %s", container_id)
            try:
                _, parent_container_id, socket_id, write_stream, retval = self._student_containers_ending[container_id]
                del self._student_containers_ending[container_id]
            except:
                self._logger.warning("Student container %s that has finished(p2) was not launched by this agent",
                                     str(container_id))
                return

            # Killed-by-pipeline reasons override the exit code:
            # 253 = timeout, 252 = memory overflow.
            if killed_msg.killed_result == "timeout":
                retval = 253
            elif killed_msg.killed_result == "overflow":
                retval = 252

            try:
                await self._write_to_container_stdin(write_stream, {
                    "type": "run_student_retval",
                    "retval": retval,
                    "socket_id": socket_id
                })
            except:
                pass  # parent container closed

            # Do not forget to remove the container
            try:
                self._loop.run_in_executor(None, lambda: self._docker.remove_container(container_id))
            except:
                pass  # ignore
        except:
            # NOTE(review): message says p1 but this is the p2 handler.
            self._logger.exception("Exception in handle_student_job_closing_p1")

    async def handle_job_closing_p1(self, container_id, retval):
        """
        First part of the end job handler. Ask the killer pipeline if they killed the container that recently died.
        Do some cleaning.
        """
        try:
            self._logger.debug("Closing (p1) for %s", container_id)
            try:
                message, container_path, future_results = self._containers_running[container_id]
                del self._containers_running[container_id]
            except:
                self._logger.warning("Container %s that has finished(p1) was not launched by this agent",
                                     str(container_id), exc_info=True)
                return

            self._containers_ending[container_id] = (message, container_path, retval, future_results)

            # Close sub containers
            for student_container_id_loop in self._student_containers_for_job[message.job_id]:
                # little hack to ensure the value of student_container_id_loop is copied into the closure
                def close_and_delete(student_container_id=student_container_id_loop):
                    try:
                        self._docker.kill_container(student_container_id)
                        self._docker.remove_container(student_container_id)
                    except:
                        pass  # ignore

                asyncio.ensure_future(self._loop.run_in_executor(None, close_and_delete))
            del self._student_containers_for_job[message.job_id]

            # Allow other container to reuse the ssh port this container has finished to use
            if container_id in self.running_ssh_debug:
                self.ssh_ports.add(self.running_ssh_debug[container_id])
                del self.running_ssh_debug[container_id]

            # Ask the killer pipeline whether it killed this container; p2 runs
            # when the KWPKilledStatus answer comes back.
            await ZMQUtils.send(self._killer_watcher_push.get_push_socket(),
                                KWPKilledStatus(container_id,
                                                self._containers_killed[container_id]
                                                if container_id in self._containers_killed else None))
        except:
            self._logger.exception("Exception in handle_job_closing_p1")

    async def handle_job_closing_p2(self, killed_msg: KWPKilledStatus):
        """
        Second part of the end job handler. Gather results and send them to the backend.
        """
        try:
            container_id = killed_msg.container_id
            self._logger.debug("Closing (p2) for %s", container_id)
            try:
                message, container_path, retval, future_results = self._containers_ending[container_id]
                del self._containers_ending[container_id]
            except:
                self._logger.warning("Container %s that has finished(p2) was not launched by this agent",
                                     str(container_id))
                return

            stdout = ""
            stderr = ""
            result = "crash" if retval == -1 else None
            error_msg = None
            grade = None
            problems = {}
            custom = {}
            tests = {}
            archive = None

            if killed_msg.killed_result is not None:
                result = killed_msg.killed_result

            # If everything did well, continue to retrieve the status from the container
            if result is None:
                # Get logs back
                try:
                    return_value = await future_results

                    # Accepted types for return dict
                    accepted_types = {"stdout": str, "stderr": str, "result": str, "text": str, "grade": float,
                                      "problems": dict, "custom": dict, "tests": dict, "archive": str}

                    # Check dict content
                    for key, item in return_value.items():
                        if not isinstance(item, accepted_types[key]):
                            raise Exception("Feedback file is badly formatted.")
                        elif accepted_types[key] == dict:
                            # Nested dicts are only allowed one level deep, and
                            # sub-keys must be valid identifiers.
                            for sub_key, sub_item in item.items():
                                if not id_checker(sub_key) or isinstance(sub_item, dict):
                                    raise Exception("Feedback file is badly formatted.")

                    # Set output fields
                    stdout = return_value.get("stdout", "")
                    stderr = return_value.get("stderr", "")
                    result = return_value.get("result", "error")
                    error_msg = return_value.get("text", "")
                    grade = return_value.get("grade", None)
                    problems = return_value.get("problems", {})
                    custom = return_value.get("custom", {})
                    tests = return_value.get("tests", {})
                    archive = return_value.get("archive", None)
                    if archive is not None:
                        archive = base64.b64decode(archive)
                except Exception as e:
                    self._logger.exception("Cannot get back output of container %s! (%s)", container_id, str(e))
                    result = "crash"
                    error_msg = 'The grader did not return a readable output : {}'.format(str(e))

            # Default values
            if error_msg is None:
                error_msg = ""
            if grade is None:
                if result == "success":
                    grade = 100.0
                else:
                    grade = 0.0

            # Remove container
            self._loop.run_in_executor(None, lambda: self._docker.remove_container(container_id))

            # Delete folders
            try:
                await self._loop.run_in_executor(None, lambda: rmtree(container_path))
            except PermissionError:
                self._logger.debug("Cannot remove old container path!")
                # todo: run a docker container to force removal

            # Return!
            await self.send_job_result(message.job_id, result, error_msg, grade, problems, tests, custom, archive,
                                       stdout, stderr)

            # Do not forget to remove data from internal state
            del self._container_for_job[message.job_id]
            if container_id in self._containers_killed:
                del self._containers_killed[container_id]
        except:
            self._logger.exception("Exception in handle_job_closing_p2")

    async def handle_kill_job(self, message: BackendKillJob):
        """
        Handles `kill` messages. Kill things.
        """
        try:
            if message.job_id in self._container_for_job:
                # Record the kill reason so the closing handlers report it.
                self._containers_killed[self._container_for_job[message.job_id]] = "killed"
                await self._loop.run_in_executor(None, self._docker.kill_container,
                                                 self._container_for_job[message.job_id])
            else:
                self._logger.warning("Cannot kill container for job %s because it is not running",
                                     str(message.job_id))
        except:
            self._logger.exception("Exception in handle_kill_job")

    async def handle_docker_event(self, message):
        """ Handles events from Docker, notably `die` and `oom` """
        try:
            if type(message) == EventContainerDied:
                if message.container_id in self._containers_running:
                    self._loop.create_task(self.handle_job_closing_p1(message.container_id, message.retval))
                elif message.container_id in self._student_containers_running:
                    self._loop.create_task(self.handle_student_job_closing_p1(message.container_id, message.retval))
            elif type(message) == EventContainerOOM:
                if message.container_id in self._containers_running or message.container_id in self._student_containers_running:
                    self._logger.info("Container %s did OOM, killing it", message.container_id)
                    self._containers_killed[message.container_id] = "overflow"
                    await self._loop.run_in_executor(None, lambda: self._docker.kill_container(message.container_id))
        except:
            self._logger.exception("Exception in handle_docker_event")

    async def send_job_result(self, job_id: BackendJobId, result: str, text: str = "", grade: float = None,
                              problems: Dict[str, SPResult] = None, tests: Dict[str, Any] = None,
                              custom: Dict[str, Any] = None, archive: Optional[bytes] = None,
                              stdout: Optional[str] = None, stderr: Optional[str] = None):
        """ Send the result of a job back to the backend """
        if grade is None:
            if result == "success":
                grade = 100.0
            else:
                grade = 0.0
        if problems is None:
            problems = {}
        if custom is None:
            custom = {}
        if tests is None:
            tests = {}

        await ZMQUtils.send(self._backend_socket, AgentJobDone(job_id, (result, text), round(grade, 2), problems,
                                                               tests, custom, archive, stdout, stderr))

    async def run_dealer(self):
        """ Run the agent """
        self._logger.info("Agent started")
        self._backend_socket.connect(self._backend_addr)

        # Init Docker events watcher
        await self.init_watch_docker_events()

        # Init watcher pipe
        await self.init_watcher_pipe()

        # Tell the backend we are up and have `nb_sub_agents` threads available
        self._logger.info("Saying hello to the backend")
        await ZMQUtils.send(self._backend_socket, AgentHello(self._friendly_name, self._nb_sub_agents,
                                                             self._containers))

        # And then run the agent
        try:
            while True:
                socks = await self._poller.poll()
                socks = dict(socks)

                # New message from backend
                if self._backend_socket in socks:
                    message = await ZMQUtils.recv(self._backend_socket)
                    await self.handle_backend_message(message)

                # New docker event
                if self._docker_events_subscriber in socks:
                    message = await ZMQUtils.recv(self._docker_events_subscriber)
                    await self.handle_docker_event(message)

                # End of watcher pipe
                if self._killer_watcher_pull.get_pull_socket() in socks:
                    message = await ZMQUtils.recv(self._killer_watcher_pull.get_pull_socket())
                    await self.handle_watcher_pipe_message(message)
        except asyncio.CancelledError:
            return
        except KeyboardInterrupt:
            return
class DockerAgent(Agent):
    """
    Agent that runs grading jobs (and their optional student sub-containers)
    inside Docker containers, reporting results back to the backend.
    """

    def __init__(self, context, backend_addr, friendly_name, concurrency, tasks_fs: FileSystemProvider, ssh_host=None, ssh_ports=None, tmp_dir="./agent_tmp"):
        """
        :param context: ZeroMQ context for this process
        :param backend_addr: address of the backend (for example, "tcp://127.0.0.1:2222")
        :param friendly_name: a string containing a friendly name to identify agent
        :param concurrency: number of simultaneous jobs that can be run by this agent
        :param tasks_fs: FileSystemProvider for the course / tasks
        :param ssh_host: hostname/ip/... to which external client should connect to access to an ssh remote debug session
        :param ssh_ports: iterable containing ports to which the docker instance can assign ssh servers (for remote debugging)
        :param tmp_dir: temp dir that is used by the agent to start new containers
        """
        super(DockerAgent, self).__init__(context, backend_addr, friendly_name, concurrency, tasks_fs)
        self._logger = logging.getLogger("inginious.agent.docker")

        # Each job slot gets an equal share of the machine's RAM (in MB).
        self._max_memory_per_slot = int(psutil.virtual_memory().total / concurrency / 1024 / 1024)

        # Data about running containers
        self._containers_running = {}           # container_id -> (message, container_path, future_results)
        self._container_for_job = {}            # job_id -> container_id
        self._student_containers_running = {}   # container_id -> (job_id, parent_container_id, socket_id, write_stream)
        self._student_containers_for_job = {}   # job_id -> set of student container_ids

        self.tasks_fs = tasks_fs

        # container_id -> "killed" / "overflow"; consulted by the closing handlers.
        self._containers_killed = dict()

        # Temp dir
        self._tmp_dir = tmp_dir

        # Delete tmp_dir, and recreate-it again
        try:
            rmtree(tmp_dir)
        except:
            pass

        try:
            os.mkdir(tmp_dir)
        except OSError:
            pass

        # Docker
        self._docker = DockerInterface()

        # Auto discover containers
        self._logger.info("Discovering containers")
        self._containers = self._docker.get_containers()

        # SSH remote debug
        self._ssh_host = ssh_host
        if self._ssh_host is None and len(self._containers) != 0:
            # Ask Docker for the host IP as seen from one of the known images.
            self._logger.info("Guessing external host IP")
            self._ssh_host = self._docker.get_host_ip(next(iter(self._containers.values()))["id"])
        if self._ssh_host is None:
            self._logger.warning("Cannot find external host IP. Please indicate it in the configuration. Remote SSH debug has been deactivated.")
            ssh_ports = None
        else:
            self._logger.info("External address for SSH remote debug is %s", self._ssh_host)
        self._ssh_ports = set(ssh_ports) if ssh_ports is not None else set()
        self._running_ssh_debug = {}  # container_id : ssh_port

        # Watchers
        self._timeout_watcher = TimeoutWatcher(self._docker)

    @property
    def environments(self):
        """The Docker environments (grading images) known to this agent."""
        return self._containers

    async def _watch_docker_events(self):
        """
        Watch the raw Docker event stream (`die` and `oom` events only) and
        dispatch each event to the matching closing/kill handler.
        """
        try:
            source = AsyncIteratorWrapper(self._docker.event_stream(filters={"event": ["die", "oom"]}))
            async for i in source:
                if i["Type"] == "container" and i["status"] == "die":
                    container_id = i["id"]
                    try:
                        retval = int(i["Actor"]["Attributes"]["exitCode"])
                    except:
                        self._logger.exception("Cannot parse exitCode for container %s", container_id)
                        retval = -1

                    # FIX: the old branch for `self._batch_containers_running` was removed;
                    # that attribute does not exist on this class, so a `die` event for an
                    # untracked container raised AttributeError and killed this watcher.
                    # Containers we did not start are now simply ignored.
                    if container_id in self._containers_running:
                        self._loop.create_task(self.handle_job_closing(container_id, retval))
                    elif container_id in self._student_containers_running:
                        self._loop.create_task(self.handle_student_job_closing(container_id, retval))
                elif i["Type"] == "container" and i["status"] == "oom":
                    container_id = i["id"]
                    if container_id in self._containers_running or container_id in self._student_containers_running:
                        self._logger.info("Container %s did OOM, killing it", container_id)
                        self._containers_killed[container_id] = "overflow"
                        try:
                            # FIX: bind container_id as a default argument — the lambda is run
                            # later in an executor, after the loop variable may have changed.
                            self._loop.create_task(self._loop.run_in_executor(None, lambda cid=container_id: self._docker.kill_container(cid)))
                        except:
                            pass  # this call can sometimes fail, and that is normal.
                else:
                    raise TypeError(str(i))
        except:
            self._logger.exception("Exception in _watch_docker_events")

    async def new_job(self, message: BackendNewJob):
        """
        Handles a new job: copies the task to a temp dir, creates and starts the
        grading container, and registers it with the timeout watcher.
        """
        try:
            self._logger.info("Received request for jobid %s", message.job_id)

            course_id = message.course_id
            task_id = message.task_id

            debug = message.debug
            environment_name = message.environment
            enable_network = message.enable_network
            time_limit = message.time_limit
            hard_time_limit = message.hard_time_limit or time_limit * 3
            mem_limit = message.mem_limit

            task_fs = self.tasks_fs.from_subfolder(course_id).from_subfolder(task_id)

            if not task_fs.exists():
                self._logger.warning("Task %s/%s unavailable on this agent", course_id, task_id)
                await self.send_job_result(message.job_id, "crash",
                                           'Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                                           'persists, please contact your course administrator.')
                return

            # Check for realistic memory limit value
            if mem_limit < 20:
                mem_limit = 20
            elif mem_limit > self._max_memory_per_slot:
                self._logger.warning("Task %s/%s ask for too much memory (%dMB)! Available: %dMB", course_id, task_id, mem_limit, self._max_memory_per_slot)
                await self.send_job_result(message.job_id, "crash", 'Not enough memory on agent (available: %dMB). Please contact your course administrator.' % self._max_memory_per_slot)
                return

            if environment_name not in self._containers:
                self._logger.warning("Task %s/%s ask for an unknown environment %s (not in aliases)", course_id, task_id, environment_name)
                await self.send_job_result(message.job_id, "crash", 'Unknown container. Please contact your course administrator.')
                return

            environment = self._containers[environment_name]["id"]

            # Handle ssh debugging
            ssh_port = None
            if debug == "ssh":
                # allow 30 minutes of real time.
                time_limit = 30 * 60
                hard_time_limit = 30 * 60

                # select a port
                if len(self._ssh_ports) == 0:
                    self._logger.warning("User asked for an ssh debug but no ports are available")
                    await self.send_job_result(message.job_id, "crash", 'No ports are available for SSH debug right now. Please retry later.')
                    return
                ssh_port = self._ssh_ports.pop()

            # Create directories for storing all the data for the job
            try:
                container_path = tempfile.mkdtemp(dir=self._tmp_dir)
            except Exception as e:
                self._logger.error("Cannot make container temp directory! %s", str(e), exc_info=True)
                await self.send_job_result(message.job_id, "crash", 'Cannot make container temp directory.')
                if ssh_port is not None:
                    self._ssh_ports.add(ssh_port)  # give the reserved port back
                return

            task_path = os.path.join(container_path, 'task')  # tmp_dir/id/task/
            sockets_path = os.path.join(container_path, 'sockets')  # tmp_dir/id/socket/
            student_path = os.path.join(task_path, 'student')  # tmp_dir/id/task/student/
            systemfiles_path = os.path.join(task_path, 'systemfiles')  # tmp_dir/id/task/systemfiles/

            # Create the needed directories
            os.mkdir(sockets_path)
            os.chmod(container_path, 0o777)
            os.chmod(sockets_path, 0o777)

            # TODO: avoid copy
            await self._loop.run_in_executor(None, lambda: task_fs.copy_from(None, task_path))
            os.chmod(task_path, 0o777)

            if not os.path.exists(student_path):
                os.mkdir(student_path)
                os.chmod(student_path, 0o777)

            # Run the container
            try:
                container_id = await self._loop.run_in_executor(None, lambda: self._docker.create_container(environment, enable_network, mem_limit, task_path, sockets_path, ssh_port))
            except Exception as e:
                self._logger.warning("Cannot create container! %s", str(e), exc_info=True)
                await self.send_job_result(message.job_id, "crash", 'Cannot create container.')
                await self._loop.run_in_executor(None, lambda: rmtree(container_path))
                if ssh_port is not None:
                    self._ssh_ports.add(ssh_port)
                return

            # Store info
            future_results = asyncio.Future()
            self._containers_running[container_id] = message, container_path, future_results
            self._container_for_job[message.job_id] = container_id
            self._student_containers_for_job[message.job_id] = set()
            if ssh_port is not None:
                self._running_ssh_debug[container_id] = ssh_port

            try:
                # Start the container
                await self._loop.run_in_executor(None, lambda: self._docker.start_container(container_id))
            except Exception as e:
                self._logger.warning("Cannot start container! %s", str(e), exc_info=True)
                await self.send_job_result(message.job_id, "crash", 'Cannot start container')
                await self._loop.run_in_executor(None, lambda: rmtree(container_path))
                if ssh_port is not None:
                    self._ssh_ports.add(ssh_port)
                raise CannotCreateJobException('Cannot start container')

            # Talk to the container
            self._loop.create_task(self.handle_running_container(message.job_id, container_id, message.inputdata, debug, ssh_port, environment_name, mem_limit, time_limit, hard_time_limit, sockets_path, student_path, systemfiles_path, future_results))

            # Verify the time limit
            await self._timeout_watcher.register_container(container_id, time_limit, hard_time_limit)
        except:
            self._logger.exception("Exception in new_job")

    async def create_student_container(self, job_id, parent_container_id, sockets_path, student_path, systemfiles_path, socket_id, environment_name, memory_limit, time_limit, hard_time_limit, share_network, write_stream):
        """
        Creates a new student container.

        :param write_stream: stream on which to write the return value of the container (with a correctly formatted msgpack message)
        """
        try:
            self._logger.debug("Starting new student container... %s %s %s %s", environment_name, memory_limit, time_limit, hard_time_limit)
            if environment_name not in self._containers:
                self._logger.warning("Student container asked for an unknown environment %s (not in aliases)", environment_name)
                # retval 254 tells the grading container the sibling could not be run.
                await self._write_to_container_stdin(write_stream, {"type": "run_student_retval", "retval": 254, "socket_id": socket_id})
                return

            environment = self._containers[environment_name]["id"]

            try:
                socket_path = os.path.join(sockets_path, str(socket_id) + ".sock")
                container_id = await self._loop.run_in_executor(None, lambda: self._docker.create_container_student(parent_container_id, environment, share_network, memory_limit, student_path, socket_path, systemfiles_path))
            except:
                self._logger.exception("Cannot create student container!")
                await self._write_to_container_stdin(write_stream, {"type": "run_student_retval", "retval": 254, "socket_id": socket_id})
                return

            self._student_containers_for_job[job_id].add(container_id)
            self._student_containers_running[container_id] = job_id, parent_container_id, socket_id, write_stream

            # send to the container that the sibling has started
            await self._write_to_container_stdin(write_stream, {"type": "run_student_started", "socket_id": socket_id})

            try:
                await self._loop.run_in_executor(None, lambda: self._docker.start_container(container_id))
            except:
                self._logger.exception("Cannot start student container!")
                await self._write_to_container_stdin(write_stream, {"type": "run_student_retval", "retval": 254, "socket_id": socket_id})
                return

            # Verify the time limit
            await self._timeout_watcher.register_container(container_id, time_limit, hard_time_limit)
        except:
            self._logger.exception("Exception in create_student_container")

    async def _write_to_container_stdin(self, write_stream, message):
        """
        Send a message to the stdin of a container, with the right data

        :param write_stream: asyncio write stream to the stdin of the container
        :param message: dict to be msgpacked and sent
        """
        msg = msgpack.dumps(message, encoding="utf8", use_bin_type=True)
        self._logger.debug("Sending %i bytes to container", len(msg))
        # Length-prefixed framing: 4-byte native-endian length, then the payload.
        write_stream.write(struct.pack('I', len(msg)))
        write_stream.write(msg)
        await write_stream.drain()

    async def handle_running_container(self, job_id, container_id, inputdata, debug, ssh_port, orig_env, orig_memory_limit, orig_time_limit, orig_hard_time_limit, sockets_path, student_path, systemfiles_path, future_results):
        """
        Talk with a container. Sends the initial input. Allows to start student containers.

        Reads the Docker multiplexed attach stream, reassembles length-prefixed
        msgpack messages from the container's stdout, and reacts to them
        (`run_student`, `ssh_key`, `result`). The container's result (or None on
        failure) is always delivered through `future_results`.
        """
        sock = await self._loop.run_in_executor(None, lambda: self._docker.attach_to_container(container_id))
        try:
            read_stream, write_stream = await asyncio.open_connection(sock=sock.get_socket())
        except:
            self._logger.exception("Exception occurred while creating read/write stream to container")
            return None

        # Send hello msg
        await self._write_to_container_stdin(write_stream, {"type": "start", "input": inputdata, "debug": debug})

        buffer = bytearray()
        try:
            while not read_stream.at_eof():
                msg_header = await read_stream.readexactly(8)
                outtype, length = struct.unpack_from('>BxxxL', msg_header)  # format imposed by docker in the attach endpoint
                # FIX: only consume/dispatch content when the frame actually carries
                # some — the old code could reuse a stale `content` (or hit an
                # unbound local) on zero-length frames.
                if length != 0:
                    content = await read_stream.readexactly(length)
                    if outtype == 1:  # stdout
                        buffer += content
                    elif outtype == 2:  # stderr
                        self._logger.debug("Received stderr from containers:\n%s", content)

                # 4 first bytes are the length of the message. If we have a complete message...
                while len(buffer) > 4 and len(buffer) >= 4 + struct.unpack('I', buffer[0:4])[0]:
                    msg_encoded = buffer[4:4 + struct.unpack('I', buffer[0:4])[0]]  # ... get it
                    buffer = buffer[4 + struct.unpack('I', buffer[0:4])[0]:]  # ... withdraw it from the buffer
                    try:
                        msg = msgpack.unpackb(msg_encoded, encoding="utf8", use_list=False)
                        self._logger.debug("Received msg %s from container %s", msg["type"], container_id)
                        if msg["type"] == "run_student":
                            # start a new student container; limits are capped by the
                            # limits originally granted to the grading container.
                            environment = msg["environment"] or orig_env
                            memory_limit = min(msg["memory_limit"] or orig_memory_limit, orig_memory_limit)
                            time_limit = min(msg["time_limit"] or orig_time_limit, orig_time_limit)
                            hard_time_limit = min(msg["hard_time_limit"] or orig_hard_time_limit, orig_hard_time_limit)
                            share_network = msg["share_network"]
                            socket_id = msg["socket_id"]
                            assert "/" not in socket_id  # ensure task creator do not try to break the agent :-(
                            self._loop.create_task(self.create_student_container(job_id, container_id, sockets_path, student_path, systemfiles_path, socket_id, environment, memory_limit, time_limit, hard_time_limit, share_network, write_stream))
                        elif msg["type"] == "ssh_key":
                            # send the data to the backend (and client)
                            self._logger.info("%s %s", self._running_ssh_debug[container_id], str(msg))
                            await self.send_ssh_job_info(job_id, self._ssh_host, ssh_port, msg["ssh_key"])
                        elif msg["type"] == "result":
                            # last message containing the results of the container
                            future_results.set_result(msg["result"])
                            write_stream.close()
                            sock.close_socket()
                            return  # this is the last message
                    except:
                        self._logger.exception("Received incorrect message from container %s (job id %s)", container_id, job_id)
                        future_results.set_result(None)
                        write_stream.close()
                        sock.close_socket()
                        return
        except asyncio.IncompleteReadError:
            self._logger.debug("Container output ended with an IncompleteReadError; It was probably killed.")
        except:
            self._logger.exception("Exception while reading container %s output", container_id)

        # EOF without result :-(
        self._logger.warning("Container %s has not given any result", container_id)
        write_stream.close()
        sock.close_socket()
        future_results.set_result(None)

    async def handle_student_job_closing(self, container_id, retval):
        """
        Handle a closing student container. Do some cleaning, verify memory limits,
        timeouts, ... and returns data to the associated grading container.
        """
        try:
            self._logger.debug("Closing student %s", container_id)
            try:
                job_id, parent_container_id, socket_id, write_stream = self._student_containers_running[container_id]
                del self._student_containers_running[container_id]
            except:
                self._logger.warning("Student container %s that has finished(p1) was not launched by this agent", str(container_id), exc_info=True)
                return

            # Delete remaining student containers
            if job_id in self._student_containers_for_job:  # if it does not exists, then the parent container has closed
                self._student_containers_for_job[job_id].remove(container_id)

            # A client kill / OOM recorded in _containers_killed overrides the
            # timeout-watcher verdict.
            killed = await self._timeout_watcher.was_killed(container_id)
            if container_id in self._containers_killed:
                killed = self._containers_killed[container_id]
                del self._containers_killed[container_id]

            if killed == "timeout":
                retval = 253
            elif killed == "overflow":
                retval = 252

            try:
                await self._write_to_container_stdin(write_stream, {"type": "run_student_retval", "retval": retval, "socket_id": socket_id})
            except:
                pass  # parent container closed

            # Do not forget to remove the container
            try:
                self._loop.run_in_executor(None, lambda: self._docker.remove_container(container_id))
            except:
                pass  # ignore
        except:
            self._logger.exception("Exception in handle_student_job_closing")

    async def handle_job_closing(self, container_id, retval):
        """
        Handle a closing grading container. Do some cleaning, verify memory limits,
        timeouts, ... and returns data to the backend.
        """
        try:
            self._logger.debug("Closing %s", container_id)
            try:
                message, container_path, future_results = self._containers_running[container_id]
                del self._containers_running[container_id]
            except:
                self._logger.warning("Container %s that has finished(p1) was not launched by this agent", str(container_id), exc_info=True)
                return

            # Close sub containers
            for student_container_id_loop in self._student_containers_for_job[message.job_id]:
                # little hack to ensure the value of student_container_id_loop is copied into the closure
                def close_and_delete(student_container_id=student_container_id_loop):
                    try:
                        self._docker.kill_container(student_container_id)
                        self._docker.remove_container(student_container_id)
                    except:
                        pass  # ignore
                asyncio.ensure_future(self._loop.run_in_executor(None, close_and_delete))
            del self._student_containers_for_job[message.job_id]

            # Allow other container to reuse the ssh port this container has finished to use
            if container_id in self._running_ssh_debug:
                self._ssh_ports.add(self._running_ssh_debug[container_id])
                del self._running_ssh_debug[container_id]

            # Verify if the container was killed, either by the client, by an OOM or by a timeout
            killed = await self._timeout_watcher.was_killed(container_id)
            if container_id in self._containers_killed:
                killed = self._containers_killed[container_id]
                # FIX: a stray `self._ssh_ports.add(self._running_ssh_debug[container_id])`
                # was removed here: the entry was already popped (and the port returned)
                # just above, so it raised KeyError and the job result was never sent.
                del self._containers_killed[container_id]

            stdout = ""
            stderr = ""
            result = "crash" if retval == -1 else None
            error_msg = None
            grade = None
            problems = {}
            custom = {}
            tests = {}
            archive = None

            if killed is not None:
                result = killed

            # If everything did well, continue to retrieve the status from the container
            if result is None:
                # Get logs back
                try:
                    return_value = await future_results

                    # Accepted types for return dict
                    accepted_types = {"stdout": str, "stderr": str, "result": str, "text": str, "grade": float, "problems": dict, "custom": dict, "tests": dict, "archive": str}

                    # Check dict content
                    for key, item in return_value.items():
                        if not isinstance(item, accepted_types[key]):
                            raise Exception("Feedback file is badly formatted.")
                        elif accepted_types[key] == dict:
                            for sub_key, sub_item in item.items():
                                if not id_checker(sub_key) or isinstance(sub_item, dict):
                                    raise Exception("Feedback file is badly formatted.")

                    # Set output fields
                    stdout = return_value.get("stdout", "")
                    stderr = return_value.get("stderr", "")
                    result = return_value.get("result", "error")
                    error_msg = return_value.get("text", "")
                    grade = return_value.get("grade", None)
                    problems = return_value.get("problems", {})
                    custom = return_value.get("custom", {})
                    tests = return_value.get("tests", {})
                    archive = return_value.get("archive", None)
                    if archive is not None:
                        archive = base64.b64decode(archive)
                except Exception as e:
                    self._logger.exception("Cannot get back output of container %s! (%s)", container_id, str(e))
                    result = "crash"
                    error_msg = 'The grader did not return a readable output : {}'.format(str(e))

            # Default values
            if error_msg is None:
                error_msg = ""
            if grade is None:
                if result == "success":
                    grade = 100.0
                else:
                    grade = 0.0

            # Remove container
            self._loop.run_in_executor(None, lambda: self._docker.remove_container(container_id))

            # Delete folders
            try:
                await self._loop.run_in_executor(None, lambda: rmtree(container_path))
            except PermissionError:
                self._logger.debug("Cannot remove old container path!")
                pass  # todo: run a docker container to force removal

            # Return!
            await self.send_job_result(message.job_id, result, error_msg, grade, problems, tests, custom, archive, stdout, stderr)

            # Do not forget to remove data from internal state
            del self._container_for_job[message.job_id]
        except:
            self._logger.exception("Exception in handle_job_closing")

    async def kill_job(self, message: BackendKillJob):
        """
        Handles `kill` messages. Kill things.
        """
        try:
            if message.job_id in self._container_for_job:
                # Record the reason first so handle_job_closing reports "killed",
                # then kill the container in an executor thread (blocking call).
                self._containers_killed[self._container_for_job[message.job_id]] = "killed"
                await self._loop.run_in_executor(None, self._docker.kill_container, self._container_for_job[message.job_id])
            else:
                self._logger.warning("Cannot kill container for job %s because it is not running", str(message.job_id))
        except:
            self._logger.exception("Exception in handle_kill_job")

    async def run(self):
        """Start the Docker-event watcher task, then run the base agent loop."""
        # Init Docker events watcher
        self._loop.create_task(self._watch_docker_events())
        await super(DockerAgent, self).run()