Example #1
    def post(self, *args, **kwargs):
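        """
        Starts a new grading run for the given assignment
        """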
        assignment_id = self.get_assignment_id(**kwargs)

        config_dao = daos.AssignmentConfigDao(self.settings)
        config = config_dao.find_by_id(assignment_id)
        if not config:
            self.abort({"message": "assignment configuration not found"})
            return

        if not self._assert_run_valid(config):
            # _assert_run_valid aborts the request itself, so just return
            return

        run_attrs = {
            **self.body,
            "assignment_id": assignment_id,
            "started_at": get_time(),
            "state": models.GradingRunState.READY,
            "student_jobs_left": len(self.body.get("students_env")),
        }
        run = models.GradingRun(**run_attrs)

        run_dao = daos.GradingRunDao(self.settings)
        run.id = str(run_dao.insert(run).inserted_id)

        if not continue_grading_run(self.settings, run):
            self.abort({"message": "failed to start grading run"}, status=500)
            return

        # trigger schedule event
        tornado.ioloop.IOLoop.current().add_callback(worker_schedule_job,
                                                     self.settings)

        return {"grading_run_id": run.id}
Example #2
def _handle_lost_worker_node(settings, worker, reason="timeout"):
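    """
    Marks a lost worker as dead and fails the job it was executing, if any
    """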
    lost_job_id = worker.running_job_id

    worker.is_alive = False
    worker.running_job_id = None
    worker_dao = WorkerNodeDao(settings)
    worker_dao.update(worker)

    if not lost_job_id:
        logger.critical(
            "worker '{}' went offline unexpectedly on '{}' due to {}".format(
                worker.id, worker.hostname, reason))
        return

    logger.critical("worker '{}' went offline unexpectedly on '{}' while"
                    " executing '{}' due to {}".format(worker.id,
                                                       worker.hostname,
                                                       lost_run_id, reason))

    jobs_dao = GradingJobDao(settings)
    job = jobs_dao.find_by_id(lost_job_id)
    if job is None:
        logger.critical(("worker was reportedly executing job '{}' "
                         "but this job does not exist").format(lost_run_id))
        return

    job.finished_at = get_time()
    job.success = False
    job.results = [{"result": "worker died while executing job"}]
    jobs_dao.update(job)

    tornado.ioloop.IOLoop.current().add_callback(job_update_callback, settings,
                                                 lost_job_id, job.run_id)
Example #3
    def post(self, *args, **kwargs):
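        """
        Registers a worker node and returns the heartbeat interval
        """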
        worker_id = kwargs.get("worker_id")
        hostname = self.body.get("hostname")

        worker_node_dao = daos.WorkerNodeDao(self.settings)

        worker_node = models.WorkerNode(id_=worker_id,
                                        hostname=hostname,
                                        last_seen=get_time(),
                                        is_alive=True)

        dup = worker_node_dao.find_by_id(worker_id)

        if dup is None:
            logger.info("new worker {} joined on {}".format(
                worker_id, hostname))
            worker_node_dao.insert(worker_node)
        elif not dup.is_alive:
            dup.is_alive = True
            logger.info("worker {} alive again on {}".format(
                worker_id, hostname))
            worker_node_dao.update(dup)
        else:
            msg = "worker id '{}' already exists".format(worker_id)
            logger.info(msg)
            self.abort({"message": msg}, status=400)
            return

        return {"heartbeat": self.get_flags()["heartbeat_interval"]}
Example #4
    def handler_register(self, hostname):
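        """
        Registers a worker node over its WebSocket connection
        """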
        if self.worker_id is None:
            return

        worker_node_dao = daos.WorkerNodeDao(self.settings)

        dup = worker_node_dao.find_by_id(self.worker_id)

        if dup is None:
            self.worker_node = models.WorkerNode(
                id_=self.worker_id,
                hostname=hostname,
                last_seen=get_time(),
                is_alive=True,
                use_ws=True,
            )
            logger.info("new worker '{}' joined on '{}'".format(
                self.worker_id, hostname))
            worker_node_dao.insert(self.worker_node)
        elif not dup.is_alive:
            self.worker_node = dup
            self.worker_node.hostname = hostname
            self.worker_node.last_seen = get_time()
            self.worker_node.is_alive = True
            self.worker_node.use_ws = True
            logger.info("worker '{}' alive again on '{}'".format(
                self.worker_id, hostname))
            worker_node_dao.update(self.worker_node)
        else:
            msg = "worker id '{}' already exists".format(self.worker_id)
            logger.info(msg)
            self.send({"success": False})
            self.close(reason=msg, code=1002)
            return

        self.registered = True
        self.get_ws_conn_map()[self.worker_id] = self

        self.send({"success": True})

        # trigger schedule event
        tornado.ioloop.IOLoop.current().add_callback(worker_schedule_job,
                                                     self.settings)
Example #5
def fail_grading_run(settings, run):
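    """
    Marks a grading run as failed
    """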
    if run is None:
        logger.critical("cannot fail a grading run that does not exist")
        return

    run_dao = daos.GradingRunDao(settings)

    run.finished_at = get_time()
    run.state = GradingRunState.FAILED
    run.success = False
    run_dao.update(run)
Example #6
    def post(self, *args, **kwargs):
        """
        Allows workers to update grading job status on completion
        """
        worker_id = kwargs.get("worker_id")
        job_id = self.body.get("grading_job_id")

        grading_job_dao = daos.GradingJobDao(self.settings)
        job = grading_job_dao.find_by_id(job_id)
        if not job:
            self.abort({"message": "job with the given ID not found"})
            return

        job_state = job.get_state()
        if job_state != models.GradingJobState.STARTED:
            logger.critical(
                "job with id '{}' updated when in state '{}'".format(
                    job_id, job_state.value))
            self.abort(
                {"message": "cannot update job that is not in STARTED state"})
            return

        worker_node_dao = daos.WorkerNodeDao(self.settings)
        worker_node = worker_node_dao.find_by_id(worker_id)
        if not worker_node:
            logger.critical(
                "unknown node with ID '{}' successfully updated job".format(
                    worker_id))
            self.abort({"message": ""}, status=404)
            return

        # clear the worker node's job
        worker_node.running_job_id = None
        worker_node.is_alive = True
        worker_node_dao.update(worker_node)

        # finish the job
        job.finished_at = get_time()
        job.results = self.body.get("results")
        job.success = self.body.get("success")
        grading_job_dao.update(job)

        # store the logs
        job_log_dao = daos.GradingJobLogDao(self.settings)
        job_log = models.GradingJobLog(job_id=job_id, **self.body.get("logs"))
        job_log_dao.insert(job_log)

        # thread safe callback
        tornado.ioloop.IOLoop.current().add_callback(job_update_callback,
                                                     self.settings, job_id,
                                                     job.run_id)
Example #7
    def post(self, *args, **kwargs):
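        """
        Allows workers to send heartbeats indicating they are alive
        """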
        worker_id = kwargs.get("worker_id")

        worker_node_dao = daos.WorkerNodeDao(self.settings)
        worker_node = worker_node_dao.find_by_id(worker_id)
        if not worker_node:
            logger.critical(
                "unknown node with ID '{}' successfully sent heartbeat".format(
                    worker_id))
            self.abort({"message": ""}, status=404)
            return

        worker_node.last_seen = get_time()
        worker_node.is_alive = True
        worker_node_dao.update(worker_node)
Example #8
def _prepare_next_job(settings, grading_run, global_job_environ,
                      runtime_job_environ, job_stages, job_type):
    """
    Prepares a job to be submitted to queue
    """
    grading_job_dao = daos.GradingJobDao(settings)
    grading_job = models.GradingJob(job_type=job_type,
                                    run_id=grading_run.id,
                                    queued_at=get_time())
    grading_job.id = str(grading_job_dao.insert(grading_job).inserted_id)

    runtime_job_environ["GRADING_JOB_ID"] = grading_job.id
    grading_job.set_stages(job_stages, global_job_environ, runtime_job_environ)
    grading_job_dao.update(grading_job)

    return grading_job.id
Example #9
def worker_schedule_job(settings):
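    """
    Assigns queued grading jobs to idle WebSocket workers
    """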
    conn_map = settings["WS_CONN_MAP"]
    job_queue = settings["QUEUE"]
    stream_queue = settings["STREAM_QUEUE"]

    grading_job_dao = GradingJobDao(settings)
    worker_node_dao = WorkerNodeDao(settings)

    idle_workers = worker_node_dao.find_by_idleness()
    random.shuffle(idle_workers)

    for idle_worker in idle_workers:
        if idle_worker.use_ws and idle_worker.id in conn_map:
            conn = conn_map[idle_worker.id]

            try:
                grading_job_id = job_queue.pull()
                job_queue.update_all_job_positions(stream_queue)
                grading_job = grading_job_dao.find_by_id(grading_job_id)

                if not grading_job:
                    logger.critical(
                        "found job ID '{}' in queue, but job does not exist".
                        format(grading_job_id))
                    return

                grading_job.started_at = get_time()
                grading_job.worker_id = idle_worker.id
                grading_job_dao.update(grading_job)

                idle_worker.running_job_id = grading_job_id
                idle_worker.jobs_processed += 1
                worker_node_dao.update(idle_worker)

                conn.send({
                    "grading_job_id": grading_job_id,
                    "stages": grading_job.stages
                })

            except Empty:
                # no more jobs available
                return

            except Exception as e:
                logger.critical("failed to assign job to {}: {}".format(
                    idle_worker.id, repr(e)))
Example #10
def worker_heartbeat_callback(settings):
    """
    Checks if any workers went offline (after 2 * heartbeat_interval seconds)
    """
    heartbeat_timestamp = get_time()
    heartbeat_interval = settings["FLAGS"]["heartbeat_interval"]
    conn_map = settings["WS_CONN_MAP"]

    dao = WorkerNodeDao(settings)

    for node in dao.find_by_liveness(alive=True):
        if (heartbeat_timestamp -
                node.last_seen).total_seconds() >= 2 * heartbeat_interval:
            if node.use_ws and node.id in conn_map:
                conn_map[node.id].close()

            _handle_lost_worker_node(settings, node)
Example #11
    def get(self, *args, **kwargs):
        """
        Allows workers to request their next grading job
        """
        worker_id = kwargs.get("worker_id")
        worker_node_dao = daos.WorkerNodeDao(self.settings)
        worker_node = worker_node_dao.find_by_id(worker_id)
        if not worker_node:
            logger.critical(
                "unknown node with ID '{}' successfully requested job".format(
                    worker_id))
            self.abort({"message": ""}, status=404)
            return

        try:
            grading_job_id = self.get_queue().pull()
            self.get_stream_queue().update_job_state(
                grading_job_id, models.GradingJobState.STARTED.name)
            self.get_queue().update_all_job_positions(self.get_stream_queue())
            grading_job_dao = daos.GradingJobDao(self.settings)
            grading_job = grading_job_dao.find_by_id(grading_job_id)
            if not grading_job:
                logger.critical(
                    "found job ID '{}' in queue, but job does not exist".
                    format(grading_job_id))
                self.abort(
                    {"message": "a failure occurred while getting next job"},
                    status=500)
                return

            grading_job.started_at = get_time()
            grading_job.worker_id = worker_id
            grading_job_dao.update(grading_job)

            worker_node.running_job_id = grading_job_id
            worker_node.jobs_processed += 1
            worker_node.is_alive = True
            worker_node_dao.update(worker_node)

            return {
                "grading_job_id": grading_job_id,
                "stages": grading_job.stages
            }
        except Empty:
            self.abort({"message": "no jobs available"}, status=498)
Example #12
    def on_ping(self, data):
        # ping messages serve the same purpose as heartbeat requests
        # from regular HTTP workers

        if self.worker_id is None:
            logger.critical("worker is not initialized")
            return

        worker_node_dao = daos.WorkerNodeDao(self.settings)
        worker_node = worker_node_dao.find_by_id(self.worker_id)

        if not worker_node:
            logger.critical(
                "unknown ws node with ID '{}' successfully sent heartbeat".
                format(self.worker_id))
            return

        worker_node.last_seen = get_time()
        worker_node_dao.update(worker_node)
Example #13
def _finish_grading_run(settings, grading_run):
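    """
    Marks a grading run as successfully finished
    """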
    grading_run_dao = daos.GradingRunDao(settings)
    grading_run.state = GradingRunState.FINISHED
    grading_run.finished_at = get_time()
    grading_run.success = True
    grading_run_dao.update(grading_run)
Example #14
    def handler_job_result(self, grading_job_id, success, results, logs):
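        """
        Handles a grading job result submitted by a worker over its
        WebSocket connection
        """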
        if not self.registered:
            logger.info("worker '{}' submitted before registering".format(
                self.worker_id))
            self.close(reason="submitting before registering", code=1002)
            return

        grading_job_dao = daos.GradingJobDao(self.settings)
        job = grading_job_dao.find_by_id(grading_job_id)

        if not job:
            self.close(reason="job with the given ID not found", code=1002)
            return

        job_state = job.get_state()

        if job_state != models.GradingJobState.STARTED:
            logger.critical(
                "job with id '{}' updated when in state '{}'".format(
                    grading_job_id, job_state.value))
            self.close(reason="cannot update job that is not in STARTED state",
                       code=1002)
            return

        worker_node_dao = daos.WorkerNodeDao(self.settings)
        worker_node = worker_node_dao.find_by_id(self.worker_id)

        if not worker_node:
            msg = "unknown worker '{}' successfully updated job".format(
                self.worker_id)
            logger.critical(msg)
            self.close(reason=msg, code=1002)
            return

        logger.info("worker '{}' submitted job result for job '{}'".format(
            self.worker_id, grading_job_id))

        # clear the worker node's job
        worker_node.running_job_id = None
        worker_node_dao.update(worker_node)

        # finish the job
        job.finished_at = get_time()
        job.results = results
        job.success = success
        grading_job_dao.update(job)

        # store the logs
        job_log_dao = daos.GradingJobLogDao(self.settings)
        job_log = models.GradingJobLog(job_id=grading_job_id, **logs)
        job_log_dao.insert(job_log)

        # thread safe callback
        tornado.ioloop.IOLoop.current().add_callback(job_update_callback,
                                                     self.settings,
                                                     grading_job_id,
                                                     job.run_id)

        # trigger schedule event
        tornado.ioloop.IOLoop.current().add_callback(worker_schedule_job,
                                                     self.settings)