class Worker(Service):
    """This service implements the ability to compile and evaluate
    submissions in a sandbox. The instructions to follow for the
    operations are in the TaskType classes, while the sandbox is in
    the Sandbox module.

    """

    JOB_TYPE_COMPILATION = "compile"
    JOB_TYPE_EVALUATION = "evaluate"

    def __init__(self, shard):
        Service.__init__(self, shard)
        self.file_cacher = FileCacher(self)

        self.work_lock = gevent.coros.RLock()

    @rpc_method
    def precache_files(self, contest_id):
        """RPC to ask the worker to precache the files of a contest.

        contest_id (int): the id of the contest.

        """
        # In order to avoid a long-living connection, first fetch the
        # complete list of files and then download them; since this is
        # just pre-caching, possible race conditions are not dangerous.
        logger.info("Precaching files for contest %d.", contest_id)
        with SessionGen() as session:
            contest = Contest.get_from_id(contest_id, session)
            files = contest.enumerate_files(skip_submissions=True,
                                            skip_user_tests=True)
        for digest in files:
            try:
                self.file_cacher.load(digest, if_needed=True)
            except KeyError:
                # No problem (at this stage) if we cannot find the file.
                pass
        logger.info("Precaching finished.")

    @rpc_method
    def execute_job(self, job_dict):
        """Receive a job in dict format and execute it.

        job_dict (dict): a dictionary suitable to be imported from Job.

        """
        job = Job.import_from_dict_with_type(job_dict)

        if self.work_lock.acquire(False):
            try:
                logger.info("Starting job.", extra={"operation": job.info})
                job.shard = self.shard
                task_type = get_task_type(job.task_type,
                                          job.task_type_parameters)
                task_type.execute_job(job, self.file_cacher)
                logger.info("Finished job.", extra={"operation": job.info})
                return job.export_to_dict()
            except Exception:
                err_msg = "Worker failed."
                logger.error(err_msg, exc_info=True)
                raise JobException(err_msg)
            finally:
                self.work_lock.release()
        else:
            err_msg = "Request received, but declined because of acquired " \
                "lock (Worker is busy executing another job; this should " \
                "not happen: check if there is more than one ES running, " \
                "or for bugs in ES)."
            logger.warning(err_msg)
            raise JobException(err_msg)
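# A minimal sketch, not part of the class above, of the non-blocking
# lock pattern that execute_job relies on: acquire(False) returns
# immediately instead of waiting, so a busy Worker refuses a second job
# rather than queueing it. The names `guard` and `do_one_job` are
# hypothetical; newer gevent releases expose this lock as
# gevent.lock.RLock instead of gevent.coros.RLock.
import gevent.lock

guard = gevent.lock.RLock()

def do_one_job(work):
    if guard.acquire(False):        # False means "do not block"
        try:
            return work()           # only one unit of work at a time
        finally:
            guard.release()
    raise RuntimeError("busy")      # plays the role of JobException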
class Worker(Service):
    """This service implements the ability to compile and evaluate
    submissions in a sandbox. The instructions to follow for the
    operations are in the TaskType classes, while the sandbox is in
    the Sandbox module.

    """

    JOB_TYPE_COMPILATION = "compile"
    JOB_TYPE_EVALUATION = "evaluate"

    def __init__(self, shard):
        Service.__init__(self, shard)
        self.file_cacher = FileCacher(self)

        self.work_lock = gevent.coros.RLock()
        self._ignore_job = False

    @rpc_method
    def ignore_job(self):
        """RPC that informs the worker that its result for the current
        action will be discarded. The worker will try to return as soon
        as possible, even if this means that the results are
        inconsistent.

        """
        # We remember to quit as soon as possible.
        logger.info("Trying to interrupt job as requested.")
        self._ignore_job = True

    @rpc_method
    def precache_files(self, contest_id):
        """RPC to ask the worker to precache the files of a contest.

        contest_id (int): the id of the contest.

        """
        # A lock is not needed if the admins correctly placed the cache
        # and temp directories in the same filesystem. This is what
        # usually happens, since they are children of the same,
        # cms-created, directory.
        logger.info("Precaching files for contest %d." % contest_id)
        with SessionGen() as session:
            contest = Contest.get_from_id(contest_id, session)
            for digest in contest.enumerate_files(skip_submissions=True,
                                                  skip_user_tests=True):
                self.file_cacher.load(digest)
        logger.info("Precaching finished.")

    @rpc_method
    def execute_job_group(self, job_group_dict):
        """Receive a group of jobs in dict format and execute them one
        by one.

        job_group_dict (dict): a dictionary suitable to be imported
            from JobGroup.

        """
        job_group = JobGroup.import_from_dict(job_group_dict)

        if self.work_lock.acquire(False):
            try:
                self._ignore_job = False

                for k, job in job_group.jobs.iteritems():
                    logger.info("Starting job.",
                                extra={"operation": job.info})

                    job.shard = self.shard

                    # FIXME This is actually kind of a workaround...
                    # The only TaskType that needs it is OutputOnly.
                    job._key = k

                    # FIXME We're creating a new TaskType for each Job
                    # even if, at the moment, a JobGroup always uses
                    # the same TaskType and the same parameters. Yet,
                    # this could change in the future, so the best
                    # solution is to keep a cache of TaskType objects
                    # (like ScoringService does with ScoreTypes, except
                    # that we cannot index by Dataset ID here...).
                    task_type = get_task_type(job.task_type,
                                              job.task_type_parameters)
                    task_type.execute_job(job, self.file_cacher)

                    logger.info("Finished job.",
                                extra={"operation": job.info})

                    if not job.success or self._ignore_job:
                        job_group.success = False
                        break
                else:
                    job_group.success = True

                return job_group.export_to_dict()
            except Exception:
                err_msg = "Worker failed."
                logger.error(err_msg, exc_info=True)
                raise JobException(err_msg)
            finally:
                self.work_lock.release()
        else:
            err_msg = "Request received, but declined because of acquired " \
                "lock (Worker is busy executing another job group; this " \
                "should not happen: check if there is more than one ES " \
                "running, or for bugs in ES)."
            logger.warning(err_msg)
            raise JobException(err_msg)

    @rpc_method
    def rpc_test(self, mes):
        """Debug RPC that simply logs the received message."""
        logger.info(mes)
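# A minimal sketch of the cooperative-interruption pattern behind
# ignore_job above: the RPC only flips a flag, and the job-group loop
# polls it between jobs, so a group is abandoned at job boundaries but
# a single job is never cut short. All names here are hypothetical.
class FlagWorker(object):
    def __init__(self):
        self._ignore_job = False

    def ignore_job(self):
        # The RPC side: just record that results will be discarded.
        self._ignore_job = True

    def run_group(self, jobs):
        self._ignore_job = False        # reset at the start of a group
        for job in jobs:
            job()                       # one job runs to completion
            if self._ignore_job:        # checked only between jobs
                return False            # caller discards the results
        return True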
class Worker(Service):
    """This service implements the ability to compile and evaluate
    submissions in a sandbox. The instructions to follow for the
    operations are in the TaskType classes, while the sandbox is in
    the Sandbox module.

    """

    JOB_TYPE_COMPILATION = "compile"
    JOB_TYPE_EVALUATION = "evaluate"

    def __init__(self, shard, fake_worker_time=None):
        Service.__init__(self, shard)
        self.file_cacher = FileCacher(self)

        self.work_lock = gevent.lock.RLock()
        self._last_end_time = None
        self._total_free_time = 0
        self._total_busy_time = 0
        self._number_execution = 0

        # If set, jobs are not actually executed: _fake_work fills in
        # a successful result after sleeping for this many seconds.
        self._fake_worker_time = fake_worker_time

    @rpc_method
    def precache_files(self, contest_id):
        """RPC to ask the worker to precache the files of a contest.

        contest_id (int): the id of the contest.

        """
        # In order to avoid a long-living connection, first fetch the
        # complete list of files and then download them; since this is
        # just pre-caching, possible race conditions are not dangerous.
        logger.info("Precaching files for contest %d.", contest_id)
        with SessionGen() as session:
            contest = Contest.get_from_id(contest_id, session)
            files = enumerate_files(session, contest,
                                    skip_submissions=True,
                                    skip_user_tests=True,
                                    skip_print_jobs=True)
        for digest in files:
            try:
                self.file_cacher.load(digest, if_needed=True)
            except KeyError:
                # No problem (at this stage) if we cannot find the file.
                pass
        logger.info("Precaching finished.")

    @rpc_method
    def execute_job_group(self, job_group_dict):
        """Receive a group of jobs in dict format and execute them one
        by one.

        job_group_dict ({}): a JobGroup exported to dict.

        return ({}): the same JobGroup in dict format, but containing
            the results.

        """
        start_time = time.time()
        job_group = JobGroup.import_from_dict(job_group_dict)

        if self.work_lock.acquire(False):
            try:
                logger.info("Starting job group.")
                for job in job_group.jobs:
                    logger.info("Starting job.",
                                extra={"operation": job.info})

                    job.shard = self.shard

                    if self._fake_worker_time is None:
                        self._perform_job(job)
                    else:
                        self._fake_work(job)

                    logger.info("Finished job.",
                                extra={"operation": job.info})

                logger.info("Finished job group.")
                return job_group.export_to_dict()
            except Exception as e:
                err_msg = "Worker failed: %s." % e
                logger.error(err_msg, exc_info=True)
                raise JobException(err_msg)
            finally:
                self._finalize(start_time)
                self.work_lock.release()
        else:
            err_msg = "Request received, but declined because of acquired " \
                "lock (Worker is busy executing another job; this should " \
                "not happen: check if there is more than one ES running, " \
                "or for bugs in ES)."
            logger.warning(err_msg)
            self._finalize(start_time)
            raise JobException(err_msg)

    def _perform_job(self, job):
        """Execute the job, retrying an evaluation that barely exceeded
        its time limit (a "slight TLE") up to MAX_TRIES times.

        """
        task_type = get_task_type(job.task_type, job.task_type_parameters)
        tries = 0
        MAX_TRIES = 5
        while tries < MAX_TRIES:
            try:
                task_type.execute_job(job, self.file_cacher)
            except TombstoneError:
                job.success = False
                job.plus = {"tombstone": True}
                return
            if not isinstance(job, EvaluationJob):
                return
            if job.plus is None:
                return
            execution_time = job.plus['execution_time']
            if job.text[0] != 'Execution timed out':
                if tries > 0:
                    logger.info("Took: %s (TL: %s)",
                                execution_time, job.time_limit,
                                extra={"operation": job.info})
                    logger.info("Not a TLE anymore.",
                                extra={"operation": job.info})
                return
            logger.info("Took: %s (TL: %s)",
                        execution_time, job.time_limit,
                        extra={"operation": job.info})
            if execution_time > 1.3 * job.time_limit:
                logger.info("Significant TLE. Not retrying.",
                            extra={"operation": job.info})
                return
            tries += 1
            logger.info("Slight TLE. Retrying (%s of %s).",
                        tries, MAX_TRIES,
                        extra={"operation": job.info})

    def _fake_work(self, job):
        """Fill the job with fake success data after waiting for some
        time.

        """
        time.sleep(self._fake_worker_time)
        job.success = True
        job.text = ["ok"]
        job.plus = {
            "execution_time": self._fake_worker_time,
            "execution_wall_clock_time": self._fake_worker_time,
            "execution_memory": 1000,
        }
        if isinstance(job, CompilationJob):
            job.compilation_success = True
        elif isinstance(job, EvaluationJob):
            job.outcome = "1.0"

    def _finalize(self, start_time):
        """Update and log the busy/free time statistics of this worker.

        start_time (float): timestamp at which the last request started.

        """
        end_time = time.time()
        busy_time = end_time - start_time
        free_time = 0.0
        if self._last_end_time is not None:
            free_time = start_time - self._last_end_time
        self._last_end_time = end_time
        self._total_busy_time += busy_time
        self._total_free_time += free_time
        ratio = self._total_busy_time * 100.0 / \
            (self._total_busy_time + self._total_free_time)
        avg_free_time = 0.0
        if self._number_execution > 0:
            avg_free_time = self._total_free_time / self._number_execution
        avg_busy_time = 0.0
        if self._number_execution > 0:
            avg_busy_time = self._total_busy_time / self._number_execution
        self._number_execution += 1
        logger.info("Executed in %.3lf after free for %.3lf; "
                    "busyness is %.1lf%%; avg free time is %.3lf; "
                    "avg busy time is %.3lf.",
                    busy_time, free_time, ratio,
                    avg_free_time, avg_busy_time)
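# A worked example of the retry rule in _perform_job above: a timed-out
# evaluation is retried only while it exceeded the time limit by at
# most 30% and fewer than MAX_TRIES attempts were made. The helper
# should_retry is hypothetical and only restates the loop's decision.
def should_retry(execution_time, time_limit, tries, max_tries=5):
    if execution_time > 1.3 * time_limit:
        return False                # significant TLE: keep the timeout
    return tries < max_tries        # slight TLE: run the job again

# With time_limit = 2.0 s the threshold is 2.6 s: a run taking 2.5 s is
# retried, while a run taking 2.7 s is reported as timed out at once.
assert should_retry(2.5, 2.0, tries=0) is True
assert should_retry(2.7, 2.0, tries=0) is False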
class Worker(Service):
    """This service implements the ability to compile and evaluate
    submissions in a sandbox. The instructions to follow for the
    operations are in the TaskType classes, while the sandbox is in
    the Sandbox module.

    """

    JOB_TYPE_COMPILATION = "compile"
    JOB_TYPE_EVALUATION = "evaluate"

    def __init__(self, shard, fake_worker_time=None):
        Service.__init__(self, shard)
        self.file_cacher = FileCacher(self)

        self.work_lock = gevent.lock.RLock()
        self._last_end_time = None
        self._total_free_time = 0
        self._total_busy_time = 0
        self._number_execution = 0

        # If set, jobs are not actually executed: _fake_work fills in
        # a successful result after sleeping for this many seconds.
        self._fake_worker_time = fake_worker_time

    @rpc_method
    def precache_files(self, contest_id):
        """RPC to ask the worker to precache the files of a contest.

        contest_id (int): the id of the contest.

        """
        # In order to avoid a long-living connection, first fetch the
        # complete list of files and then download them; since this is
        # just pre-caching, possible race conditions are not dangerous.
        logger.info("Precaching files for contest %d.", contest_id)
        with SessionGen() as session:
            contest = Contest.get_from_id(contest_id, session)
            files = enumerate_files(session, contest,
                                    skip_submissions=True,
                                    skip_user_tests=True,
                                    skip_print_jobs=True)
        for digest in files:
            try:
                self.file_cacher.load(digest, if_needed=True)
            except KeyError:
                # No problem (at this stage) if we cannot find the file.
                pass
        logger.info("Precaching finished.")

    @rpc_method
    def execute_job_group(self, job_group_dict):
        """Receive a group of jobs in dict format and execute them one
        by one.

        job_group_dict ({}): a JobGroup exported to dict.

        return ({}): the same JobGroup in dict format, but containing
            the results.

        """
        start_time = time.time()
        job_group = JobGroup.import_from_dict(job_group_dict)

        if self.work_lock.acquire(False):
            try:
                logger.info("Starting job group.")
                for job in job_group.jobs:
                    logger.info("Starting job.",
                                extra={"operation": job.info})

                    job.shard = self.shard

                    if self._fake_worker_time is None:
                        task_type = get_task_type(job.task_type,
                                                  job.task_type_parameters)
                        try:
                            task_type.execute_job(job, self.file_cacher)
                        except TombstoneError:
                            job.success = False
                            job.plus = {"tombstone": True}
                    else:
                        self._fake_work(job)

                    logger.info("Finished job.",
                                extra={"operation": job.info})

                logger.info("Finished job group.")
                return job_group.export_to_dict()
            except Exception as e:
                err_msg = "Worker failed: %s." % e
                logger.error(err_msg, exc_info=True)
                raise JobException(err_msg)
            finally:
                self._finalize(start_time)
                self.work_lock.release()
        else:
            err_msg = "Request received, but declined because of acquired " \
                "lock (Worker is busy executing another job; this should " \
                "not happen: check if there is more than one ES running, " \
                "or for bugs in ES)."
            logger.warning(err_msg)
            self._finalize(start_time)
            raise JobException(err_msg)

    def _fake_work(self, job):
        """Fill the job with fake success data after waiting for some
        time.

        """
        time.sleep(self._fake_worker_time)
        job.success = True
        job.text = ["ok"]
        job.plus = {
            "execution_time": self._fake_worker_time,
            "execution_wall_clock_time": self._fake_worker_time,
            "execution_memory": 1000,
        }
        if isinstance(job, CompilationJob):
            job.compilation_success = True
        elif isinstance(job, EvaluationJob):
            job.outcome = "1.0"

    def _finalize(self, start_time):
        """Update and log the busy/free time statistics of this worker.

        start_time (float): timestamp at which the last request started.

        """
        end_time = time.time()
        busy_time = end_time - start_time
        free_time = 0.0
        if self._last_end_time is not None:
            free_time = start_time - self._last_end_time
        self._last_end_time = end_time
        self._total_busy_time += busy_time
        self._total_free_time += free_time
        ratio = self._total_busy_time * 100.0 / \
            (self._total_busy_time + self._total_free_time)
        avg_free_time = 0.0
        if self._number_execution > 0:
            avg_free_time = self._total_free_time / self._number_execution
        avg_busy_time = 0.0
        if self._number_execution > 0:
            avg_busy_time = self._total_busy_time / self._number_execution
        self._number_execution += 1
        logger.info("Executed in %.3lf after free for %.3lf; "
                    "busyness is %.1lf%%; avg free time is %.3lf; "
                    "avg busy time is %.3lf.",
                    busy_time, free_time, ratio,
                    avg_free_time, avg_busy_time)
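# A worked example of the bookkeeping in _finalize above: busy time is
# measured per call, free time is the gap since the previous call
# ended, and busyness is the cumulative busy fraction. The numbers are
# illustrative, not taken from the source.
total_busy_time = 6.0   # e.g. two job groups of 3.0 s each
total_free_time = 2.0   # e.g. idle gaps of 0.5 s and 1.5 s
ratio = total_busy_time * 100.0 / (total_busy_time + total_free_time)
assert ratio == 75.0    # logged as "busyness is 75.0%"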
class Worker(Service):
    """This service implements the ability to compile and evaluate
    submissions in a sandbox. The instructions to follow for the
    operations are in the TaskType classes, while the sandbox is in
    the Sandbox module.

    """

    JOB_TYPE_COMPILATION = "compile"
    JOB_TYPE_EVALUATION = "evaluate"

    def __init__(self, shard):
        Service.__init__(self, shard)
        self.file_cacher = FileCacher(self)

        self.work_lock = gevent.coros.RLock()
        self._ignore_job = False

    @rpc_method
    def ignore_job(self):
        """RPC that informs the worker that its result for the current
        action will be discarded. The worker will try to return as soon
        as possible, even if this means that the results are
        inconsistent.

        """
        # We remember to quit as soon as possible.
        logger.info("Trying to interrupt job as requested.")
        self._ignore_job = True

    @rpc_method
    def precache_files(self, contest_id):
        """RPC to ask the worker to precache the files of a contest.

        contest_id (int): the id of the contest.

        """
        # A lock is not needed if the admins correctly placed the cache
        # and temp directories in the same filesystem. This is what
        # usually happens, since they are children of the same,
        # cms-created, directory.
        logger.info("Precaching files for contest %d." % contest_id)
        with SessionGen() as session:
            contest = Contest.get_from_id(contest_id, session)
            for digest in contest.enumerate_files(skip_submissions=True,
                                                  skip_user_tests=True):
                self.file_cacher.load(digest)
        logger.info("Precaching finished.")

    @rpc_method
    def execute_job_group(self, job_group_dict):
        """Receive a group of jobs in dict format and execute them one
        by one.

        job_group_dict (dict): a dictionary suitable to be imported
            from JobGroup.

        """
        job_group = JobGroup.import_from_dict(job_group_dict)

        if self.work_lock.acquire(False):
            try:
                self._ignore_job = False

                for k, job in job_group.jobs.iteritems():
                    logger.info("Starting job.",
                                extra={"operation": job.info})

                    job.shard = self.shard

                    # FIXME This is actually kind of a workaround...
                    # The only TaskType that needs it is OutputOnly.
                    job._key = k

                    # FIXME We're creating a new TaskType for each Job
                    # even if, at the moment, a JobGroup always uses
                    # the same TaskType and the same parameters. Yet,
                    # this could change in the future, so the best
                    # solution is to keep a cache of TaskType objects
                    # (like ScoringService does with ScoreTypes, except
                    # that we cannot index by Dataset ID here...).
                    task_type = get_task_type(job.task_type,
                                              job.task_type_parameters)
                    task_type.execute_job(job, self.file_cacher)

                    logger.info("Finished job.",
                                extra={"operation": job.info})

                    if not job.success or self._ignore_job:
                        job_group.success = False
                        break
                else:
                    job_group.success = True

                return job_group.export_to_dict()
            except Exception:
                err_msg = "Worker failed."
                logger.error(err_msg, exc_info=True)
                raise JobException(err_msg)
            finally:
                self.work_lock.release()
        else:
            err_msg = "Request received, but declined because of acquired " \
                "lock (Worker is busy executing another job group; this " \
                "should not happen: check if there is more than one ES " \
                "running, or for bugs in ES)."
            logger.warning(err_msg)
            raise JobException(err_msg)
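# A minimal sketch of how a caller might drive the precache_files RPC
# above across all Worker shards; `worker_proxies` is a hypothetical
# list of RPC proxies, not an API defined in this file.
def precache_everywhere(worker_proxies, contest_id):
    for proxy in worker_proxies:
        # Precaching is only an optimization: if a shard misses it,
        # the first jobs there are merely slower, not wrong.
        proxy.precache_files(contest_id=contest_id)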
class Worker(Service):
    """This service implements the ability to compile and evaluate
    submissions in a sandbox. The instructions to follow for the
    operations are in the TaskType classes, while the sandbox is in
    the Sandbox module.

    """

    JOB_TYPE_COMPILATION = "compile"
    JOB_TYPE_EVALUATION = "evaluate"

    def __init__(self, shard):
        Service.__init__(self, shard)
        self.file_cacher = FileCacher(self)

        self.work_lock = gevent.coros.RLock()
        self._last_end_time = None
        self._total_free_time = 0
        self._total_busy_time = 0
        self._number_execution = 0

    @rpc_method
    def precache_files(self, contest_id):
        """RPC to ask the worker to precache the files of a contest.

        contest_id (int): the id of the contest.

        """
        # In order to avoid a long-living connection, first fetch the
        # complete list of files and then download them; since this is
        # just pre-caching, possible race conditions are not dangerous.
        logger.info("Precaching files for contest %d.", contest_id)
        with SessionGen() as session:
            contest = Contest.get_from_id(contest_id, session)
            files = contest.enumerate_files(skip_submissions=True,
                                            skip_user_tests=True)
        for digest in files:
            try:
                self.file_cacher.load(digest, if_needed=True)
            except KeyError:
                # No problem (at this stage) if we cannot find the file.
                pass
        logger.info("Precaching finished.")

    @rpc_method
    def execute_job(self, job_dict):
        """Receive a job in dict format and execute it.

        job_dict (dict): a dictionary suitable to be imported from Job.

        """
        start_time = time.time()
        job = Job.import_from_dict_with_type(job_dict)

        if self.work_lock.acquire(False):
            try:
                logger.info("Starting job.", extra={"operation": job.info})
                job.shard = self.shard
                task_type = get_task_type(job.task_type,
                                          job.task_type_parameters)
                task_type.execute_job(job, self.file_cacher)
                logger.info("Finished job.", extra={"operation": job.info})
                return job.export_to_dict()
            except Exception:
                err_msg = "Worker failed."
                logger.error(err_msg, exc_info=True)
                raise JobException(err_msg)
            finally:
                self._finalize(start_time)
                self.work_lock.release()
        else:
            err_msg = "Request received, but declined because of acquired " \
                "lock (Worker is busy executing another job; this should " \
                "not happen: check if there is more than one ES running, " \
                "or for bugs in ES)."
            logger.warning(err_msg)
            self._finalize(start_time)
            raise JobException(err_msg)

    def _finalize(self, start_time):
        """Update and log the busy/free time statistics of this worker.

        start_time (float): timestamp at which the last request started.

        """
        end_time = time.time()
        busy_time = end_time - start_time
        free_time = 0.0
        if self._last_end_time is not None:
            free_time = start_time - self._last_end_time
        self._last_end_time = end_time
        self._total_busy_time += busy_time
        self._total_free_time += free_time
        ratio = self._total_busy_time * 100.0 / \
            (self._total_busy_time + self._total_free_time)
        avg_free_time = 0.0
        if self._number_execution > 0:
            avg_free_time = self._total_free_time / self._number_execution
        avg_busy_time = 0.0
        if self._number_execution > 0:
            avg_busy_time = self._total_busy_time / self._number_execution
        self._number_execution += 1
        logger.info("Executed in %.3lf after free for %.3lf; "
                    "busyness is %.1lf%%; avg free time is %.3lf; "
                    "avg busy time is %.3lf.",
                    busy_time, free_time, ratio,
                    avg_free_time, avg_busy_time)
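# A minimal sketch of the dict round-trip that execute_job above
# expects from its caller: export a Job to a plain dict, ship it over
# RPC, and re-import the returned dict to read the results. The
# `worker_proxy` argument is a hypothetical stand-in for the RPC
# plumbing; export_to_dict and import_from_dict_with_type are the
# methods used in the code above.
def run_job_remotely(worker_proxy, job):
    result_dict = worker_proxy.execute_job(job_dict=job.export_to_dict())
    return Job.import_from_dict_with_type(result_dict)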