def main():
    """Run one job through the job queue and exit once it is finalized.

    The debug level is read from the ``GNT_DEBUG`` environment variable and
    the job id and livelock are obtained from ``_GetMasterInfo()``.  The job
    is handed to ``context.jobqueue`` and polled once per second until the
    queue reports it as finalized; a SIGTERM received in the meantime is
    turned into a ``CancelJob`` request.  Afterwards the queue is shut down
    and the livelock file is removed.

    The process exits with status 0 when the whole sequence completed and
    with status 1 when an exception was caught (and logged) on the way.
    """
    debug_level = int(os.environ["GNT_DEBUG"])
    log_file = pathutils.GetLogFilename("jobs")

    # Logging is configured twice: first with a generic program name, then
    # again with the job id once that is known.
    utils.SetupLogging(log_file, "job-startup", debug=debug_level)
    job_id, livelock_name = _GetMasterInfo()
    utils.SetupLogging(log_file, "job-%s" % (job_id, ), debug=debug_level)

    # Pessimistic default; only flipped after a clean queue shutdown.
    exit_code = 1
    try:
        logging.debug("Preparing the context and the configuration")
        context = masterd.GanetiContext(livelock_name)

        logging.debug("Registering a SIGTERM handler")
        # One-element list so the nested handler can flip the flag in place.
        cancel_requested = [False]

        def _OnTerminate(signum, _frame):
            logging.info("Killed by signal %d", signum)
            cancel_requested[0] = True

        signal.signal(signal.SIGTERM, _OnTerminate)

        logging.debug("Picking up job %d", job_id)
        context.jobqueue.PickupJob(job_id)

        # Poll for completion: initial one-second pause, then one check per
        # second until the queue says the job is finalized.
        time.sleep(1)
        while True:
            if context.jobqueue.HasJobBeenFinalized(job_id):
                break
            if cancel_requested[0]:
                logging.debug("Got cancel request, cancelling job %d", job_id)
                outcome = context.jobqueue.CancelJob(job_id)
                logging.debug("CancelJob result for job %d: %s",
                              job_id, outcome)
                cancel_requested[0] = False
            time.sleep(1)

        # Keep asking the queue to prepare for shutdown until it stops
        # returning a true value, then shut it down for real.
        logging.debug("Waiting for the queue to finish")
        while context.jobqueue.PrepareShutdown():
            time.sleep(1)
        logging.debug("Shutting the queue down")
        context.jobqueue.Shutdown()
        exit_code = 0
    except Exception:  # pylint: disable=W0703
        logging.exception("Exception when trying to run job %d", job_id)
    finally:
        logging.debug("Job %d finalized", job_id)
        logging.debug("Removing livelock file %s", livelock_name.GetPath())
        os.remove(livelock_name.GetPath())

    sys.exit(exit_code)
def main():
    """Run the global post hooks for a job whose process has disappeared.

    The debug level is read from the ``GNT_DEBUG`` environment variable and
    the job id is obtained from ``_GetMasterInfo()``.  The job is loaded
    read-only from disk and the last opcode that has a start timestamp is
    selected.  If no opcode was ever started, the process exits immediately
    with status 0 and no hooks are run.  Otherwise a livelock and a Ganeti
    context are created, a detached configuration snapshot is taken, and
    ``hooksmaster.ExecGlobalPostHooks`` is invoked with the
    ``POST_HOOKS_STATUS_DISAPPEARED`` status.

    Any exception is logged, not propagated; the exit status is always 0.
    The livelock file is removed on exit, but only if a livelock was
    actually created.
    """
    debug = int(os.environ["GNT_DEBUG"])

    logname = pathutils.GetLogFilename("jobs")
    utils.SetupLogging(logname, "job-post-hooks-startup", debug=debug)
    job_id = _GetMasterInfo()
    utils.SetupLogging(logname, "job-%s-post-hooks" % (job_id, ), debug=debug)

    # Bound up front so the cleanup in ``finally`` can tell whether a
    # livelock exists.  Without this, the early ``sys.exit(0)`` below (or a
    # failure while loading the job) made the ``finally`` clause raise
    # UnboundLocalError, replacing the intended exit.
    livelock_name = None
    try:
        job = JobQueue.SafeLoadJobFromDisk(None, job_id, try_archived=False,
                                           writable=False)
        assert job.id == job_id, (
            "The job id received %d differs from the serialized one %d"
            % (job_id, job.id))

        # Pick the last opcode that has a start timestamp; stop at the
        # first one without it.
        target_op = None
        for op in job.ops:
            if op.start_timestamp is None:
                break
            target_op = op

        # We should run post hooks only if opcode execution has been
        # started.  Note that currently the opcodes inside a job execute
        # sequentially.
        if target_op is None:
            sys.exit(0)

        livelock_name = livelock.LiveLockName(
            "post-hooks-executor-%d" % job_id)
        context = masterd.GanetiContext(livelock_name)
        cfg_tmp = context.GetConfig(job_id)
        # Get static snapshot of the config and release it in order to
        # prevent further synchronizations.
        cfg = cfg_tmp.GetDetachedConfig()
        cfg_tmp.OutDate()

        hooksmaster.ExecGlobalPostHooks(
            target_op.input.OP_ID,
            cfg.GetMasterNodeName(),
            context.GetRpc(cfg).call_hooks_runner,
            logging.warning,
            cfg.GetClusterName(),
            cfg.GetMasterNode(),
            job_id,
            constants.POST_HOOKS_STATUS_DISAPPEARED)
    except Exception:  # pylint: disable=W0703
        logging.exception("Exception when trying to run post hooks of job %d",
                          job_id)
    finally:
        logging.debug("Post hooks exec for disappeared job %d finalized",
                      job_id)
        # Only clean up a livelock that was actually created.
        if livelock_name is not None:
            logging.debug("Removing livelock file %s",
                          livelock_name.GetPath())
            os.remove(livelock_name.GetPath())

    sys.exit(0)
def main():
    """Process one job inside this process until it is finished.

    The debug level is read from the ``GNT_DEBUG`` environment variable.
    ``_SetupJob()`` supplies the job id, the livelock and, optionally,
    serialized secret parameters; when present these are deserialized and
    re-wrapped, then copied into the ``osparams_secret`` attribute of the
    corresponding opcode inputs that have one.

    The job is loaded from disk, tagged with this process' pid and driven
    through a ``_JobProcessor`` until it returns ``FINISHED``.  Signals are
    handled as follows:

    - SIGTERM requests cancellation of the job,
    - SIGHUP sets ``mcpu.sighupReceived[0]``,
    - SIGUSR1 announces a priority change; the new priority is read from
      ``<LUXID_MESSAGE_DIR>/<job_id>.prio``.

    After a cancel or a priority change the job is reloaded from disk and a
    new processor is created for it.  Exceptions are logged, the livelock
    file is removed, and the process always exits with status 0.
    """
    debug_level = int(os.environ["GNT_DEBUG"])
    log_file = pathutils.GetLogFilename("jobs")
    utils.SetupLogging(log_file, "job-startup", debug=debug_level)

    job_id, llock, secret_params_serialized = _SetupJob()

    if secret_params_serialized:
        secret_params = RestorePrivateValueWrapping(
            serializer.LoadJson(secret_params_serialized))
    else:
        secret_params = ""

    utils.SetupLogging(log_file, "job-%s" % (job_id,), debug=debug_level)

    try:
        logging.debug("Preparing the context and the configuration")
        context = masterd.GanetiContext(llock)

        logging.debug("Registering signal handlers")
        # One-element lists so the nested handlers can flip the flags.
        cancel_requested = [False]
        prio_change_requested = [False]

        def _OnTerminate(signum, _frame):
            logging.info("Killed by signal %d", signum)
            cancel_requested[0] = True

        def _OnHangup(signum, _frame):
            logging.debug(
                "Received signal %d, old flag was %s, will set to True",
                signum, mcpu.sighupReceived)
            mcpu.sighupReceived[0] = True

        def _OnUser1(signum, _frame):
            logging.info("Received signal %d, indicating priority change",
                         signum)
            prio_change_requested[0] = True

        for sig, handler in ((signal.SIGTERM, _OnTerminate),
                             (signal.SIGHUP, _OnHangup),
                             (signal.SIGUSR1, _OnUser1)):
            signal.signal(sig, handler)

        job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
        job.SetPid(os.getpid())

        # Secret parameters are matched to opcodes by position.
        if secret_params:
            for idx in range(len(secret_params)):
                op_input = job.ops[idx].input
                if hasattr(op_input, "osparams_secret"):
                    op_input.osparams_secret = secret_params[idx]

        execfun = mcpu.Processor(context, job_id, job_id).ExecOpCode
        proc = _JobProcessor(context.jobqueue, execfun, job)

        result = _JobProcessor.DEFER
        while result != _JobProcessor.FINISHED:
            result = proc()

            if result == _JobProcessor.WAITDEP and not cancel_requested[0]:
                # The scheduler is expected not to start a job whose
                # dependencies are not finalised yet; if it happens anyway,
                # warn, wait a bit and continue.
                logging.warning(
                    "Got started despite a dependency not yet finished")
                time.sleep(5)

            if cancel_requested[0]:
                logging.debug("Got cancel request, cancelling job %d", job_id)
                outcome = context.jobqueue.CancelJob(job_id)
                # Reload the job and start over with a fresh processor.
                job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
                proc = _JobProcessor(context.jobqueue, execfun, job)
                logging.debug("CancelJob result for job %d: %s",
                              job_id, outcome)
                cancel_requested[0] = False

            if prio_change_requested[0]:
                logging.debug("Received priority-change request")
                try:
                    prio_file = os.path.join(pathutils.LUXID_MESSAGE_DIR,
                                             "%d.prio" % job_id)
                    new_prio = int(utils.ReadFile(prio_file))
                    utils.RemoveFile(prio_file)
                    logging.debug("Changing priority of job %d to %d",
                                  job_id, new_prio)
                    outcome = context.jobqueue.ChangeJobPriority(job_id,
                                                                 new_prio)
                    # Reload the job and start over with a fresh processor.
                    job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
                    proc = _JobProcessor(context.jobqueue, execfun, job)
                    logging.debug(
                        "Result of changing priority of %d to %d: %s",
                        job_id, new_prio, outcome)
                except Exception:  # pylint: disable=W0703
                    # Best effort: a failed priority change must not abort
                    # the job itself.
                    logging.warning("Informed of priority change, but could"
                                    " not read new priority")
                prio_change_requested[0] = False
    except Exception:  # pylint: disable=W0703
        logging.exception("Exception when trying to run job %d", job_id)
    finally:
        logging.debug("Job %d finalized", job_id)
        logging.debug("Removing livelock file %s", llock.GetPath())
        os.remove(llock.GetPath())

    sys.exit(0)
def main():
    """Run one job through the job queue, honouring control signals.

    The debug level is read from the ``GNT_DEBUG`` environment variable and
    the job id and livelock are obtained from ``_GetMasterInfo()``.  The job
    is handed to ``context.jobqueue`` and polled once per second until the
    queue reports it as finalized.  While waiting, signals are handled as
    follows:

    - SIGTERM requests cancellation of the job,
    - SIGHUP sets ``mcpu.sighupReceived[0]``,
    - SIGUSR1 announces a priority change; the new priority is read from
      ``<LUXID_MESSAGE_DIR>/<job_id>.prio``.

    Afterwards the queue is shut down and the livelock file is removed.
    The process exits with status 0 when the whole sequence completed and
    with status 1 when an exception was caught (and logged) on the way.
    """
    debug_level = int(os.environ["GNT_DEBUG"])
    log_file = pathutils.GetLogFilename("jobs")

    # Logging is configured twice: first with a generic program name, then
    # again with the job id once that is known.
    utils.SetupLogging(log_file, "job-startup", debug=debug_level)
    job_id, livelock_name = _GetMasterInfo()
    utils.SetupLogging(log_file, "job-%s" % (job_id,), debug=debug_level)

    # Pessimistic default; only flipped after a clean queue shutdown.
    exit_code = 1
    try:
        logging.debug("Preparing the context and the configuration")
        context = masterd.GanetiContext(livelock_name)

        logging.debug("Registering signal handlers")
        # One-element lists so the nested handlers can flip the flags.
        cancel_requested = [False]
        prio_change_requested = [False]

        def _OnTerminate(signum, _frame):
            logging.info("Killed by signal %d", signum)
            cancel_requested[0] = True

        def _OnHangup(signum, _frame):
            logging.debug(
                "Received signal %d, old flag was %s, will set to True",
                signum, mcpu.sighupReceived)
            mcpu.sighupReceived[0] = True

        def _OnUser1(signum, _frame):
            logging.info("Received signal %d, indicating priority change",
                         signum)
            prio_change_requested[0] = True

        for sig, handler in ((signal.SIGTERM, _OnTerminate),
                             (signal.SIGHUP, _OnHangup),
                             (signal.SIGUSR1, _OnUser1)):
            signal.signal(sig, handler)

        logging.debug("Picking up job %d", job_id)
        context.jobqueue.PickupJob(job_id)

        # Poll for completion: initial one-second pause, then one check per
        # second until the queue says the job is finalized.
        time.sleep(1)
        while True:
            if context.jobqueue.HasJobBeenFinalized(job_id):
                break

            if cancel_requested[0]:
                logging.debug("Got cancel request, cancelling job %d", job_id)
                outcome = context.jobqueue.CancelJob(job_id)
                logging.debug("CancelJob result for job %d: %s",
                              job_id, outcome)
                cancel_requested[0] = False

            if prio_change_requested[0]:
                logging.debug("Received priority-change request")
                try:
                    prio_file = os.path.join(pathutils.LUXID_MESSAGE_DIR,
                                             "%d.prio" % job_id)
                    new_prio = int(utils.ReadFile(prio_file))
                    utils.RemoveFile(prio_file)
                    logging.debug("Changing priority of job %d to %d",
                                  job_id, new_prio)
                    outcome = context.jobqueue.ChangeJobPriority(job_id,
                                                                 new_prio)
                    logging.debug(
                        "Result of changing priority of %d to %d: %s",
                        job_id, new_prio, outcome)
                except Exception:  # pylint: disable=W0703
                    # Best effort: a failed priority change must not abort
                    # the job itself.
                    logging.warning("Informed of priority change, but could"
                                    " not read new priority")
                prio_change_requested[0] = False

            time.sleep(1)

        # Keep asking the queue to prepare for shutdown until it stops
        # returning a true value, then shut it down for real.
        logging.debug("Waiting for the queue to finish")
        while context.jobqueue.PrepareShutdown():
            time.sleep(1)
        logging.debug("Shutting the queue down")
        context.jobqueue.Shutdown()
        exit_code = 0
    except Exception:  # pylint: disable=W0703
        logging.exception("Exception when trying to run job %d", job_id)
    finally:
        logging.debug("Job %d finalized", job_id)
        logging.debug("Removing livelock file %s", livelock_name.GetPath())
        os.remove(livelock_name.GetPath())

    sys.exit(exit_code)