def testCancelWhileInQueue(self):
  queue = _FakeQueueForProc()

  ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
         for i in range(5)]

  # Create job
  job_id = 17045
  job = self._CreateJob(queue, job_id, ops)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

  # Mark as cancelled
  (success, _) = job.Cancel()
  self.assert_(success)

  self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELED
                          for op in job.ops))

  opexec = _FakeExecOpCodeForProc(None, None)
  jqueue._JobProcessor(queue, opexec, job)()

  # Check result
  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
  self.assertFalse(job.start_timestamp)
  self.assert_(job.end_timestamp)
  self.assertFalse(compat.any(op.start_timestamp or op.end_timestamp
                              for op in job.ops))
  self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
                   [[constants.OP_STATUS_CANCELED for _ in job.ops],
                    ["Job canceled by request" for _ in job.ops]])
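
# The tests in this excerpt rely on test doubles that are defined elsewhere in
# the test module (_FakeQueueForProc, _FakeExecOpCodeForProc, self._CreateJob).
# The class below is only an illustrative sketch of the queue interface the
# tests appear to assume (update recording via GetNextUpdate, submitted-job
# recording via GetNextSubmittedJob, lock bookkeeping via IsAcquired, and
# submitted job IDs starting at 1000); it is NOT the project's actual helper
# and all method bodies are assumptions. _FakeExecOpCodeForProc, which wraps
# the _BeforeStart/_AfterStart callbacks around executing each OpTestDummy, is
# likewise assumed and not sketched here.
class _SketchFakeQueue(object):
  def __init__(self):
    self._updates = []        # (job, replicate) tuples recorded by the queue
    self._submitted = []      # (job_id, ops) tuples from SubmitManyJobs
    self._next_job_id = 1000  # tests expect submitted job IDs from 1000 on
    self._acquired = False

  def IsAcquired(self):
    return self._acquired

  def UpdateJobUnlocked(self, job, replicate=True):
    # Record the update; GetNextUpdate raises IndexError once drained, which
    # is what the assertRaises(IndexError, queue.GetNextUpdate) checks rely on
    self._updates.append((job, replicate))

  def GetNextUpdate(self):
    return self._updates.pop(0)

  def SubmitManyJobs(self, jobs):
    # Assign sequential job IDs and remember what was submitted
    job_ids = []
    for job_ops in jobs:
      self._submitted.append((self._next_job_id, job_ops))
      job_ids.append(self._next_job_id)
      self._next_job_id += 1
    return job_ids

  def GetNextSubmittedJob(self):
    return self._submitted.pop(0)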
def _TestPartial(self, job, successcount):
  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
  self.assertEqual(job.start_timestamp, job.ops[0].start_timestamp)

  queue = _FakeQueueForProc()
  opexec = _FakeExecOpCodeForProc(None, None)

  for remaining in reversed(range(len(job.ops) - successcount)):
    result = jqueue._JobProcessor(queue, opexec, job)()

    if remaining == 0:
      # Last opcode
      self.assert_(result)
      break

    self.assertFalse(result)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
  self.assertEqual(job.GetInfo(["opresult"]),
                   [[op.input.result for op in job.ops]])
  self.assertEqual(job.GetInfo(["opstatus"]),
                   [[constants.OP_STATUS_SUCCESS for _ in job.ops]])
  self.assert_(compat.all(op.start_timestamp and op.end_timestamp
                          for op in job.ops))

  self._GenericCheckJob(job)

  # Finished jobs can't be processed any further
  self.assertRaises(errors.ProgrammerError,
                    jqueue._JobProcessor(queue, opexec, job))

  # ... also after being restored
  job2 = jqueue._QueuedJob.Restore(queue, job.Serialize())
  self.assertRaises(errors.ProgrammerError,
                    jqueue._JobProcessor(queue, opexec, job2))
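
# _GenericCheckJob is referenced by several tests here but not defined in this
# excerpt. The helper below is a hypothetical stand-in, not the project's
# actual implementation: it only re-checks an invariant already used elsewhere
# in these tests, namely that a job survives a Serialize/Restore round trip
# unchanged. The name and the extra queue parameter are assumptions.
def _SketchGenericCheckJob(self, queue, job):
  restored = jqueue._QueuedJob.Restore(queue, job.Serialize())
  self.assertEqual(restored.Serialize(), job.Serialize())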
def testOpcodeError(self):
  queue = _FakeQueueForProc()

  testdata = [
    (17077, 1, 0, 0),
    (1782, 5, 2, 2),
    (18179, 10, 9, 9),
    (4744, 10, 3, 8),
    (23816, 100, 39, 45),
    ]

  for (job_id, opcount, failfrom, failto) in testdata:
    # Prepare opcodes
    ops = [opcodes.OpTestDummy(result="Res%s" % i,
                               fail=(failfrom <= i and i <= failto))
           for i in range(opcount)]

    # Create job
    job = self._CreateJob(queue, job_id, ops)

    opexec = _FakeExecOpCodeForProc(None, None)

    for idx in range(len(ops)):
      result = jqueue._JobProcessor(queue, opexec, job)()

      if idx in (failfrom, len(ops) - 1):
        # Last opcode
        self.assert_(result)
        break

      self.assertFalse(result)

      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

    # Check job status
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_ERROR)
    self.assertEqual(job.GetInfo(["id"]), [job_id])
    self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_ERROR])

    # Check opcode status
    data = zip(job.ops,
               job.GetInfo(["opstatus"])[0],
               job.GetInfo(["opresult"])[0])

    for idx, (op, opstatus, opresult) in enumerate(data):
      if idx < failfrom:
        assert not op.input.fail
        self.assertEqual(opstatus, constants.OP_STATUS_SUCCESS)
        self.assertEqual(opresult, op.input.result)
      elif idx <= failto:
        assert op.input.fail
        self.assertEqual(opstatus, constants.OP_STATUS_ERROR)
        self.assertRaises(errors.OpExecError, errors.MaybeRaise, opresult)
      else:
        assert not op.input.fail
        self.assertEqual(opstatus, constants.OP_STATUS_ERROR)
        self.assertRaises(errors.OpExecError, errors.MaybeRaise, opresult)

    self.assert_(compat.all(op.start_timestamp and op.end_timestamp
                            for op in job.ops[:failfrom]))

    self._GenericCheckJob(job)

    # Finished jobs can't be processed any further
    self.assertRaises(errors.ProgrammerError,
                      jqueue._JobProcessor(queue, opexec, job))
def testTimeout(self):
  ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
         for i in range(10)]

  # Create job
  job_id = 15801
  job = self._CreateJob(self.queue, job_id, ops)
  self.job = job

  self.opcounter = itertools.count(0)

  opexec = _FakeExecOpCodeForProc(self._BeforeStart, self._AfterStart)
  tsf = self._NewTimeoutStrategy

  self.assertFalse(self.done_lock_before_blocking)

  for i in itertools.count(0):
    proc = jqueue._JobProcessor(self.queue, opexec, job,
                                _timeout_strategy_factory=tsf)

    result = proc(_nextop_fn=self._NextOpcode)
    if result:
      self.assertFalse(job.cur_opctx)
      break

    self.assertFalse(result)

    if self.gave_lock:
      self.assertFalse(job.cur_opctx)
    else:
      self.assert_(job.cur_opctx)
      self.assertEqual(job.cur_opctx._timeout_strategy._fn,
                       self.timeout_strategy.NextAttempt)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

    self.assert_(job.start_timestamp)
    self.assertFalse(job.end_timestamp)

  self.assertEqual(self.curop, len(job.ops) - 1)
  self.assertEqual(self.job, job)
  self.assertEqual(self.opcounter.next(), len(job.ops))
  self.assert_(self.done_lock_before_blocking)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
  self.assertEqual(job.GetInfo(["opresult"]),
                   [[op.input.result for op in job.ops]])
  self.assertEqual(job.GetInfo(["opstatus"]),
                   [len(job.ops) * [constants.OP_STATUS_SUCCESS]])
  self.assert_(compat.all(op.start_timestamp and op.end_timestamp
                          for op in job.ops))

  # Finished jobs can't be processed any further
  self.assertRaises(errors.ProgrammerError,
                    jqueue._JobProcessor(self.queue, opexec, job))
def testPartiallyRun(self):
  # Tests calling the processor on a job that's been partially run before the
  # program was restarted
  queue = _FakeQueueForProc()
  opexec = _FakeExecOpCodeForProc(queue, None, None)

  for job_id, successcount in [(30697, 1), (2552, 4), (12489, 9)]:
    ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
           for i in range(10)]

    # Create job
    job = self._CreateJob(queue, job_id, ops)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

    for _ in range(successcount):
      self.assertFalse(jqueue._JobProcessor(queue, opexec, job)())

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
    self.assertEqual(job.GetInfo(["opstatus"]),
                     [[constants.OP_STATUS_SUCCESS
                       for _ in range(successcount)] +
                      [constants.OP_STATUS_QUEUED
                       for _ in range(len(ops) - successcount)]])

    self.assert_(job.ops_iter)

    # Serialize and restore (simulates program restart)
    newjob = jqueue._QueuedJob.Restore(queue, job.Serialize())
    self.assertFalse(newjob.ops_iter)
    self._TestPartial(newjob, successcount)
def testCancelWhileRunning(self):
  # Tests canceling a job with finished opcodes and more, unprocessed ones
  queue = _FakeQueueForProc()

  ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
         for i in range(3)]

  # Create job
  job_id = 28492
  job = self._CreateJob(queue, job_id, ops)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

  opexec = _FakeExecOpCodeForProc(None, None)

  # Run one opcode
  self.assertFalse(jqueue._JobProcessor(queue, opexec, job)())

  # Job goes back to queued
  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
  self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
                   [[constants.OP_STATUS_SUCCESS,
                     constants.OP_STATUS_QUEUED,
                     constants.OP_STATUS_QUEUED],
                    ["Res0", None, None]])

  # Mark as cancelled
  (success, _) = job.Cancel()
  self.assert_(success)

  # Try processing another opcode (this will actually cancel the job)
  self.assert_(jqueue._JobProcessor(queue, opexec, job)())

  # Check result
  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
  self.assertEqual(job.GetInfo(["id"]), [job_id])
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
  self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
                   [[constants.OP_STATUS_SUCCESS,
                     constants.OP_STATUS_CANCELED,
                     constants.OP_STATUS_CANCELED],
                    ["Res0", "Job canceled by request",
                     "Job canceled by request"]])
def testSuccess(self):
  queue = _FakeQueueForProc()

  for (job_id, opcount) in [(25351, 1), (6637, 3),
                            (24644, 10), (32207, 100)]:
    ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
           for i in range(opcount)]

    # Create job
    job = self._CreateJob(queue, job_id, ops)

    def _BeforeStart(timeout, priority):
      self.assertFalse(queue.IsAcquired())
      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

    def _AfterStart(op, cbs):
      self.assertFalse(queue.IsAcquired())
      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)

      # Job is running, cancelling shouldn't be possible
      (success, _) = job.Cancel()
      self.assertFalse(success)

    opexec = _FakeExecOpCodeForProc(_BeforeStart, _AfterStart)

    for idx in range(len(ops)):
      result = jqueue._JobProcessor(queue, opexec, job)()

      if idx == len(ops) - 1:
        # Last opcode
        self.assert_(result)
      else:
        self.assertFalse(result)

        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
        self.assert_(job.start_timestamp)
        self.assertFalse(job.end_timestamp)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
    self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
    self.assertEqual(job.GetInfo(["opresult"]),
                     [[op.input.result for op in job.ops]])
    self.assertEqual(job.GetInfo(["opstatus"]),
                     [len(job.ops) * [constants.OP_STATUS_SUCCESS]])
    self.assert_(compat.all(op.start_timestamp and op.end_timestamp
                            for op in job.ops))

    self._GenericCheckJob(job)

    # Finished jobs can't be processed any further
    self.assertRaises(errors.ProgrammerError,
                      jqueue._JobProcessor(queue, opexec, job))
def testCancelWhileWaitlock(self):
  queue = _FakeQueueForProc()

  ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
         for i in range(5)]

  # Create job
  job_id = 11009
  job = self._CreateJob(queue, job_id, ops)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

  def _BeforeStart(timeout, priority):
    self.assertFalse(queue.IsAcquired())
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

    # Mark as cancelled
    (success, _) = job.Cancel()
    self.assert_(success)

    self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELING
                            for op in job.ops))

  def _AfterStart(op, cbs):
    self.assertFalse(queue.IsAcquired())
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)

  opexec = _FakeExecOpCodeForProc(_BeforeStart, _AfterStart)

  jqueue._JobProcessor(queue, opexec, job)()

  # Check result
  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
  self.assert_(job.start_timestamp)
  self.assert_(job.end_timestamp)
  self.assertFalse(compat.all(op.start_timestamp and op.end_timestamp
                              for op in job.ops))
  self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
                   [[constants.OP_STATUS_CANCELED for _ in job.ops],
                    ["Job canceled by request" for _ in job.ops]])
def testCancelWhileInQueue(self):
  queue = _FakeQueueForProc()

  ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
         for i in range(5)]

  # Create job
  job_id = 17045
  job = self._CreateJob(queue, job_id, ops)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

  # Mark as cancelled
  (success, _) = job.Cancel()
  self.assert_(success)

  self.assertRaises(IndexError, queue.GetNextUpdate)

  self.assertFalse(job.start_timestamp)
  self.assertTrue(job.end_timestamp)
  self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELED
                          for op in job.ops))

  # Serialize to check for differences
  before_proc = job.Serialize()

  # Simulate processor called in workerpool
  opexec = _FakeExecOpCodeForProc(queue, None, None)
  self.assert_(jqueue._JobProcessor(queue, opexec, job)())

  # Check result
  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
  self.assertFalse(job.start_timestamp)
  self.assertTrue(job.end_timestamp)
  self.assertFalse(compat.any(op.start_timestamp or op.end_timestamp
                              for op in job.ops))
  self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
                   [[constants.OP_STATUS_CANCELED for _ in job.ops],
                    ["Job canceled by request" for _ in job.ops]])

  # Must not have changed or written
  self.assertEqual(before_proc, job.Serialize())
  self.assertRaises(IndexError, queue.GetNextUpdate)
def testProcessorOnRunningJob(self):
  ops = [opcodes.OpTestDummy(result="result", fail=False)]

  queue = _FakeQueueForProc()
  opexec = _FakeExecOpCodeForProc(None, None)

  # Create job
  job = self._CreateJob(queue, 9571, ops)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

  job.ops[0].status = constants.OP_STATUS_RUNNING

  assert len(job.ops) == 1

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)

  # Calling on running job must fail
  self.assertRaises(errors.ProgrammerError,
                    jqueue._JobProcessor(queue, opexec, job))
def testCancelWhileWaitlockInQueue(self):
  queue = _FakeQueueForProc()

  ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
         for i in range(5)]

  # Create job
  job_id = 8645
  job = self._CreateJob(queue, job_id, ops)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

  job.ops[0].status = constants.OP_STATUS_WAITLOCK

  assert len(job.ops) == 5

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

  # Mark as cancelling
  (success, _) = job.Cancel()
  self.assert_(success)

  self.assertRaises(IndexError, queue.GetNextUpdate)

  self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELING
                          for op in job.ops))

  opexec = _FakeExecOpCodeForProc(queue, None, None)
  self.assert_(jqueue._JobProcessor(queue, opexec, job)())

  # Check result
  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
  self.assertFalse(job.start_timestamp)
  self.assert_(job.end_timestamp)
  self.assertFalse(compat.any(op.start_timestamp or op.end_timestamp
                              for op in job.ops))
  self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
                   [[constants.OP_STATUS_CANCELED for _ in job.ops],
                    ["Job canceled by request" for _ in job.ops]])
def testLogMessages(self):
  # Tests the "Feedback" callback function
  queue = _FakeQueueForProc()

  messages = {
    1: [
      (None, "Hello"),
      (None, "World"),
      (constants.ELOG_MESSAGE, "there"),
      ],
    4: [
      (constants.ELOG_JQUEUE_TEST, (1, 2, 3)),
      (constants.ELOG_JQUEUE_TEST, ("other", "type")),
      ],
    }
  ops = [opcodes.OpTestDummy(result="Logtest%s" % i, fail=False,
                             messages=messages.get(i, []))
         for i in range(5)]

  # Create job
  job = self._CreateJob(queue, 29386, ops)

  def _BeforeStart(timeout, priority):
    self.assertEqual(queue.GetNextUpdate(), (job, True))
    self.assertRaises(IndexError, queue.GetNextUpdate)
    self.assertFalse(queue.IsAcquired())
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

  def _AfterStart(op, cbs):
    self.assertEqual(queue.GetNextUpdate(), (job, True))
    self.assertRaises(IndexError, queue.GetNextUpdate)
    self.assertFalse(queue.IsAcquired())
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)

    self.assertRaises(AssertionError, cbs.Feedback,
                      "too", "many", "arguments")

    for (log_type, msg) in op.messages:
      self.assertRaises(IndexError, queue.GetNextUpdate)

      if log_type:
        cbs.Feedback(log_type, msg)
      else:
        cbs.Feedback(msg)

      # Check for job update without replication
      self.assertEqual(queue.GetNextUpdate(), (job, False))
      self.assertRaises(IndexError, queue.GetNextUpdate)

  opexec = _FakeExecOpCodeForProc(queue, _BeforeStart, _AfterStart)

  for remaining in reversed(range(len(job.ops))):
    self.assertRaises(IndexError, queue.GetNextUpdate)
    result = jqueue._JobProcessor(queue, opexec, job)()
    self.assertEqual(queue.GetNextUpdate(), (job, True))
    self.assertRaises(IndexError, queue.GetNextUpdate)

    if remaining == 0:
      # Last opcode
      self.assert_(result)
      break

    self.assertFalse(result)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

  self.assertRaises(IndexError, queue.GetNextUpdate)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
  self.assertEqual(job.GetInfo(["opresult"]),
                   [[op.input.result for op in job.ops]])

  logmsgcount = sum(len(m) for m in messages.values())

  self._CheckLogMessages(job, logmsgcount)

  # Serialize and restore (simulates program restart)
  newjob = jqueue._QueuedJob.Restore(queue, job.Serialize())
  self._CheckLogMessages(newjob, logmsgcount)

  # Check each message
  prevserial = -1
  for idx, oplog in enumerate(job.GetInfo(["oplog"])[0]):
    for (serial, timestamp, log_type, msg) in oplog:
      (exptype, expmsg) = messages.get(idx).pop(0)

      if exptype:
        self.assertEqual(log_type, exptype)
      else:
        self.assertEqual(log_type, constants.ELOG_MESSAGE)

      self.assertEqual(expmsg, msg)
      self.assert_(serial > prevserial)
      prevserial = serial
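
# _CheckLogMessages is referenced above but not defined in this excerpt. A
# hypothetical equivalent, restricted to the GetInfo(["oplog"]) view that the
# "Check each message" block above already uses, could simply count all
# serialized log entries and verify their serials are strictly increasing.
# Name and body are assumptions, not the project's actual helper.
def _SketchCheckLogMessages(self, job, count):
  entries = [entry
             for oplog in job.GetInfo(["oplog"])[0]
             for entry in oplog]
  self.assertEqual(len(entries), count)

  # Log serials must be unique and monotonically increasing
  serials = [entry[0] for entry in entries]
  self.assertEqual(serials, sorted(serials))
  self.assertEqual(len(serials), len(set(serials)))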
def testTimeout(self):
  ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
         for i in range(10)]

  # Create job
  job_id = 15801
  job = self._CreateJob(self.queue, job_id, ops)
  self.job = job

  self.opcounter = itertools.count(0)

  opexec = _FakeExecOpCodeForProc(self.queue, self._BeforeStart,
                                  self._AfterStart)
  tsf = self._NewTimeoutStrategy

  self.assertFalse(self.done_lock_before_blocking)

  while True:
    proc = jqueue._JobProcessor(self.queue, opexec, job,
                                _timeout_strategy_factory=tsf)

    self.assertRaises(IndexError, self.queue.GetNextUpdate)

    if self.curop is not None:
      self.prev_status = self.job.ops[self.curop].status

    self.lock_acq_prio = None

    result = proc(_nextop_fn=self._NextOpcode)
    assert self.curop is not None

    if result or self.gave_lock:
      # Got lock and/or job is done, result must've been written
      self.assertFalse(job.cur_opctx)
      self.assertEqual(self.queue.GetNextUpdate(), (job, True))
      self.assertRaises(IndexError, self.queue.GetNextUpdate)

      self.assertEqual(self.lock_acq_prio, job.ops[self.curop].priority)

      self.assert_(job.ops[self.curop].exec_timestamp)

    if result:
      self.assertFalse(job.cur_opctx)
      break

    self.assertFalse(result)

    if self.curop == 0:
      self.assertEqual(job.ops[self.curop].start_timestamp,
                       job.start_timestamp)

    if self.gave_lock:
      # Opcode finished, but job not yet done
      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
    else:
      # Did not get locks
      self.assert_(job.cur_opctx)
      self.assertEqual(job.cur_opctx._timeout_strategy._fn,
                       self.timeout_strategy.NextAttempt)
      self.assertFalse(job.ops[self.curop].exec_timestamp)

      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

    # If priority has changed since acquiring locks, the job must've been
    # updated
    if self.lock_acq_prio != job.ops[self.curop].priority:
      self.assertEqual(self.queue.GetNextUpdate(), (job, True))

    self.assertRaises(IndexError, self.queue.GetNextUpdate)

    self.assert_(job.start_timestamp)
    self.assertFalse(job.end_timestamp)

  self.assertEqual(self.curop, len(job.ops) - 1)
  self.assertEqual(self.job, job)
  self.assertEqual(self.opcounter.next(), len(job.ops))
  self.assert_(self.done_lock_before_blocking)

  self.assertRaises(IndexError, self.queue.GetNextUpdate)
  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
  self.assertEqual(job.GetInfo(["opresult"]),
                   [[op.input.result for op in job.ops]])
  self.assertEqual(job.GetInfo(["opstatus"]),
                   [len(job.ops) * [constants.OP_STATUS_SUCCESS]])
  self.assert_(compat.all(op.start_timestamp and op.end_timestamp
                          for op in job.ops))

  # Calling the processor on a finished job should be a no-op
  self.assertTrue(jqueue._JobProcessor(self.queue, opexec, job)())
  self.assertRaises(IndexError, self.queue.GetNextUpdate)
def testSubmitManyJobs(self):
  queue = _FakeQueueForProc()

  job_id = 15656
  ops = [
    opcodes.OpTestDummy(result="Res0", fail=False,
                        submit_jobs=[]),
    opcodes.OpTestDummy(result="Res1", fail=False,
                        submit_jobs=[
                          [opcodes.OpTestDummy(result="r1j0", fail=False)],
                          ]),
    opcodes.OpTestDummy(result="Res2", fail=False,
                        submit_jobs=[
                          [opcodes.OpTestDummy(result="r2j0o0", fail=False),
                           opcodes.OpTestDummy(result="r2j0o1", fail=False),
                           opcodes.OpTestDummy(result="r2j0o2", fail=False),
                           opcodes.OpTestDummy(result="r2j0o3", fail=False)],
                          [opcodes.OpTestDummy(result="r2j1", fail=False)],
                          [opcodes.OpTestDummy(result="r2j3o0", fail=False),
                           opcodes.OpTestDummy(result="r2j3o1", fail=False)],
                          ]),
    ]

  # Create job
  job = self._CreateJob(queue, job_id, ops)

  def _BeforeStart(timeout, priority):
    self.assertEqual(queue.GetNextUpdate(), (job, True))
    self.assertRaises(IndexError, queue.GetNextUpdate)
    self.assertFalse(queue.IsAcquired())
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)
    self.assertFalse(job.cur_opctx)

  def _AfterStart(op, cbs):
    self.assertEqual(queue.GetNextUpdate(), (job, True))
    self.assertRaises(IndexError, queue.GetNextUpdate)
    self.assertFalse(queue.IsAcquired())
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)
    self.assertFalse(job.cur_opctx)

    # Job is running, cancelling shouldn't be possible
    (success, _) = job.Cancel()
    self.assertFalse(success)

  opexec = _FakeExecOpCodeForProc(queue, _BeforeStart, _AfterStart)

  for idx in range(len(ops)):
    self.assertRaises(IndexError, queue.GetNextUpdate)
    result = jqueue._JobProcessor(queue, opexec, job)()
    self.assertEqual(queue.GetNextUpdate(), (job, True))
    self.assertRaises(IndexError, queue.GetNextUpdate)

    if idx == len(ops) - 1:
      # Last opcode
      self.assert_(result)
    else:
      self.assertFalse(result)

      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
      self.assert_(job.start_timestamp)
      self.assertFalse(job.end_timestamp)

  self.assertRaises(IndexError, queue.GetNextUpdate)

  for idx, submitted_ops in enumerate(job_ops
                                      for op in ops
                                      for job_ops in op.submit_jobs):
    self.assertEqual(queue.GetNextSubmittedJob(),
                     (1000 + idx, submitted_ops))
  self.assertRaises(IndexError, queue.GetNextSubmittedJob)

  self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
  self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
  self.assertEqual(job.GetInfo(["opresult"]),
                   [[[], [1000], [1001, 1002, 1003]]])
  self.assertEqual(job.GetInfo(["opstatus"]),
                   [len(job.ops) * [constants.OP_STATUS_SUCCESS]])

  self._GenericCheckJob(job)

  # Calling the processor on a finished job should be a no-op
  self.assertTrue(jqueue._JobProcessor(queue, opexec, job)())
  self.assertRaises(IndexError, queue.GetNextUpdate)
def main():
  debug = int(os.environ["GNT_DEBUG"])

  logname = pathutils.GetLogFilename("jobs")
  utils.SetupLogging(logname, "job-startup", debug=debug)

  (job_id, livelock_name, secret_params_serialized) = _GetMasterInfo()

  secret_params = ""
  if secret_params_serialized:
    secret_params_json = serializer.LoadJson(secret_params_serialized)
    secret_params = RestorePrivateValueWrapping(secret_params_json)

  utils.SetupLogging(logname, "job-%s" % (job_id,), debug=debug)

  try:
    logging.debug("Preparing the context and the configuration")
    context = masterd.GanetiContext(livelock_name)

    logging.debug("Registering signal handlers")

    cancel = [False]
    prio_change = [False]

    def _TermHandler(signum, _frame):
      logging.info("Killed by signal %d", signum)
      cancel[0] = True
    signal.signal(signal.SIGTERM, _TermHandler)

    def _HupHandler(signum, _frame):
      logging.debug("Received signal %d, old flag was %s, will set to True",
                    signum, mcpu.sighupReceived)
      mcpu.sighupReceived[0] = True
    signal.signal(signal.SIGHUP, _HupHandler)

    def _User1Handler(signum, _frame):
      logging.info("Received signal %d, indicating priority change", signum)
      prio_change[0] = True
    signal.signal(signal.SIGUSR1, _User1Handler)

    job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)

    job.SetPid(os.getpid())

    if secret_params:
      for i in range(0, len(secret_params)):
        if hasattr(job.ops[i].input, "osparams_secret"):
          job.ops[i].input.osparams_secret = secret_params[i]

    execfun = mcpu.Processor(context, job_id, job_id).ExecOpCode
    proc = _JobProcessor(context.jobqueue, execfun, job)
    result = _JobProcessor.DEFER
    while result != _JobProcessor.FINISHED:
      result = proc()
      if result == _JobProcessor.WAITDEP and not cancel[0]:
        # Normally, the scheduler should avoid starting a job where the
        # dependencies are not yet finalised. So warn, but wait and continue.
        logging.warning("Got started despite a dependency not yet finished")
        time.sleep(5)
      if cancel[0]:
        logging.debug("Got cancel request, cancelling job %d", job_id)
        r = context.jobqueue.CancelJob(job_id)
        job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
        proc = _JobProcessor(context.jobqueue, execfun, job)
        logging.debug("CancelJob result for job %d: %s", job_id, r)
        cancel[0] = False
      if prio_change[0]:
        logging.debug("Received priority-change request")
        try:
          fname = os.path.join(pathutils.LUXID_MESSAGE_DIR, "%d.prio" % job_id)
          new_prio = int(utils.ReadFile(fname))
          utils.RemoveFile(fname)
          logging.debug("Changing priority of job %d to %d", job_id, new_prio)
          r = context.jobqueue.ChangeJobPriority(job_id, new_prio)
          job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
          proc = _JobProcessor(context.jobqueue, execfun, job)
          logging.debug("Result of changing priority of %d to %d: %s",
                        job_id, new_prio, r)
        except Exception:  # pylint: disable=W0703
          logging.warning("Informed of priority change, but could not"
                          " read new priority")
        prio_change[0] = False
  except Exception:  # pylint: disable=W0703
    logging.exception("Exception when trying to run job %d", job_id)
  finally:
    logging.debug("Job %d finalized", job_id)
    logging.debug("Removing livelock file %s", livelock_name.GetPath())
    os.remove(livelock_name.GetPath())

  sys.exit(0)
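
# For reference, the priority-change branch above implies the following
# hand-shake on the requesting side: write the new priority into
# <LUXID_MESSAGE_DIR>/<job_id>.prio and send SIGUSR1 to the job process. The
# helper below is only a sketch of such a trigger, not an existing Ganeti
# utility; it assumes the caller already knows the job's PID (as recorded via
# job.SetPid above).
def _SketchRequestPriorityChange(job_id, pid, new_prio):
  fname = os.path.join(pathutils.LUXID_MESSAGE_DIR, "%d.prio" % job_id)
  utils.WriteFile(fname, data="%d\n" % new_prio)
  os.kill(pid, signal.SIGUSR1)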