Example #1
0
    def testCancelWhileInQueue(self):
        """Cancel a job before any opcode ran, then run the processor on it."""
        fake_queue = _FakeQueueForProc()

        dummy_ops = []
        for idx in range(5):
            dummy_ops.append(opcodes.OpTestDummy(result="Res%s" % idx, fail=False))

        # Build the job and make sure it starts out queued
        test_job = self._CreateJob(fake_queue, 17045, dummy_ops)
        self.assertEqual(test_job.CalcStatus(), constants.JOB_STATUS_QUEUED)

        # Request cancellation; must succeed while the job is still queued
        (cancelled, _) = test_job.Cancel()
        self.assert_(cancelled)

        self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELED
                                for op in test_job.ops))

        # Running the processor on the cancelled job finalizes it
        exec_cb = _FakeExecOpCodeForProc(None, None)
        jqueue._JobProcessor(fake_queue, exec_cb, test_job)()

        # Verify the final state: canceled, never started, no opcode ever ran
        self.assertEqual(test_job.CalcStatus(), constants.JOB_STATUS_CANCELED)
        self.assertEqual(test_job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
        self.assertFalse(test_job.start_timestamp)
        self.assert_(test_job.end_timestamp)
        self.assertFalse(compat.any(op.start_timestamp or op.end_timestamp
                                    for op in test_job.ops))
        self.assertEqual(test_job.GetInfo(["opstatus", "opresult"]),
                         [len(test_job.ops) * [constants.OP_STATUS_CANCELED],
                          len(test_job.ops) * ["Job canceled by request"]])
Example #2
0
    def _TestPartial(self, job, successcount):
        """Finish processing a job whose first opcodes already succeeded.

        Runs the job processor until the job completes, then verifies the
        job ended in success and that a finished job — also after being
        serialized and restored — can not be processed again.

        @param job: a (typically restored) job whose first C{successcount}
          opcodes have already finished successfully
        @param successcount: number of opcodes that already succeeded

        """
        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
        self.assertEqual(job.start_timestamp, job.ops[0].start_timestamp)

        queue = _FakeQueueForProc()
        opexec = _FakeExecOpCodeForProc(None, None)

        # Process the remaining opcodes one at a time; the processor returns
        # a true value only once the whole job is finished
        for remaining in reversed(range(len(job.ops) - successcount)):
            result = jqueue._JobProcessor(queue, opexec, job)()

            if remaining == 0:
                # Last opcode
                self.assert_(result)
                break

            self.assertFalse(result)

            self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
        self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
        self.assertEqual(job.GetInfo(["opresult"]), [[op.input.result for op in job.ops]])
        self.assertEqual(job.GetInfo(["opstatus"]), [[constants.OP_STATUS_SUCCESS for _ in job.ops]])
        self.assert_(compat.all(op.start_timestamp and op.end_timestamp for op in job.ops))

        self._GenericCheckJob(job)

        # Finished jobs can't be processed any further
        self.assertRaises(errors.ProgrammerError, jqueue._JobProcessor(queue, opexec, job))

        # ... also after being restored
        job2 = jqueue._QueuedJob.Restore(queue, job.Serialize())
        self.assertRaises(errors.ProgrammerError, jqueue._JobProcessor(queue, opexec, job2))
Example #3
0
    def testOpcodeError(self):
        """Test that a failing opcode puts the job into the error state.

        Once an opcode fails, the job must end up in JOB_STATUS_ERROR and
        every opcode after the first failing one must be marked as an error
        as well, without having been executed.

        """
        queue = _FakeQueueForProc()

        # (job ID, opcode count, index of first failing opcode,
        #  index of last failing opcode)
        testdata = [(17077, 1, 0, 0), (1782, 5, 2, 2), (18179, 10, 9, 9), (4744, 10, 3, 8), (23816, 100, 39, 45)]

        for (job_id, opcount, failfrom, failto) in testdata:
            # Prepare opcodes
            ops = [
                opcodes.OpTestDummy(result="Res%s" % i, fail=(failfrom <= i and i <= failto)) for i in range(opcount)
            ]

            # Create job
            job = self._CreateJob(queue, job_id, ops)

            opexec = _FakeExecOpCodeForProc(None, None)

            for idx in range(len(ops)):
                result = jqueue._JobProcessor(queue, opexec, job)()

                if idx in (failfrom, len(ops) - 1):
                    # Either the first failing opcode or the last opcode;
                    # the processor must report the job as finished
                    self.assert_(result)
                    break

                self.assertFalse(result)

                self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

            # Check job status
            self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_ERROR)
            self.assertEqual(job.GetInfo(["id"]), [job_id])
            self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_ERROR])

            # Check opcode status
            data = zip(job.ops, job.GetInfo(["opstatus"])[0], job.GetInfo(["opresult"])[0])

            for idx, (op, opstatus, opresult) in enumerate(data):
                if idx < failfrom:
                    # Opcodes before the failure must have succeeded
                    assert not op.input.fail
                    self.assertEqual(opstatus, constants.OP_STATUS_SUCCESS)
                    self.assertEqual(opresult, op.input.result)
                elif idx <= failto:
                    # The failing opcode itself
                    assert op.input.fail
                    self.assertEqual(opstatus, constants.OP_STATUS_ERROR)
                    self.assertRaises(errors.OpExecError, errors.MaybeRaise, opresult)
                else:
                    # Opcodes after the failure are marked as errors, too
                    assert not op.input.fail
                    self.assertEqual(opstatus, constants.OP_STATUS_ERROR)
                    self.assertRaises(errors.OpExecError, errors.MaybeRaise, opresult)

            # Opcodes that actually ran must have both timestamps
            self.assert_(compat.all(op.start_timestamp and op.end_timestamp for op in job.ops[:failfrom]))

            self._GenericCheckJob(job)

            # Finished jobs can't be processed any further
            self.assertRaises(errors.ProgrammerError, jqueue._JobProcessor(queue, opexec, job))
Example #4
0
    def testTimeout(self):
        """Test job processing when lock acquisition can time out.

        Uses the fixture-provided timeout strategy (C{self._NewTimeoutStrategy})
        and opcode selector (C{self._NextOpcode}); verifies that the opcode
        context is kept across failed lock acquisition attempts and that the
        job still completes successfully.

        """
        ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False) for i in range(10)]

        # Create job
        job_id = 15801
        job = self._CreateJob(self.queue, job_id, ops)
        self.job = job

        self.opcounter = itertools.count(0)

        opexec = _FakeExecOpCodeForProc(self._BeforeStart, self._AfterStart)
        tsf = self._NewTimeoutStrategy

        self.assertFalse(self.done_lock_before_blocking)

        for i in itertools.count(0):
            proc = jqueue._JobProcessor(self.queue, opexec, job, _timeout_strategy_factory=tsf)

            result = proc(_nextop_fn=self._NextOpcode)
            if result:
                # Job is done; no opcode context may be left behind
                self.assertFalse(job.cur_opctx)
                break

            self.assertFalse(result)

            if self.gave_lock:
                # Lock was acquired, so the opcode context was cleared
                self.assertFalse(job.cur_opctx)
            else:
                # Lock acquisition timed out; the opcode context is kept for
                # the next attempt with the next timeout value
                self.assert_(job.cur_opctx)
                self.assertEqual(job.cur_opctx._timeout_strategy._fn, self.timeout_strategy.NextAttempt)

            self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
            self.assert_(job.start_timestamp)
            self.assertFalse(job.end_timestamp)

        self.assertEqual(self.curop, len(job.ops) - 1)
        self.assertEqual(self.job, job)
        self.assertEqual(self.opcounter.next(), len(job.ops))
        self.assert_(self.done_lock_before_blocking)

        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
        self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
        self.assertEqual(job.GetInfo(["opresult"]), [[op.input.result for op in job.ops]])
        self.assertEqual(job.GetInfo(["opstatus"]), [len(job.ops) * [constants.OP_STATUS_SUCCESS]])
        self.assert_(compat.all(op.start_timestamp and op.end_timestamp for op in job.ops))

        # Finished jobs can't be processed any further
        self.assertRaises(errors.ProgrammerError, jqueue._JobProcessor(self.queue, opexec, job))
Example #5
0
  def testPartiallyRun(self):
    """Run part of a job, then serialize/restore and finish it.

    Simulates the processor being called on a job that was partially run
    before the program was restarted.
    """
    queue = _FakeQueueForProc()
    opexec = _FakeExecOpCodeForProc(queue, None, None)

    cases = [(30697, 1), (2552, 4), (12489, 9)]
    for (jid, done_count) in cases:
      op_list = [opcodes.OpTestDummy(result="Res%s" % idx, fail=False)
                 for idx in range(10)]

      job = self._CreateJob(queue, jid, op_list)
      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

      # Execute the first done_count opcodes; none of these runs may
      # finish the whole job
      for _ in range(done_count):
        self.assertFalse(jqueue._JobProcessor(queue, opexec, job)())

      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
      expected = ([constants.OP_STATUS_SUCCESS] * done_count +
                  [constants.OP_STATUS_QUEUED] * (len(op_list) - done_count))
      self.assertEqual(job.GetInfo(["opstatus"]), [expected])

      self.assert_(job.ops_iter)

      # Serialize and restore (simulates program restart)
      restored = jqueue._QueuedJob.Restore(queue, job.Serialize())
      self.assertFalse(restored.ops_iter)
      self._TestPartial(restored, done_count)
Example #6
0
    def testCancelWhileRunning(self):
        """Test canceling a job that already has finished opcodes.

        One opcode is run to completion first; canceling must then succeed
        while the job is back in the queue, and the next processor run must
        finalize the job as canceled.

        """
        # Tests canceling a job with finished opcodes and more, unprocessed ones
        queue = _FakeQueueForProc()

        ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False) for i in range(3)]

        # Create job
        job_id = 28492
        job = self._CreateJob(queue, job_id, ops)

        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

        opexec = _FakeExecOpCodeForProc(None, None)

        # Run one opcode
        self.assertFalse(jqueue._JobProcessor(queue, opexec, job)())

        # Job goes back to queued
        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
        self.assertEqual(
            job.GetInfo(["opstatus", "opresult"]),
            [
                [constants.OP_STATUS_SUCCESS, constants.OP_STATUS_QUEUED, constants.OP_STATUS_QUEUED],
                ["Res0", None, None],
            ],
        )

        # Mark as cancelled
        (success, _) = job.Cancel()
        self.assert_(success)

        # Try processing another opcode (this will actually cancel the job)
        self.assert_(jqueue._JobProcessor(queue, opexec, job)())

        # Check result: first opcode keeps its result, the rest are canceled
        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
        self.assertEqual(job.GetInfo(["id"]), [job_id])
        self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
        self.assertEqual(
            job.GetInfo(["opstatus", "opresult"]),
            [
                [constants.OP_STATUS_SUCCESS, constants.OP_STATUS_CANCELED, constants.OP_STATUS_CANCELED],
                ["Res0", "Job canceled by request", "Job canceled by request"],
            ],
        )
Example #7
0
    def testSuccess(self):
        """Test fully successful processing of jobs of various lengths."""
        queue = _FakeQueueForProc()

        for (job_id, opcount) in [(25351, 1), (6637, 3), (24644, 10), (32207, 100)]:
            ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False) for i in range(opcount)]

            # Create job
            job = self._CreateJob(queue, job_id, ops)

            def _BeforeStart(timeout, priority):
                # Called before opcode execution; the queue lock must have
                # been released and the job must be waiting for locks
                self.assertFalse(queue.IsAcquired())
                self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

            def _AfterStart(op, cbs):
                # Called once the opcode is executing
                self.assertFalse(queue.IsAcquired())
                self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)

                # Job is running, cancelling shouldn't be possible
                (success, _) = job.Cancel()
                self.assertFalse(success)

            opexec = _FakeExecOpCodeForProc(_BeforeStart, _AfterStart)

            for idx in range(len(ops)):
                result = jqueue._JobProcessor(queue, opexec, job)()
                if idx == len(ops) - 1:
                    # Last opcode
                    self.assert_(result)
                else:
                    self.assertFalse(result)

                    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
                    self.assert_(job.start_timestamp)
                    self.assertFalse(job.end_timestamp)

            self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
            self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
            self.assertEqual(job.GetInfo(["opresult"]), [[op.input.result for op in job.ops]])
            self.assertEqual(job.GetInfo(["opstatus"]), [len(job.ops) * [constants.OP_STATUS_SUCCESS]])
            self.assert_(compat.all(op.start_timestamp and op.end_timestamp for op in job.ops))

            self._GenericCheckJob(job)

            # Finished jobs can't be processed any further
            self.assertRaises(errors.ProgrammerError, jqueue._JobProcessor(queue, opexec, job))
Example #8
0
    def testCancelWhileWaitlock(self):
        """Test canceling a job while an opcode waits for locks.

        The job is canceled from within the C{_BeforeStart} callback, i.e.
        while the first opcode is in the "waitlock" state; the whole job
        must end up canceled.

        """
        queue = _FakeQueueForProc()

        ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False) for i in range(5)]

        # Create job
        job_id = 11009
        job = self._CreateJob(queue, job_id, ops)

        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

        def _BeforeStart(timeout, priority):
            self.assertFalse(queue.IsAcquired())
            self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

            # Mark as cancelled; while waiting for locks this only sets the
            # opcodes to the intermediate "canceling" state
            (success, _) = job.Cancel()
            self.assert_(success)

            self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELING for op in job.ops))

        def _AfterStart(op, cbs):
            self.assertFalse(queue.IsAcquired())
            self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)

        opexec = _FakeExecOpCodeForProc(_BeforeStart, _AfterStart)

        jqueue._JobProcessor(queue, opexec, job)()

        # Check result
        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
        self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
        self.assert_(job.start_timestamp)
        self.assert_(job.end_timestamp)
        # Execution was cut short, so not every opcode has both timestamps
        self.assertFalse(compat.all(op.start_timestamp and op.end_timestamp for op in job.ops))
        self.assertEqual(
            job.GetInfo(["opstatus", "opresult"]),
            [[constants.OP_STATUS_CANCELED for _ in job.ops], ["Job canceled by request" for _ in job.ops]],
        )
Example #9
0
  def testCancelWhileInQueue(self):
    """Test canceling a queued job without any queue writes.

    In addition to verifying the canceled end state, this variant checks
    (via C{GetNextUpdate} and C{Serialize}) that processing an
    already-canceled job neither modifies it nor writes it to the queue.

    """
    queue = _FakeQueueForProc()

    ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
           for i in range(5)]

    # Create job
    job_id = 17045
    job = self._CreateJob(queue, job_id, ops)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

    # Mark as cancelled
    (success, _) = job.Cancel()
    self.assert_(success)

    # No update may have been written to the queue yet
    self.assertRaises(IndexError, queue.GetNextUpdate)

    self.assertFalse(job.start_timestamp)
    self.assertTrue(job.end_timestamp)
    self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELED
                            for op in job.ops))

    # Serialize to check for differences
    before_proc = job.Serialize()

    # Simulate processor called in workerpool
    opexec = _FakeExecOpCodeForProc(queue, None, None)
    self.assert_(jqueue._JobProcessor(queue, opexec, job)())

    # Check result
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
    self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
    self.assertFalse(job.start_timestamp)
    self.assertTrue(job.end_timestamp)
    self.assertFalse(compat.any(op.start_timestamp or op.end_timestamp
                                for op in job.ops))
    self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
                     [[constants.OP_STATUS_CANCELED for _ in job.ops],
                      ["Job canceled by request" for _ in job.ops]])

    # Must not have changed or written
    self.assertEqual(before_proc, job.Serialize())
    self.assertRaises(IndexError, queue.GetNextUpdate)
Example #10
0
    def testProcessorOnRunningJob(self):
        """The processor must refuse a job that is already running."""
        fake_queue = _FakeQueueForProc()
        exec_cb = _FakeExecOpCodeForProc(None, None)

        # Single-opcode job, initially queued
        single_op = [opcodes.OpTestDummy(result="result", fail=False)]
        job = self._CreateJob(fake_queue, 9571, single_op)
        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

        # Force the only opcode into the "running" state
        job.ops[0].status = constants.OP_STATUS_RUNNING

        assert len(job.ops) == 1

        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)

        # Calling the processor on a running job must fail
        self.assertRaises(errors.ProgrammerError,
                          jqueue._JobProcessor(fake_queue, exec_cb, job))
Example #11
0
  def testCancelWhileWaitlockInQueue(self):
    """Test canceling a job whose first opcode is in the waitlock state.

    The opcode status is forced to WAITLOCK directly (without running the
    processor), so cancellation first puts the opcodes into the
    "canceling" state; the processor must then finalize the job as
    canceled.

    """
    queue = _FakeQueueForProc()

    ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
           for i in range(5)]

    # Create job
    job_id = 8645
    job = self._CreateJob(queue, job_id, ops)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

    job.ops[0].status = constants.OP_STATUS_WAITLOCK

    assert len(job.ops) == 5

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

    # Mark as cancelling
    (success, _) = job.Cancel()
    self.assert_(success)

    # No update may have been written to the queue yet
    self.assertRaises(IndexError, queue.GetNextUpdate)

    self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELING
                            for op in job.ops))

    opexec = _FakeExecOpCodeForProc(queue, None, None)
    self.assert_(jqueue._JobProcessor(queue, opexec, job)())

    # Check result
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
    self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
    self.assertFalse(job.start_timestamp)
    self.assert_(job.end_timestamp)
    self.assertFalse(compat.any(op.start_timestamp or op.end_timestamp
                                for op in job.ops))
    self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
                     [[constants.OP_STATUS_CANCELED for _ in job.ops],
                      ["Job canceled by request" for _ in job.ops]])
Example #12
0
def main():
  """Run a single job in a standalone process until it is finished.

  Reads the debug level from the GNT_DEBUG environment variable, obtains
  the job ID, livelock and (optionally) wrapped secret parameters from
  L{_SetupJob}, installs signal handlers (SIGTERM requests cancellation,
  SIGHUP sets the shared C{mcpu.sighupReceived} flag, SIGUSR1 signals a
  priority change read from a file in C{pathutils.LUXID_MESSAGE_DIR}),
  then drives the job processor to completion.  The livelock file is
  removed on exit.

  """

  debug = int(os.environ["GNT_DEBUG"])

  logname = pathutils.GetLogFilename("jobs")
  utils.SetupLogging(logname, "job-startup", debug=debug)

  (job_id, llock, secret_params_serialized) = _SetupJob()

  secret_params = ""
  if secret_params_serialized:
    secret_params_json = serializer.LoadJson(secret_params_serialized)
    secret_params = RestorePrivateValueWrapping(secret_params_json)

  utils.SetupLogging(logname, "job-%s" % (job_id,), debug=debug)

  try:
    logging.debug("Preparing the context and the configuration")
    context = masterd.GanetiContext(llock)

    logging.debug("Registering signal handlers")

    # Flags mutated from the signal handlers below; lists are used so the
    # nested functions can assign to them
    cancel = [False]
    prio_change = [False]

    def _TermHandler(signum, _frame):
      logging.info("Killed by signal %d", signum)
      cancel[0] = True
    signal.signal(signal.SIGTERM, _TermHandler)

    def _HupHandler(signum, _frame):
      logging.debug("Received signal %d, old flag was %s, will set to True",
                    signum, mcpu.sighupReceived)
      mcpu.sighupReceived[0] = True
    signal.signal(signal.SIGHUP, _HupHandler)

    def _User1Handler(signum, _frame):
      logging.info("Received signal %d, indicating priority change", signum)
      prio_change[0] = True
    signal.signal(signal.SIGUSR1, _User1Handler)

    job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)

    job.SetPid(os.getpid())

    # Re-attach the unwrapped secret OS parameters to the opcodes that
    # accept them
    if secret_params:
      for i in range(0, len(secret_params)):
        if hasattr(job.ops[i].input, "osparams_secret"):
          job.ops[i].input.osparams_secret = secret_params[i]

    execfun = mcpu.Processor(context, job_id, job_id).ExecOpCode
    proc = _JobProcessor(context.jobqueue, execfun, job)
    result = _JobProcessor.DEFER
    while result != _JobProcessor.FINISHED:
      result = proc()
      if result == _JobProcessor.WAITDEP and not cancel[0]:
        # Normally, the scheduler should avoid starting a job where the
        # dependencies are not yet finalised. So warn, but wait and continue.
        logging.warning("Got started despite a dependency not yet finished")
        time.sleep(5)
      if cancel[0]:
        logging.debug("Got cancel request, cancelling job %d", job_id)
        r = context.jobqueue.CancelJob(job_id)
        # Reload the job and recreate the processor so the cancellation is
        # picked up
        job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
        proc = _JobProcessor(context.jobqueue, execfun, job)
        logging.debug("CancelJob result for job %d: %s", job_id, r)
        cancel[0] = False
      if prio_change[0]:
        logging.debug("Received priority-change request")
        try:
          fname = os.path.join(pathutils.LUXID_MESSAGE_DIR, "%d.prio" % job_id)
          new_prio = int(utils.ReadFile(fname))
          utils.RemoveFile(fname)
          logging.debug("Changing priority of job %d to %d", job_id, new_prio)
          r = context.jobqueue.ChangeJobPriority(job_id, new_prio)
          job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
          proc = _JobProcessor(context.jobqueue, execfun, job)
          logging.debug("Result of changing priority of %d to %d: %s", job_id,
                        new_prio, r)
        except Exception: # pylint: disable=W0703
          logging.warning("Informed of priority change, but could not"
                          " read new priority")
        prio_change[0] = False

  except Exception: # pylint: disable=W0703
    logging.exception("Exception when trying to run job %d", job_id)
  finally:
    logging.debug("Job %d finalized", job_id)
    logging.debug("Removing livelock file %s", llock.GetPath())
    os.remove(llock.GetPath())

  sys.exit(0)
Example #13
0
  def testLogMessages(self):
    """Test the "Feedback" callback used for opcode log messages."""
    # Tests the "Feedback" callback function
    queue = _FakeQueueForProc()

    # Log messages to emit, keyed by opcode index
    messages = {
      1: [
        (None, "Hello"),
        (None, "World"),
        (constants.ELOG_MESSAGE, "there"),
        ],
      4: [
        (constants.ELOG_JQUEUE_TEST, (1, 2, 3)),
        (constants.ELOG_JQUEUE_TEST, ("other", "type")),
        ],
      }
    ops = [opcodes.OpTestDummy(result="Logtest%s" % i, fail=False,
                               messages=messages.get(i, []))
           for i in range(5)]

    # Create job
    job = self._CreateJob(queue, 29386, ops)

    def _BeforeStart(timeout, priority):
      self.assertEqual(queue.GetNextUpdate(), (job, True))
      self.assertRaises(IndexError, queue.GetNextUpdate)
      self.assertFalse(queue.IsAcquired())
      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

    def _AfterStart(op, cbs):
      self.assertEqual(queue.GetNextUpdate(), (job, True))
      self.assertRaises(IndexError, queue.GetNextUpdate)
      self.assertFalse(queue.IsAcquired())
      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)

      # Feedback takes at most two arguments (optional type plus message)
      self.assertRaises(AssertionError, cbs.Feedback,
                        "too", "many", "arguments")

      for (log_type, msg) in op.messages:
        self.assertRaises(IndexError, queue.GetNextUpdate)
        if log_type:
          cbs.Feedback(log_type, msg)
        else:
          cbs.Feedback(msg)
        # Check for job update without replication
        self.assertEqual(queue.GetNextUpdate(), (job, False))
        self.assertRaises(IndexError, queue.GetNextUpdate)

    opexec = _FakeExecOpCodeForProc(queue, _BeforeStart, _AfterStart)

    for remaining in reversed(range(len(job.ops))):
      self.assertRaises(IndexError, queue.GetNextUpdate)
      result = jqueue._JobProcessor(queue, opexec, job)()
      self.assertEqual(queue.GetNextUpdate(), (job, True))
      self.assertRaises(IndexError, queue.GetNextUpdate)

      if remaining == 0:
        # Last opcode
        self.assert_(result)
        break

      self.assertFalse(result)

      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)

    self.assertRaises(IndexError, queue.GetNextUpdate)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
    self.assertEqual(job.GetInfo(["opresult"]),
                     [[op.input.result for op in job.ops]])

    logmsgcount = sum(len(m) for m in messages.values())

    self._CheckLogMessages(job, logmsgcount)

    # Serialize and restore (simulates program restart)
    newjob = jqueue._QueuedJob.Restore(queue, job.Serialize())
    self._CheckLogMessages(newjob, logmsgcount)

    # Check each message
    prevserial = -1
    for idx, oplog in enumerate(job.GetInfo(["oplog"])[0]):
      for (serial, timestamp, log_type, msg) in oplog:
        (exptype, expmsg) = messages.get(idx).pop(0)
        if exptype:
          self.assertEqual(log_type, exptype)
        else:
          # Messages without an explicit type default to ELOG_MESSAGE
          self.assertEqual(log_type, constants.ELOG_MESSAGE)
        self.assertEqual(expmsg, msg)
        # Log serials must be strictly increasing across all messages
        self.assert_(serial > prevserial)
        prevserial = serial
Example #14
0
  def testTimeout(self):
    """Test lock-timeout handling together with queue update tracking.

    Similar to the plain timeout test, but additionally verifies exactly
    when the job is written to the queue (via C{GetNextUpdate}) and that
    the priority used for lock acquisition matches the opcode priority.

    """
    ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
           for i in range(10)]

    # Create job
    job_id = 15801
    job = self._CreateJob(self.queue, job_id, ops)
    self.job = job

    self.opcounter = itertools.count(0)

    opexec = _FakeExecOpCodeForProc(self.queue, self._BeforeStart,
                                    self._AfterStart)
    tsf = self._NewTimeoutStrategy

    self.assertFalse(self.done_lock_before_blocking)

    while True:
      proc = jqueue._JobProcessor(self.queue, opexec, job,
                                  _timeout_strategy_factory=tsf)

      self.assertRaises(IndexError, self.queue.GetNextUpdate)

      if self.curop is not None:
        self.prev_status = self.job.ops[self.curop].status

      self.lock_acq_prio = None

      result = proc(_nextop_fn=self._NextOpcode)
      assert self.curop is not None

      if result or self.gave_lock:
        # Got lock and/or job is done, result must've been written
        self.assertFalse(job.cur_opctx)
        self.assertEqual(self.queue.GetNextUpdate(), (job, True))
        self.assertRaises(IndexError, self.queue.GetNextUpdate)
        self.assertEqual(self.lock_acq_prio, job.ops[self.curop].priority)
        self.assert_(job.ops[self.curop].exec_timestamp)

      if result:
        self.assertFalse(job.cur_opctx)
        break

      self.assertFalse(result)

      if self.curop == 0:
        # The job's start timestamp is taken from its first opcode
        self.assertEqual(job.ops[self.curop].start_timestamp,
                         job.start_timestamp)

      if self.gave_lock:
        # Opcode finished, but job not yet done
        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
      else:
        # Did not get locks
        self.assert_(job.cur_opctx)
        self.assertEqual(job.cur_opctx._timeout_strategy._fn,
                         self.timeout_strategy.NextAttempt)
        self.assertFalse(job.ops[self.curop].exec_timestamp)
        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)

        # If priority has changed since acquiring locks, the job must've been
        # updated
        if self.lock_acq_prio != job.ops[self.curop].priority:
          self.assertEqual(self.queue.GetNextUpdate(), (job, True))

      self.assertRaises(IndexError, self.queue.GetNextUpdate)

      self.assert_(job.start_timestamp)
      self.assertFalse(job.end_timestamp)

    self.assertEqual(self.curop, len(job.ops) - 1)
    self.assertEqual(self.job, job)
    self.assertEqual(self.opcounter.next(), len(job.ops))
    self.assert_(self.done_lock_before_blocking)

    self.assertRaises(IndexError, self.queue.GetNextUpdate)
    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
    self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
    self.assertEqual(job.GetInfo(["opresult"]),
                     [[op.input.result for op in job.ops]])
    self.assertEqual(job.GetInfo(["opstatus"]),
                     [len(job.ops) * [constants.OP_STATUS_SUCCESS]])
    self.assert_(compat.all(op.start_timestamp and op.end_timestamp
                            for op in job.ops))

    # Calling the processor on a finished job should be a no-op
    self.assertTrue(jqueue._JobProcessor(self.queue, opexec, job)())
    self.assertRaises(IndexError, self.queue.GetNextUpdate)
Example #15
0
  def testSubmitManyJobs(self):
    """Test opcodes that submit new jobs while being processed.

    Each OpTestDummy carries a C{submit_jobs} list of job definitions;
    after processing, the opcode results must contain the IDs the fake
    queue assigned to the submitted jobs (sequential, starting at 1000,
    as checked below).

    """
    queue = _FakeQueueForProc()

    job_id = 15656
    ops = [
      opcodes.OpTestDummy(result="Res0", fail=False,
                          submit_jobs=[]),
      opcodes.OpTestDummy(result="Res1", fail=False,
                          submit_jobs=[
                            [opcodes.OpTestDummy(result="r1j0", fail=False)],
                            ]),
      opcodes.OpTestDummy(result="Res2", fail=False,
                          submit_jobs=[
                            [opcodes.OpTestDummy(result="r2j0o0", fail=False),
                             opcodes.OpTestDummy(result="r2j0o1", fail=False),
                             opcodes.OpTestDummy(result="r2j0o2", fail=False),
                             opcodes.OpTestDummy(result="r2j0o3", fail=False)],
                            [opcodes.OpTestDummy(result="r2j1", fail=False)],
                            [opcodes.OpTestDummy(result="r2j3o0", fail=False),
                             opcodes.OpTestDummy(result="r2j3o1", fail=False)],
                            ]),
      ]

    # Create job
    job = self._CreateJob(queue, job_id, ops)

    def _BeforeStart(timeout, priority):
      self.assertEqual(queue.GetNextUpdate(), (job, True))
      self.assertRaises(IndexError, queue.GetNextUpdate)
      self.assertFalse(queue.IsAcquired())
      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)
      self.assertFalse(job.cur_opctx)

    def _AfterStart(op, cbs):
      self.assertEqual(queue.GetNextUpdate(), (job, True))
      self.assertRaises(IndexError, queue.GetNextUpdate)

      self.assertFalse(queue.IsAcquired())
      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_RUNNING)
      self.assertFalse(job.cur_opctx)

      # Job is running, cancelling shouldn't be possible
      (success, _) = job.Cancel()
      self.assertFalse(success)

    opexec = _FakeExecOpCodeForProc(queue, _BeforeStart, _AfterStart)

    for idx in range(len(ops)):
      self.assertRaises(IndexError, queue.GetNextUpdate)
      result = jqueue._JobProcessor(queue, opexec, job)()
      self.assertEqual(queue.GetNextUpdate(), (job, True))
      self.assertRaises(IndexError, queue.GetNextUpdate)
      if idx == len(ops) - 1:
        # Last opcode
        self.assert_(result)
      else:
        self.assertFalse(result)

        self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
        self.assert_(job.start_timestamp)
        self.assertFalse(job.end_timestamp)

    self.assertRaises(IndexError, queue.GetNextUpdate)

    # All submitted jobs, flattened in submission order, must have been
    # queued with sequential IDs starting at 1000
    for idx, submitted_ops in enumerate(job_ops
                                        for op in ops
                                        for job_ops in op.submit_jobs):
      self.assertEqual(queue.GetNextSubmittedJob(),
                       (1000 + idx, submitted_ops))
    self.assertRaises(IndexError, queue.GetNextSubmittedJob)

    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_SUCCESS)
    self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_SUCCESS])
    self.assertEqual(job.GetInfo(["opresult"]),
                     [[[], [1000], [1001, 1002, 1003]]])
    self.assertEqual(job.GetInfo(["opstatus"]),
                     [len(job.ops) * [constants.OP_STATUS_SUCCESS]])

    self._GenericCheckJob(job)

    # Calling the processor on a finished job should be a no-op
    self.assertTrue(jqueue._JobProcessor(queue, opexec, job)())
    self.assertRaises(IndexError, queue.GetNextUpdate)
Example #16
0
def main():
  """Run a single job in a standalone process until it is finished.

  Reads the debug level from the GNT_DEBUG environment variable, obtains
  the job ID, livelock name and (optionally) wrapped secret parameters
  from L{_GetMasterInfo}, installs signal handlers (SIGTERM requests
  cancellation, SIGHUP sets the shared C{mcpu.sighupReceived} flag,
  SIGUSR1 signals a priority change read from a file in
  C{pathutils.LUXID_MESSAGE_DIR}), then drives the job processor to
  completion.  The livelock file is removed on exit.

  """

  debug = int(os.environ["GNT_DEBUG"])

  logname = pathutils.GetLogFilename("jobs")
  utils.SetupLogging(logname, "job-startup", debug=debug)

  (job_id, livelock_name, secret_params_serialized) = _GetMasterInfo()

  secret_params = ""
  if secret_params_serialized:
    secret_params_json = serializer.LoadJson(secret_params_serialized)
    secret_params = RestorePrivateValueWrapping(secret_params_json)

  utils.SetupLogging(logname, "job-%s" % (job_id,), debug=debug)

  try:
    logging.debug("Preparing the context and the configuration")
    context = masterd.GanetiContext(livelock_name)

    logging.debug("Registering signal handlers")

    # Flags mutated from the signal handlers below; lists are used so the
    # nested functions can assign to them
    cancel = [False]
    prio_change = [False]

    def _TermHandler(signum, _frame):
      logging.info("Killed by signal %d", signum)
      cancel[0] = True
    signal.signal(signal.SIGTERM, _TermHandler)

    def _HupHandler(signum, _frame):
      logging.debug("Received signal %d, old flag was %s, will set to True",
                    signum, mcpu.sighupReceived)
      mcpu.sighupReceived[0] = True
    signal.signal(signal.SIGHUP, _HupHandler)

    def _User1Handler(signum, _frame):
      logging.info("Received signal %d, indicating priority change", signum)
      prio_change[0] = True
    signal.signal(signal.SIGUSR1, _User1Handler)

    job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)

    job.SetPid(os.getpid())

    # Re-attach the unwrapped secret OS parameters to the opcodes that
    # accept them
    if secret_params:
      for i in range(0, len(secret_params)):
        if hasattr(job.ops[i].input, "osparams_secret"):
          job.ops[i].input.osparams_secret = secret_params[i]

    execfun = mcpu.Processor(context, job_id, job_id).ExecOpCode
    proc = _JobProcessor(context.jobqueue, execfun, job)
    result = _JobProcessor.DEFER
    while result != _JobProcessor.FINISHED:
      result = proc()
      if result == _JobProcessor.WAITDEP and not cancel[0]:
        # Normally, the scheduler should avoid starting a job where the
        # dependencies are not yet finalised. So warn, but wait and continue.
        logging.warning("Got started despite a dependency not yet finished")
        time.sleep(5)
      if cancel[0]:
        logging.debug("Got cancel request, cancelling job %d", job_id)
        r = context.jobqueue.CancelJob(job_id)
        # Reload the job and recreate the processor so the cancellation is
        # picked up
        job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
        proc = _JobProcessor(context.jobqueue, execfun, job)
        logging.debug("CancelJob result for job %d: %s", job_id, r)
        cancel[0] = False
      if prio_change[0]:
        logging.debug("Received priority-change request")
        try:
          fname = os.path.join(pathutils.LUXID_MESSAGE_DIR, "%d.prio" % job_id)
          new_prio = int(utils.ReadFile(fname))
          utils.RemoveFile(fname)
          logging.debug("Changing priority of job %d to %d", job_id, new_prio)
          r = context.jobqueue.ChangeJobPriority(job_id, new_prio)
          job = context.jobqueue.SafeLoadJobFromDisk(job_id, False)
          proc = _JobProcessor(context.jobqueue, execfun, job)
          logging.debug("Result of changing priority of %d to %d: %s", job_id,
                        new_prio, r)
        except Exception: # pylint: disable=W0703
          logging.warning("Informed of priority change, but could not"
                          " read new priority")
        prio_change[0] = False

  except Exception: # pylint: disable=W0703
    logging.exception("Exception when trying to run job %d", job_id)
  finally:
    logging.debug("Job %d finalized", job_id)
    logging.debug("Removing livelock file %s", livelock_name.GetPath())
    os.remove(livelock_name.GetPath())

  sys.exit(0)