Exemplo n.º 1
0
    def main_loop(self) -> None:
        """Run the scheduling loop forever.

        Each iteration checks the workers, calls ``schedule`` with the values
        collected by ``get_available_dts`` during the previous iteration, then
        polls until new values are reported or ``INTERVAL`` seconds have
        elapsed since the iteration started.

        Database connection errors do not stop the loop: the connection is
        closed and the loop resumes after a short pause.
        """
        # Values returned by get_available_dts(); empty for the first pass.
        # NOTE(review): presumably device-type names - confirm against
        # get_available_dts().
        dts: Set[str] = set()
        while True:
            begin = time.time()
            try:
                # Check remote worker connectivity
                with transaction.atomic():
                    self.check_workers()

                # Schedule jobs
                schedule(self.logger, dts)
                dts = set()

                # Wait for events
                while not dts and (time.time() - begin) < INTERVAL:
                    # Time left in this iteration, clamped so it is never
                    # negative.
                    timeout = max(INTERVAL - (time.time() - begin), 0)
                    # The timeout is converted from seconds to milliseconds
                    # and is at least 1ms. ZMQ errors raised while polling are
                    # deliberately ignored.
                    with contextlib.suppress(zmq.ZMQError):
                        self.poller.poll(max(timeout * 1000, 1))
                    dts = self.get_available_dts()

            except (OperationalError, InterfaceError):
                self.logger.info("[RESET] database connection reset.")
                # Closing the database connection will force Django to reopen
                # the connection
                connection.close()
                # Pause before starting the next iteration
                time.sleep(2)
Exemplo n.º 2
0
 def test_job_limit(self):
     """Check what happens to a job created with a one second queue timeout.

     After the first scheduling pass the job must still be submitted. After
     sleeping three seconds and scheduling again it must no longer be
     submitted: it is either in the canceling state, or its health is
     already canceled.
     """
     def count_in_state(state):
         return TestJob.objects.filter(state=state).count()

     one_second = timedelta(seconds=1)
     TestJob.objects.create(
         requested_device_type=self.device_type01,
         submitter=self.user,
         queue_timeout=int(one_second.total_seconds()),
     )
     assert TestJob.objects.all().count() == 1

     # First pass: the job is expected to stay in the queue
     schedule(self.logger, [], [])
     assert count_in_state(TestJob.STATE_SUBMITTED) == 1
     assert count_in_state(TestJob.STATE_CANCELING) == 0

     # Sleep for longer than the queue timeout, then schedule again
     time.sleep(3)
     schedule(self.logger, [], [])
     assert count_in_state(TestJob.STATE_SUBMITTED) == 0

     n_canceling = count_in_state(TestJob.STATE_CANCELING)
     n_canceled = TestJob.objects.filter(
         health=TestJob.HEALTH_CANCELED).count()
     if n_canceling != 0:
         assert n_canceling == 1
         assert n_canceled == 0
     else:
         assert n_canceled == 1
Exemplo n.º 3
0
    def test_low_medium_high_with_hc(self):
        """Check that a health check runs before the highest priority job.

        Five jobs with mixed priorities are submitted. The first scheduling
        pass must reserve the device for a health check and leave every job
        submitted; once the health check is finished, the second pass must
        schedule the HIGH priority job on ``device01``.
        """
        # Enable health checks
        device_type = self.device_type01
        device_type.health_denominator = DeviceType.HEALTH_PER_HOUR
        device_type.health_frequency = 24
        device_type.save()
        Device.get_health_check = _minimal_valid_job
        self.assertNotEqual(self.device01.get_health_check(), None)

        priorities = (
            TestJob.LOW,
            TestJob.MEDIUM,
            TestJob.HIGH,
            TestJob.MEDIUM,
            TestJob.LOW,
        )
        jobs = [
            TestJob.objects.create(
                requested_device_type=self.device_type01,
                user=self.user,
                submitter=self.user,
                is_public=True,
                definition=_minimal_valid_job(None),
                priority=priority,
            )
            for priority in priorities
        ]

        # Check that a health check will be scheduled before any jobs
        log = DummyLogger()
        schedule(log)
        self.device01.refresh_from_db()
        self.assertEqual(self.device01.state, Device.STATE_RESERVED)
        for job in jobs:
            self._check_job(job, TestJob.STATE_SUBMITTED)

        health_check = self.device01.current_job()
        self.assertEqual(health_check.state, TestJob.STATE_SCHEDULED)
        health_check.go_state_finished(TestJob.HEALTH_COMPLETE)
        health_check.save()

        # Check that the next job is the highest priority
        schedule(log)
        self.device01.refresh_from_db()
        self.assertEqual(self.device01.state, Device.STATE_RESERVED)
        high_priority_index = 2
        for index, job in enumerate(jobs):
            if index == high_priority_index:
                self._check_job(job, TestJob.STATE_SCHEDULED, self.device01)
            else:
                self._check_job(job, TestJob.STATE_SUBMITTED)
Exemplo n.º 4
0
    def test_health_frequency_hours(self):
        """Check the hour-based health check frequency.

        With a frequency of 24 (per hour denominator), a first job is
        scheduled directly on ``device03``. After the last health check of
        ``device03`` is back-dated by 25 hours, the next scheduling pass must
        start a health check and leave the second job submitted.
        """
        def submit_job():
            return TestJob.objects.create(
                requested_device_type=self.device_type01,
                user=self.user,
                submitter=self.user,
                is_public=True,
                definition=_minimal_valid_job(None),
            )

        device_type = self.device_type01
        device_type.health_denominator = DeviceType.HEALTH_PER_HOUR
        device_type.health_frequency = 24
        device_type.save()
        Device.get_health_check = _minimal_valid_job
        for device in (self.device01, self.device02, self.device03):
            self.assertNotEqual(device.get_health_check(), None)

        # Only device03 is available now
        self.device01.health = Device.HEALTH_BAD
        self.device01.save()
        self.assertTrue(self.device01.is_valid())
        self.device03.health = Device.HEALTH_GOOD
        self.device03.save()
        self.assertTrue(self.device03.is_valid())

        # Create a job that should be scheduled now
        first = submit_job()
        schedule(DummyLogger())
        self.device01.refresh_from_db()
        first.refresh_from_db()
        self.assertEqual(first.state, TestJob.STATE_SCHEDULED)
        self.assertEqual(first.actual_device, self.device03)
        first.go_state_finished(TestJob.HEALTH_COMPLETE)
        first.save()

        # Create a job that should be scheduled after the health check
        second = submit_job()
        self.device03.refresh_from_db()
        # Back-date the last health check so that a new one is due
        self.last_hc03.submit_time = timezone.now() - timedelta(hours=25)
        self.last_hc03.save()

        schedule(DummyLogger())
        self.device03.refresh_from_db()
        second.refresh_from_db()
        self.assertEqual(second.state, TestJob.STATE_SUBMITTED)
        health_check = self.device03.current_job()
        self.assertTrue(health_check.health_check)
        self.assertEqual(health_check.state, TestJob.STATE_SCHEDULED)
Exemplo n.º 5
0
    def test_job_limit(self):
        """Check that every scheduling pass honours ``self.joblimit``.

        Eight jobs are submitted. A first series of passes must only start
        health checks, ``self.joblimit`` at a time; a second series must only
        start regular jobs, again ``self.joblimit`` at a time, with exactly
        that many devices not idle. After each pass the started jobs are
        finished and their devices put back to good health and idle.
        """
        total_jobs = 8
        active_states = [TestJob.STATE_SCHEDULING, TestJob.STATE_SCHEDULED]

        def finish_active_jobs(expect_health_check):
            # Finish every started job and release the device it ran on
            verify = self.assertTrue if expect_health_check else self.assertFalse
            for job in TestJob.objects.filter(state__in=active_states):
                verify(job.health_check)
                job.go_state_finished(TestJob.HEALTH_COMPLETE)
                device = job.actual_device
                device.health = Device.HEALTH_GOOD
                device.state = Device.STATE_IDLE
                device.save()
                job.save()

        self.jobs = []
        for _ in range(total_jobs):
            self.jobs.append(
                TestJob.objects.create(
                    requested_device_type=self.device_type01,
                    submitter=self.user,
                    definition=_minimal_valid_job(None),
                )
            )

        # Health check passes: the budget decreases by two per pass
        budget = self.devmax
        while budget > 0:
            budget -= 2
            started = schedule(DummyLogger())
            self.assertEqual(len(started), self.joblimit)
            finish_active_jobs(expect_health_check=True)

        # Regular job passes: one pass for every two submitted jobs
        for _ in range(0, total_jobs, 2):
            started = schedule(DummyLogger())
            self.assertEqual(len(started), self.joblimit)

            busy = 0
            for device in self.devices:
                device.refresh_from_db()
                if device.state != Device.STATE_IDLE:
                    busy += 1
            self.assertEqual(busy, self.joblimit)

            finish_active_jobs(expect_health_check=False)
Exemplo n.º 6
0
 def test_job_limit(self):
     """Check that one scheduling pass starts only two of four jobs.

     Four jobs are submitted for ``device_type01``; after a single call to
     ``schedule`` exactly two must be scheduled and two still submitted.
     """
     def count_in_state(state):
         return TestJob.objects.filter(state=state).count()

     submitted_jobs = 4
     for _ in range(submitted_jobs):
         TestJob.objects.create(
             requested_device_type=self.device_type01,
             submitter=self.user,
             definition=_minimal_valid_job(None),
         )
     assert TestJob.objects.all().count() == 4

     # Limit the number of jobs that can run
     schedule(self.logger)
     assert count_in_state(TestJob.STATE_SCHEDULED) == 2
     assert count_in_state(TestJob.STATE_SUBMITTED) == 2
Exemplo n.º 7
0
    def test_health_frequency_hours(self):
        """Check the hour-based health check frequency.

        A first job is scheduled directly on ``device03``. Once the last
        health check of ``device03`` is back-dated by 25 hours (frequency is
        24), scheduling again must start a health check on that device and
        keep the second job submitted.
        """
        def new_job():
            return TestJob.objects.create(
                requested_device_type=self.device_type01,
                user=self.user,
                submitter=self.user,
                is_public=True,
                definition=_minimal_valid_job(None),
            )

        dt01 = self.device_type01
        dt01.health_denominator = DeviceType.HEALTH_PER_HOUR
        dt01.health_frequency = 24
        dt01.save()
        Device.get_health_check = _minimal_valid_job
        for dev in (self.device01, self.device02, self.device03):
            self.assertNotEqual(dev.get_health_check(), None)

        # Only device03 is available now
        self.device01.health = Device.HEALTH_BAD
        self.device01.save()
        self.device03.health = Device.HEALTH_GOOD
        self.device03.save()

        # Create a job that should be scheduled now
        immediate = new_job()
        schedule(DummyLogger())
        self.device01.refresh_from_db()
        immediate.refresh_from_db()
        self.assertEqual(immediate.state, TestJob.STATE_SCHEDULED)
        self.assertEqual(immediate.actual_device, self.device03)
        immediate.go_state_finished(TestJob.HEALTH_COMPLETE)
        immediate.save()

        # Create a job that should be scheduled after the health check
        delayed = new_job()
        self.device03.refresh_from_db()
        overdue = timezone.now() - timedelta(hours=25)
        self.last_hc03.submit_time = overdue
        self.last_hc03.save()

        schedule(DummyLogger())
        self.device03.refresh_from_db()
        delayed.refresh_from_db()
        self.assertEqual(delayed.state, TestJob.STATE_SUBMITTED)
        running_hc = self.device03.current_job()
        self.assertTrue(running_hc.health_check)
        self.assertEqual(running_hc.state, TestJob.STATE_SCHEDULED)
Exemplo n.º 8
0
    def _check_scheduling(self, logger, device, current_priority,
                          remaining_priorities):
        """Run one scheduling pass on "worker-01" and verify its outcome.

        After the pass ``device`` must be reserved and exactly one job must
        be scheduled. That job is checked against ``current_priority`` and
        every job still submitted is checked against
        ``remaining_priorities``. The scheduled job is then finished, saved
        and checked again in the finished state.
        """
        schedule(logger, [], ["worker-01"])
        device.refresh_from_db()
        self.assertEqual(device.state, Device.STATE_RESERVED)

        scheduled_jobs = TestJob.objects.filter(state=TestJob.STATE_SCHEDULED)
        self.assertEqual(scheduled_jobs.count(), 1)

        # Fetch a fresh instance of the only scheduled job
        running = TestJob.objects.get(id=scheduled_jobs[0].id)
        expected_priority = (current_priority, )
        self._check_job(running, expected_priority, TestJob.STATE_SCHEDULED,
                        device)

        for pending in TestJob.objects.filter(state=TestJob.STATE_SUBMITTED):
            self._check_job(pending, remaining_priorities)

        running.go_state_finished(TestJob.HEALTH_COMPLETE)
        running.save()
        self._check_job(running, expected_priority, TestJob.STATE_FINISHED,
                        device)
Exemplo n.º 9
0
    def test_low_medium_high_with_hc(self):
        """Check that a health check runs before the highest priority job.

        The first scheduling pass must reserve ``device01`` for a health
        check while the five submitted jobs stay queued. After the health
        check is finished, the next pass must pick the HIGH priority job.
        """
        # Enable health checks
        dt01 = self.device_type01
        dt01.health_denominator = DeviceType.HEALTH_PER_HOUR
        dt01.health_frequency = 24
        dt01.save()
        Device.get_health_check = _minimal_valid_job
        self.assertNotEqual(self.device01.get_health_check(), None)

        jobs = []
        priorities = [TestJob.LOW, TestJob.MEDIUM, TestJob.HIGH,
                      TestJob.MEDIUM, TestJob.LOW]
        for priority in priorities:
            jobs.append(
                TestJob.objects.create(
                    requested_device_type=self.device_type01,
                    user=self.user,
                    submitter=self.user,
                    is_public=True,
                    definition=_minimal_valid_job(None),
                    priority=priority,
                )
            )

        # Arguments passed to _check_job() after the job itself
        queued = (TestJob.STATE_SUBMITTED, )
        picked = (TestJob.STATE_SCHEDULED, self.device01)

        # Check that a health check will be scheduled before any jobs
        log = DummyLogger()
        schedule(log)
        self.device01.refresh_from_db()
        self.assertEqual(self.device01.state, Device.STATE_RESERVED)
        for job in jobs:
            self._check_job(job, *queued)

        hc_job = self.device01.current_job()
        self.assertEqual(hc_job.state, TestJob.STATE_SCHEDULED)
        hc_job.go_state_finished(TestJob.HEALTH_COMPLETE)
        hc_job.save()

        # Check that the next job is the highest priority
        schedule(log)
        self.device01.refresh_from_db()
        self.assertEqual(self.device01.state, Device.STATE_RESERVED)
        expectations = [queued, queued, picked, queued, queued]
        for job, expected in zip(jobs, expectations):
            self._check_job(job, *expected)
Exemplo n.º 10
0
    def test_health_frequency_jobs(self):
        """Check the job-count based health check frequency.

        With a frequency of 2 (per job denominator), the first two jobs must
        be scheduled on ``device03`` one after the other. The third
        scheduling pass must start a health check instead and leave the third
        job submitted.
        """
        def submit_job():
            return TestJob.objects.create(
                requested_device_type=self.device_type01,
                submitter=self.user,
                definition=_minimal_valid_job(None),
            )

        device_type = self.device_type01
        device_type.health_denominator = DeviceType.HEALTH_PER_JOB
        device_type.health_frequency = 2
        device_type.save()
        self.last_hc03.submit_time = timezone.now() - timedelta(hours=2)
        self.last_hc03.save()
        Device.get_health_check = _minimal_valid_job
        for device in (self.device01, self.device02, self.device03):
            self.assertNotEqual(device.get_health_check(), None)

        # Only device03 is available now
        self.device01.health = Device.HEALTH_BAD
        self.device01.save()
        self.device03.health = Device.HEALTH_GOOD
        self.device03.save()

        # Create the jobs, in the order they are expected to be handled
        j01 = submit_job()
        j02 = submit_job()
        j03 = submit_job()

        # The first two jobs run back to back on device03
        for job in (j01, j02):
            schedule(logging.getLogger())
            self.device03.refresh_from_db()
            job.refresh_from_db()
            self.assertEqual(job.state, TestJob.STATE_SCHEDULED)
            self.assertEqual(job.actual_device, self.device03)
            job.go_state_finished(TestJob.HEALTH_COMPLETE)
            job.start_time = timezone.now() - timedelta(hours=1)
            job.save()

        # The third pass must start a health check instead of the last job
        schedule(logging.getLogger())
        self.device03.refresh_from_db()
        j03.refresh_from_db()
        self.assertEqual(j03.state, TestJob.STATE_SUBMITTED)
        health_check = self.device03.current_job()
        self.assertTrue(health_check.health_check)
        self.assertEqual(health_check.state, TestJob.STATE_SCHEDULED)
Exemplo n.º 11
0
    def test_health_frequency_jobs(self):
        """Check the job-count based health check frequency.

        Three jobs are submitted with a health check frequency of 2 (per job
        denominator). Each of the first two scheduling passes must schedule
        exactly one job on ``device03``. The third pass must start a health
        check and leave one job submitted.
        """
        def run_scheduler():
            schedule(logging.getLogger(), [], ["worker-01", "worker-03"])
            self.device03.refresh_from_db()

        dt01 = self.device_type01
        dt01.health_denominator = DeviceType.HEALTH_PER_JOB
        dt01.health_frequency = 2
        dt01.save()
        self.last_hc03.submit_time = timezone.now() - timedelta(hours=2)
        self.last_hc03.save()
        Device.get_health_check = _minimal_valid_job
        for dev in (self.device01, self.device02, self.device03):
            self.assertNotEqual(dev.get_health_check(), None)

        # Only device03 is available now
        self.device01.health = Device.HEALTH_BAD
        self.device01.save()
        self.device03.health = Device.HEALTH_GOOD
        self.device03.save()

        # Create three jobs that should be scheduled with a healthcheck
        # preceding the last one
        for _ in range(3):
            TestJob.objects.create(
                requested_device_type=self.device_type01,
                submitter=self.user,
                definition=_minimal_valid_job(None),
            )

        # Two passes, each scheduling a single job that is then finished
        for _ in range(2):
            run_scheduler()
            scheduled = TestJob.objects.filter(state=TestJob.STATE_SCHEDULED)
            self.assertEqual(scheduled.count(), 1)
            job = scheduled[0]
            self.assertEqual(job.actual_device, self.device03)
            job.go_state_finished(TestJob.HEALTH_COMPLETE)
            job.start_time = timezone.now() - timedelta(hours=1)
            job.save()

        # Third pass: a health check is started, one job is still queued
        run_scheduler()
        waiting = TestJob.objects.filter(state=TestJob.STATE_SUBMITTED)
        self.assertEqual(waiting.count(), 1)
        running_hc = self.device03.current_job()
        self.assertTrue(running_hc.health_check)
        self.assertEqual(running_hc.state, TestJob.STATE_SCHEDULED)
Exemplo n.º 12
0
    def main_loop(self, options):
        """Poll the sockets and drive scheduling until a signal is received.

        Each iteration waits on the poller, then handles in order: the signal
        pipe (leaves the loop), the command socket, the event socket, the
        inotify descriptor (certificate reload), the dispatcher liveness
        check and finally job scheduling and cancellation.

        A full scheduling pass runs at most once per ``SCHEDULE_INTERVAL``
        seconds; between two passes only the pending "canceling" and
        "available_dt" events are processed.

        Database connection errors do not stop the loop: the connection is
        closed and the loop resumes after a short pause.

        :param options: mapping providing the 'slaves_certs' location used
            when the certificates are reloaded.
        """
        last_schedule = last_dispatcher_check = time.time()

        while True:
            try:
                try:
                    # Compute the timeout: wake up for whichever of the next
                    # scheduling pass or dispatcher check comes first
                    now = time.time()
                    timeout = min(
                        SCHEDULE_INTERVAL - (now - last_schedule),
                        PING_INTERVAL - (now - last_dispatcher_check))
                    # If some actions are remaining, decrease the timeout
                    if any(self.events.values()):
                        timeout = min(timeout, 2)
                    # Convert to milliseconds and wait at least for 1ms
                    timeout = max(timeout * 1000, 1)

                    # Wait for data or a timeout
                    sockets = dict(self.poller.poll(timeout))
                except zmq.error.ZMQError:
                    continue

                if sockets.get(self.pipe_r) == zmq.POLLIN:
                    self.logger.info("[POLL] Received a signal, leaving")
                    break

                # Command socket
                if sockets.get(self.controler) == zmq.POLLIN:
                    # Unqueue all pending messages
                    while self.controler_socket():
                        pass

                # Events socket
                if sockets.get(self.event_socket) == zmq.POLLIN:
                    # Unqueue all pending messages
                    while self.read_event_socket():
                        pass
                    # Wait for the next iteration to handle the event.
                    # In fact, the code that generated the event (lava-logs or
                    # lava-server-gunicorn) needs some time to commit the
                    # database transaction.
                    # If we are too fast, the database object won't be
                    # available (or in the right state) yet.
                    continue

                # Inotify socket
                if sockets.get(self.inotify_fd) == zmq.POLLIN:
                    # Read the pending inotify data; its content is not used
                    os.read(self.inotify_fd, 4096)
                    self.logger.debug("[AUTH] Reloading certificates from %s",
                                      options['slaves_certs'])
                    self.auth.configure_curve(domain='*',
                                              location=options['slaves_certs'])

                # Check dispatchers status
                now = time.time()
                if now - last_dispatcher_check > PING_INTERVAL:
                    for hostname, dispatcher in self.dispatchers.items():
                        silent_for = now - dispatcher.last_msg
                        if dispatcher.online and silent_for > DISPATCHER_TIMEOUT:
                            if hostname == "lava-logs":
                                self.logger.error(
                                    "[STATE] lava-logs goes OFFLINE")
                            else:
                                self.logger.error(
                                    "[STATE] Dispatcher <%s> goes OFFLINE",
                                    hostname)
                            dispatcher.go_offline()
                    last_dispatcher_check = now

                # Limit accesses to the database. This will also limit the rate of
                # CANCEL and START messages
                if time.time() - last_schedule > SCHEDULE_INTERVAL:
                    if self.dispatchers["lava-logs"].online:
                        schedule(self.logger)

                        # Dispatch scheduled jobs
                        with transaction.atomic():
                            self.start_jobs()
                    else:
                        self.logger.warning(
                            "lava-logs is offline: can't schedule jobs")

                    # Handle canceling jobs
                    with transaction.atomic():
                        self.cancel_jobs()

                    # Do not count the time taken to schedule jobs
                    last_schedule = time.time()
                else:
                    # Cancel the jobs and remove the jobs from the set
                    if self.events["canceling"]:
                        with transaction.atomic():
                            self.cancel_jobs(partial=True)
                        self.events["canceling"] = set()
                    # Schedule for available device-types
                    if self.events["available_dt"]:
                        jobs = schedule(self.logger,
                                        self.events["available_dt"])
                        self.events["available_dt"] = set()
                        # Dispatch scheduled jobs
                        with transaction.atomic():
                            self.start_jobs(jobs)

            except (OperationalError, InterfaceError):
                self.logger.info("[RESET] database connection reset.")
                # Closing the database connection will force Django to reopen
                # the connection
                connection.close()
                time.sleep(2)
Exemplo n.º 13
0
    def test_low_medium_high_without_hc(self):
        """Check the order in which jobs are scheduled without health checks.

        Six jobs are submitted with the priorities LOW, MEDIUM, HIGH, MEDIUM,
        LOW and 40. Six scheduling passes are run; each one must reserve
        ``device01`` and schedule exactly the expected job, in this order:
        HIGH, first MEDIUM, second MEDIUM, 40, first LOW, second LOW. Between
        two passes the scheduled job is finished.
        """
        # Disable health checks
        Device.get_health_check = lambda cls: None

        priorities = (
            TestJob.LOW,
            TestJob.MEDIUM,
            TestJob.HIGH,
            TestJob.MEDIUM,
            TestJob.LOW,
            40,
        )
        jobs = []
        for priority in priorities:
            jobs.append(
                TestJob.objects.create(
                    requested_device_type=self.device_type01,
                    submitter=self.user,
                    definition=_minimal_valid_job(None),
                    priority=priority,
                )
            )

        log = DummyLogger()
        # Index (in jobs) of the job expected to be picked by each pass
        expected_order = (2, 1, 3, 5, 0, 4)
        last_pass = len(expected_order) - 1
        finished = set()

        for pass_number, picked in enumerate(expected_order):
            schedule(log)
            self.device01.refresh_from_db()
            self.assertEqual(self.device01.state, Device.STATE_RESERVED)

            # Every job is checked, in submission order
            for index, job in enumerate(jobs):
                if index == picked:
                    self._check_job(job, TestJob.STATE_SCHEDULED,
                                    self.device01)
                elif index in finished:
                    self._check_job(job, TestJob.STATE_FINISHED,
                                    self.device01)
                else:
                    self._check_job(job, TestJob.STATE_SUBMITTED)

            if pass_number == last_pass:
                # The job picked by the final pass is left scheduled
                break

            current = jobs[picked]
            current.go_state_finished(TestJob.HEALTH_COMPLETE)
            current.save()
            self._check_job(current, TestJob.STATE_FINISHED, self.device01)
            finished.add(picked)
Exemplo n.º 14
0
    def main_loop(self, options):
        """Poll the sockets and drive scheduling until a signal is received.

        Each iteration waits on the poller, then handles in order: the signal
        pipe (leaves the loop), the command socket, the event socket, the
        inotify descriptor (certificate reload), the dispatcher liveness
        check and finally job scheduling and cancellation.

        A full scheduling pass runs at most once per ``SCHEDULE_INTERVAL``
        seconds; between two passes only the pending "canceling" events are
        processed.

        Database connection errors do not stop the loop: the connection is
        closed and the loop resumes after a short pause.

        :param options: mapping providing the 'slaves_certs' location; it is
            also passed as-is to ``start_jobs``.
        """
        last_schedule = last_dispatcher_check = time.time()

        while True:
            try:
                try:
                    # Compute the timeout: wake up for whichever of the next
                    # scheduling pass or dispatcher check comes first
                    now = time.time()
                    timeout = min(SCHEDULE_INTERVAL - (now - last_schedule),
                                  PING_INTERVAL - (now - last_dispatcher_check))
                    # If some actions are remaining, decrease the timeout
                    if self.events["canceling"]:
                        timeout = min(timeout, 1)
                    # Wait at least for 1ms
                    timeout = max(timeout * 1000, 1)

                    # Wait for data or a timeout
                    sockets = dict(self.poller.poll(timeout))
                except zmq.error.ZMQError:
                    # Polling failed: start a new iteration
                    continue

                # Readable signal pipe: stop the loop
                if sockets.get(self.pipe_r) == zmq.POLLIN:
                    self.logger.info("[POLL] Received a signal, leaving")
                    break

                # Command socket
                if sockets.get(self.controler) == zmq.POLLIN:
                    while self.controler_socket():  # Unqueue all pending messages
                        pass

                # Events socket
                if sockets.get(self.event_socket) == zmq.POLLIN:
                    while self.read_event_socket():  # Unqueue all pending messages
                        pass
                    # Wait for the next iteration to handle the event.
                    # In fact, the code that generated the event (lava-logs or
                    # lava-server-gunicorn) needs some time to commit the
                    # database transaction.
                    # If we are too fast, the database object won't be
                    # available (or in the right state) yet.
                    continue

                # Inotify socket
                if sockets.get(self.inotify_fd) == zmq.POLLIN:
                    # Read the pending inotify data; its content is not used
                    os.read(self.inotify_fd, 4096)
                    self.logger.debug("[AUTH] Reloading certificates from %s",
                                      options['slaves_certs'])
                    self.auth.configure_curve(domain='*', location=options['slaves_certs'])

                # Check dispatchers status
                now = time.time()
                if now - last_dispatcher_check > PING_INTERVAL:
                    for hostname, dispatcher in self.dispatchers.items():
                        # An online dispatcher that stayed silent for more
                        # than DISPATCHER_TIMEOUT is marked offline
                        if dispatcher.online and now - dispatcher.last_msg > DISPATCHER_TIMEOUT:
                            if hostname == "lava-logs":
                                self.logger.error("[STATE] lava-logs goes OFFLINE")
                            else:
                                self.logger.error("[STATE] Dispatcher <%s> goes OFFLINE", hostname)
                            self.dispatchers[hostname].go_offline()
                    last_dispatcher_check = now

                # Limit accesses to the database. This will also limit the rate of
                # CANCEL and START messages
                if time.time() - last_schedule > SCHEDULE_INTERVAL:
                    # Jobs are only scheduled and started while lava-logs is
                    # online
                    if self.dispatchers["lava-logs"].online:
                        schedule(self.logger)

                        # Dispatch scheduled jobs
                        with transaction.atomic():
                            self.start_jobs(options)
                    else:
                        self.logger.warning("lava-logs is offline: can't schedule jobs")

                    # Handle canceling jobs
                    self.cancel_jobs()

                    # Do not count the time taken to schedule jobs
                    last_schedule = time.time()
                else:
                    # Cancel the jobs and remove the jobs from the set
                    if self.events["canceling"]:
                        self.cancel_jobs(partial=True)
                        self.events["canceling"] = set()

            except (OperationalError, InterfaceError):
                self.logger.info("[RESET] database connection reset.")
                # Closing the database connection will force Django to reopen
                # the connection
                connection.close()
                # Pause before starting the next iteration
                time.sleep(2)
Exemplo n.º 15
0
    def test_low_medium_high_without_hc(self):
        """Check which job each scheduler pass picks when health checks are off.

        Six jobs are created for device_type01 with priorities LOW, MEDIUM,
        HIGH, MEDIUM, LOW and 40. After every call to ``schedule`` exactly one
        job is expected to be SCHEDULED on device01 while the device is
        RESERVED. The expected pick order is: the HIGH job, the two MEDIUM
        jobs in creation order, the priority-40 job, then the two LOW jobs in
        creation order. Each picked job (except the last) is finished before
        the next pass.
        """
        # Disable health checks
        # NOTE(review): this class attribute is not restored inside this test;
        # confirm the fixture teardown (or the other tests) account for that.
        Device.get_health_check = lambda cls: None

        jobs = [
            TestJob.objects.create(
                requested_device_type=self.device_type01,
                user=self.user,
                submitter=self.user,
                is_public=True,
                definition=_minimal_valid_job(None),
                priority=priority,
            )
            for priority in [TestJob.LOW, TestJob.MEDIUM, TestJob.HIGH,
                             TestJob.MEDIUM, TestJob.LOW, 40]
        ]

        log = DummyLogger()

        # Index into ``jobs`` of the job expected to be picked by each
        # successive scheduler pass.
        expected_order = [2, 1, 3, 5, 0, 4]
        final_step = len(expected_order) - 1

        for step, picked in enumerate(expected_order):
            schedule(log)
            self.device01.refresh_from_db()
            self.assertEqual(self.device01.state, Device.STATE_RESERVED)

            # Jobs picked by earlier passes have been finished already; every
            # job not yet picked must still be waiting in the queue.
            already_finished = set(expected_order[:step])
            for index, job in enumerate(jobs):
                if index == picked:
                    self._check_job(job, TestJob.STATE_SCHEDULED, self.device01)
                elif index in already_finished:
                    self._check_job(job, TestJob.STATE_FINISHED, self.device01)
                else:
                    self._check_job(job, TestJob.STATE_SUBMITTED)

            # The job picked by the very last pass is left in SCHEDULED state.
            if step == final_step:
                break

            # Finish the picked job so the following pass can choose another.
            jobs[picked].go_state_finished(TestJob.HEALTH_COMPLETE)
            jobs[picked].save()
            self._check_job(jobs[picked], TestJob.STATE_FINISHED, self.device01)