Пример #1
0
    def _checkJobs(self):
        """Send the worker heartbeat, then refresh the list of jobs.

        Records the master scheduler tick and pushes heartbeat data through
        a fresh ``WorkerData`` instance.  XML-RPC failures raised by the
        heartbeat are logged and otherwise ignored, so job polling still
        happens on this tick.

        Returns the result of chaining ``self._startJobs`` as callback and
        a catch-all errback onto ``self.source.getJobList()`` (presumably a
        Twisted Deferred -- confirm against the job source).
        """
        # NOTE: The heartbeat will reside here until the scheduler
        #       refactoring is finalized and a separate module for the
        #       worker specific daemon gets created.
        self.logger.debug("Worker heartbeat")
        worker = WorkerData()

        # Record the scheduler tick (timestamp).
        worker.record_master_scheduler_tick()

        try:
            worker.put_heartbeat_data()
        except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
            # Best effort: keep the failure reason in the log, but never let
            # a failed heartbeat prevent the job refresh below.
            worker.logger.error("Heartbeat update failed! %s", err)

        self.logger.debug("Refreshing jobs")
        return self.source.getJobList().addCallback(
            self._startJobs).addErrback(catchall_errback(self.logger))
Пример #2
0
    def _startJobs(self, jobs):
        """Send the worker heartbeat and start a runner for every job.

        :param jobs: iterable of job objects; each must expose ``id`` and is
            handed to a new ``JobRunner`` which is started immediately.
        """
        # Record the scheduler tick (timestamp).
        utils.record_scheduler_tick()

        # NOTE: The heartbeat will reside here until the scheduler
        #       refactoring is finalized and a separate module for the
        #       worker specific daemon gets created.
        worker = WorkerData()
        try:
            worker.put_heartbeat_data()
        except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
            # Best effort: log the reason and carry on starting jobs.
            worker.logger.error("Heartbeat update failed! %s", err)

        for job in jobs:
            new_job = JobRunner(self.source, job, self.dispatcher,
                                self.reactor, self.daemon_options)
            # Lazy logging arguments: formatting is left to the logger.
            self.logger.info("Starting Job: %d ", job.id)

            new_job.start()
Пример #3
0
    def _startJobs(self, jobs):
        """Send the worker heartbeat and start a runner for every job.

        :param jobs: iterable of job objects; each must expose ``id`` and is
            handed to a new ``JobRunner`` which is started immediately.
        """
        # Record the scheduler tick (timestamp).
        utils.record_scheduler_tick()

        # NOTE: The heartbeat will reside here until the scheduler
        #       refactoring is finalized and a separate module for the
        #       worker specific daemon gets created.
        worker = WorkerData()
        try:
            worker.put_heartbeat_data()
        except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
            # Best effort: log the reason and carry on starting jobs.
            worker.logger.error("Heartbeat update failed! %s", err)

        for job in jobs:
            new_job = JobRunner(self.source, job, self.dispatcher,
                                self.reactor, self.daemon_options)
            # Lazy logging arguments: formatting is left to the logger.
            self.logger.info("Starting Job: %d ", job.id)

            new_job.start()
    def handle(self, *args, **options):
        """Configure the scheduler daemon and run it under the reactor.

        Reads ``options['use_fake']`` and ``options['dispatcher']`` to pick
        the dispatcher executable, sends one complete worker heartbeat, then
        starts a ``JobQueue`` service and blocks in ``reactor.run()``.

        :param options: command options; ``use_fake`` selects the bundled
            ``fake-dispatcher`` located two directories above the
            ``lava_scheduler_app`` package file, otherwise ``dispatcher``
            is used as given.
        """
        # Imports are kept local to the command, as in the original code.
        import os

        from twisted.internet import reactor

        from lava_scheduler_daemon.service import JobQueue
        from lava_scheduler_daemon.worker import WorkerData
        from lava_scheduler_daemon.dbjobsource import DatabaseJobSource
        import xmlrpclib

        daemon_options = self._configure(options)

        source = DatabaseJobSource()

        if options['use_fake']:
            import lava_scheduler_app
            opd = os.path.dirname
            dispatcher = os.path.join(
                opd(opd(os.path.abspath(lava_scheduler_app.__file__))),
                'fake-dispatcher')
        else:
            dispatcher = options['dispatcher']

        # Update complete worker heartbeat data. This will be run once,
        # on every start/restart of the scheduler daemon.
        worker = WorkerData()
        try:
            worker.put_heartbeat_data(restart=True)
        except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
            # Best effort: record why it failed, but still start the daemon.
            worker.logger.error("Complete heartbeat update failed! %s", err)

        # Start scheduler service.
        service = JobQueue(source,
                           dispatcher,
                           reactor,
                           daemon_options=daemon_options)
        reactor.callWhenRunning(service.startService)
        reactor.run()
Пример #5
0
    def handle(self, *args, **options):
        """Configure the scheduler daemon and run it under the reactor.

        Reads ``options['use_fake']`` and ``options['dispatcher']`` to pick
        the dispatcher executable, sends one complete worker heartbeat, then
        starts a ``JobQueue`` service and blocks in ``reactor.run()``.

        :param options: command options; ``use_fake`` selects the bundled
            ``fake-dispatcher`` located two directories above the
            ``lava_scheduler_app`` package file, otherwise ``dispatcher``
            is used as given.
        """
        # Imports are kept local to the command, as in the original code.
        import os

        from twisted.internet import reactor

        from lava_scheduler_daemon.service import JobQueue
        from lava_scheduler_daemon.worker import WorkerData
        from lava_scheduler_daemon.dbjobsource import DatabaseJobSource
        import xmlrpclib

        daemon_options = self._configure(options)

        source = DatabaseJobSource()

        if options['use_fake']:
            import lava_scheduler_app
            opd = os.path.dirname
            dispatcher = os.path.join(
                opd(opd(os.path.abspath(lava_scheduler_app.__file__))),
                'fake-dispatcher')
        else:
            dispatcher = options['dispatcher']

        # Update complete worker heartbeat data. This will be run once,
        # on every start/restart of the scheduler daemon.
        worker = WorkerData()
        try:
            worker.put_heartbeat_data(restart=True)
        except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
            # Best effort: record why it failed, but still start the daemon.
            worker.logger.error("Complete heartbeat update failed! %s", err)

        # Start scheduler service.
        service = JobQueue(
            source, dispatcher, reactor, daemon_options=daemon_options)
        reactor.callWhenRunning(service.startService)
        reactor.run()
Пример #6
0
    def _checkJobs(self):
        """Send the worker heartbeat, then refresh the list of jobs.

        Records the master scheduler tick and pushes heartbeat data through
        a fresh ``WorkerData`` instance.  XML-RPC failures raised by the
        heartbeat are logged and otherwise ignored, so job polling still
        happens on this tick.

        Returns the result of chaining ``self._startJobs`` as callback and
        a catch-all errback onto ``self.source.getJobList()`` (presumably a
        Twisted Deferred -- confirm against the job source).
        """
        # NOTE: The heartbeat will reside here until the scheduler
        #       refactoring is finalized and a separate module for the
        #       worker specific daemon gets created.
        self.logger.debug("Worker heartbeat")
        worker = WorkerData()

        # Record the scheduler tick (timestamp).
        worker.record_master_scheduler_tick()

        try:
            worker.put_heartbeat_data()
        except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
            # Best effort: keep the failure reason in the log, but never let
            # a failed heartbeat prevent the job refresh below.
            worker.logger.error("Heartbeat update failed! %s", err)

        self.logger.debug("Refreshing jobs")
        return self.source.getJobList().addCallback(
            self._startJobs).addErrback(catchall_errback(self.logger))
Пример #7
0
    def jobCompleted_impl(self, job_id, board_name, exit_code, kill_reason):
        """Record the completion of job ``job_id`` on device ``board_name``.

        Derives the device's next status from its current one, maps the job
        status to its final value, stores health-check results, links the
        result bundle referenced in the job's output directory, saves device
        and job, and finally either sends summary mails (when
        ``utils.is_master()`` is true) or notifies through ``WorkerData``.

        :param job_id: id of the finished ``TestJob``; when falsy the call
            is logged and ignored.
        :param board_name: hostname used to look up the ``Device``.
        :param exit_code: ``0`` turns a RUNNING job into COMPLETE, any other
            value into INCOMPLETE.
        :param kill_reason: not used by this method.

        If the job is neither RUNNING nor CANCELING an error is logged and
        the method returns before anything is saved here.
        """
        if not job_id:
            self.logger.debug('job completion called without a job id on %s',
                              board_name)
            return
        job = TestJob.objects.get(id=job_id)

        self.logger.debug('marking job as complete on %s', board_name)
        device = Device.objects.get(hostname=board_name)
        old_device_status = device.status
        self.logger.debug('old device status %s, job state %s',
                          Device.STATUS_CHOICES[old_device_status][1],
                          TestJob.STATUS_CHOICES[job.status][1])

        # Every branch assigns a concrete status, so no None fallback is
        # required afterwards.
        if old_device_status == Device.RUNNING:
            new_device_status = Device.IDLE
        elif old_device_status == Device.OFFLINING:
            new_device_status = Device.OFFLINE
        elif old_device_status == Device.RESERVED:
            new_device_status = Device.IDLE
        else:
            self.logger.error("Unexpected device state in jobCompleted: %s",
                              device.status)
            new_device_status = Device.IDLE

        self.logger.debug('new device status %s, job state %s',
                          Device.STATUS_CHOICES[new_device_status][1],
                          TestJob.STATUS_CHOICES[job.status][1])

        # Temporary devices should be marked as RETIRED once the job is
        # complete or canceled.
        if job.is_vmgroup:
            try:
                if device.temporarydevice:
                    new_device_status = Device.RETIRED
                    device.current_job = None
            except TemporaryDevice.DoesNotExist:
                self.logger.debug("%s is not a tmp device", device.hostname)

        if job.status == TestJob.RUNNING:
            if exit_code == 0:
                job.status = TestJob.COMPLETE
            else:
                job.status = TestJob.INCOMPLETE
        elif job.status == TestJob.CANCELING:
            job.status = TestJob.CANCELED
        else:
            self.logger.error(
                "Unexpected job state in jobCompleted: %s, probably we are trying job completion for a different job",
                job.status)
            return

        self.logger.debug('changed job status to %s',
                          TestJob.STATUS_CHOICES[job.status][1])

        if job.health_check:
            device.last_health_report_job = job
            self.logger.debug("old device health status %s",
                              Device.HEALTH_CHOICES[device.health_status][1])
            if device.health_status != Device.HEALTH_LOOPING:
                if job.status == TestJob.INCOMPLETE:
                    device.health_status = Device.HEALTH_FAIL
                    self.logger.debug(
                        "taking %s offline, failed health check job %s",
                        device.hostname, job_id)
                    device.put_into_maintenance_mode(
                        None, "Health Check Job Failed")
                    # update the local variable to track the effect of the external function call
                    new_device_status = device.status
                    if new_device_status == Device.OFFLINING:
                        new_device_status = Device.OFFLINE  # offlining job is complete.
                elif job.status == TestJob.COMPLETE:
                    device.health_status = Device.HEALTH_PASS
                    if old_device_status == Device.RUNNING:
                        new_device_status = Device.IDLE
                device.save()
            self.logger.debug("new device health status %s",
                              Device.HEALTH_CHOICES[device.health_status][1])

        # A falsy output_dir (None or '') means there is nowhere to look
        # for the result bundle.
        if job.output_dir:
            bundle_file = os.path.join(job.output_dir, 'result-bundle')
            if os.path.exists(bundle_file):
                with open(bundle_file) as f:
                    results_link = f.read().strip()
                job._results_link = results_link
                # The last path component of the link is used as the sha1.
                sha1 = results_link.strip('/').split('/')[-1]
                try:
                    bundle = Bundle.objects.get(content_sha1=sha1)
                except Bundle.DoesNotExist:
                    pass
                else:
                    job._results_bundle = bundle
                    device.device_version = _get_device_version(
                        job.results_bundle)
        else:
            self.logger.warning("[%d] lacked a usable output_dir", job.id)

        self.logger.debug('new device status %s, job state %s',
                          Device.STATUS_CHOICES[new_device_status][1],
                          TestJob.STATUS_CHOICES[job.status][1])

        job.end_time = timezone.now()

        job.submit_token = None

        device.current_job = None

        msg = "Job %s completed" % job.display_id
        device.state_transition_to(new_device_status, message=msg, job=job)
        self._commit_transaction(src='%s state' % device.hostname)

        device.save()
        job.save()
        self._commit_transaction(src='jobCompleted_impl')
        self.logger.info('job %s completed on %s', job.id, device.hostname)

        if utils.is_master():
            try:
                job.send_summary_mails()
            except Exception:
                # Better to catch all ordinary exceptions here and log them
                # than have this method fail; SystemExit/KeyboardInterrupt
                # are deliberately left to propagate.
                self.logger.exception(
                    'sending job summary mails for job %r failed', job.pk)
        else:
            worker = WorkerData()
            worker.notify_on_incomplete(job.id)
Пример #8
0
    def jobCompleted_impl(self, board_name, exit_code, kill_reason):
        """Record the completion of the current job on ``board_name``.

        Looks up the device by hostname, takes its ``current_job``, derives
        the next device status, maps the job status to its final value,
        stores health-check results, links the result bundle referenced in
        the job's output directory, saves and commits device and job, then
        sends summary mails (when ``utils.is_master()`` is true) or notifies
        through ``WorkerData``.  The job's submit token, if any, is deleted
        last.

        :param board_name: hostname used to look up the ``Device``.
        :param exit_code: ``0`` turns a RUNNING job into COMPLETE, any other
            value into INCOMPLETE.
        :param kill_reason: not used by this method.
        """
        self.logger.debug('marking job as complete on %s', board_name)
        device = Device.objects.get(hostname=board_name)
        old_device_status = device.status
        new_device_status = None
        previous_state = device.previous_state()
        MAX_RETRIES = 3

        if old_device_status == Device.RUNNING:
            new_device_status = previous_state
        elif old_device_status == Device.OFFLINING:
            new_device_status = Device.OFFLINE
        elif old_device_status == Device.RESERVED:
            new_device_status = previous_state
        else:
            self.logger.error(
                "Unexpected device state in jobCompleted: %s", device.status)
            new_device_status = Device.IDLE
        # previous_state may have been None; fall back to IDLE then.
        if new_device_status is None:
            new_device_status = Device.IDLE
        job = device.current_job

        # Temporary devices should be marked as RETIRED once the job is
        # complete or canceled.
        if job.is_vmgroup:
            try:
                if device.temporarydevice:
                    new_device_status = Device.RETIRED
            except TemporaryDevice.DoesNotExist:
                self.logger.debug("%s is not a tmp device", device.hostname)

        # NOTE(review): this reads job.results_bundle before the bundle is
        # attached further down -- confirm that ordering is intended.
        device.device_version = _get_device_version(job.results_bundle)
        device.current_job = None
        if job.status == TestJob.RUNNING:
            if exit_code == 0:
                job.status = TestJob.COMPLETE
            else:
                job.status = TestJob.INCOMPLETE
        elif job.status == TestJob.CANCELING:
            job.status = TestJob.CANCELED
        else:
            self.logger.error(
                "Unexpected job state in jobCompleted: %s", job.status)
            job.status = TestJob.COMPLETE

        msg = "Job %s completed" % job.display_id
        device.state_transition_to(new_device_status, message=msg, job=job)

        if job.health_check:
            device.last_health_report_job = job
            if device.health_status != Device.HEALTH_LOOPING:
                if job.status == TestJob.INCOMPLETE:
                    device.health_status = Device.HEALTH_FAIL
                    device.put_into_maintenance_mode(None, "Health Check Job Failed")
                elif job.status == TestJob.COMPLETE:
                    device.health_status = Device.HEALTH_PASS

        bundle_file = os.path.join(job.output_dir, 'result-bundle')
        if os.path.exists(bundle_file):
            with open(bundle_file) as f:
                results_link = f.read().strip()
            job._results_link = results_link
            # The last path component of the link is used as the sha1.
            sha1 = results_link.strip('/').split('/')[-1]
            try:
                bundle = Bundle.objects.get(content_sha1=sha1)
            except Bundle.DoesNotExist:
                pass
            else:
                job._results_bundle = bundle

        job.end_time = datetime.datetime.utcnow()
        # Keep a handle on the token: it is detached from the job now but
        # only deleted after the notifications below.
        token = job.submit_token
        job.submit_token = None
        device.save()
        job.save()
        # notification needs to have the correct status in the database
        for _attempt in range(MAX_RETRIES):
            try:
                transaction.commit()
            except TransactionRollbackError as err:
                self.logger.warning('Retrying %s job completion ... %s',
                                    job.id, err)
            else:
                self.logger.debug('%s job completed and status saved', job.id)
                break
        else:
            # All attempts were rolled back; make that visible instead of
            # silently carrying on.
            self.logger.error(
                '%s job completion could not be committed after %d attempts',
                job.id, MAX_RETRIES)
        if utils.is_master():
            try:
                job.send_summary_mails()
            except Exception:
                # Better to catch all ordinary exceptions here and log them
                # than have this method fail; SystemExit/KeyboardInterrupt
                # are deliberately left to propagate.
                self.logger.exception(
                    'sending job summary mails for job %r failed', job.pk)
        else:
            worker = WorkerData()
            worker.notify_on_incomplete(job.id)
        # need the token for the XMLRPC; a job without a submit token has
        # nothing to delete.
        if token is not None:
            token.delete()
Пример #9
0
    def jobCompleted_impl(self, job_id, board_name, exit_code, kill_reason):
        """Record the completion of job ``job_id`` on device ``board_name``.

        Derives the device's next status from its current one, maps the job
        status to its final value, stores health-check results, links the
        result bundle referenced in the job's output directory, saves device
        and job, and finally either sends summary mails (when
        ``utils.is_master()`` is true) or notifies through ``WorkerData``.

        :param job_id: id of the finished ``TestJob``; when falsy the call
            is logged and ignored.
        :param board_name: hostname used to look up the ``Device``.
        :param exit_code: ``0`` turns a RUNNING job into COMPLETE, any other
            value into INCOMPLETE.
        :param kill_reason: not used by this method.

        If the job is neither RUNNING nor CANCELING an error is logged and
        the method returns before anything is saved here.
        """
        if not job_id:
            self.logger.debug('job completion called without a job id on %s',
                              board_name)
            return
        job = TestJob.objects.get(id=job_id)

        self.logger.debug('marking job as complete on %s', board_name)
        device = Device.objects.get(hostname=board_name)
        old_device_status = device.status
        self.logger.debug(
            'old device status %s, job state %s',
            Device.STATUS_CHOICES[old_device_status][1],
            TestJob.STATUS_CHOICES[job.status][1])

        # Every branch assigns a concrete status, so no None fallback is
        # required afterwards.
        if old_device_status == Device.RUNNING:
            new_device_status = Device.IDLE
        elif old_device_status == Device.OFFLINING:
            new_device_status = Device.OFFLINE
        elif old_device_status == Device.RESERVED:
            new_device_status = Device.IDLE
        else:
            self.logger.error(
                "Unexpected device state in jobCompleted: %s", device.status)
            new_device_status = Device.IDLE

        self.logger.debug(
            'new device status %s, job state %s',
            Device.STATUS_CHOICES[new_device_status][1],
            TestJob.STATUS_CHOICES[job.status][1])

        # Temporary devices should be marked as RETIRED once the job is
        # complete or canceled.
        if job.is_vmgroup:
            try:
                if device.temporarydevice:
                    new_device_status = Device.RETIRED
                    device.current_job = None
            except TemporaryDevice.DoesNotExist:
                self.logger.debug("%s is not a tmp device", device.hostname)

        if job.status == TestJob.RUNNING:
            if exit_code == 0:
                job.status = TestJob.COMPLETE
            else:
                job.status = TestJob.INCOMPLETE
        elif job.status == TestJob.CANCELING:
            job.status = TestJob.CANCELED
        else:
            self.logger.error("Unexpected job state in jobCompleted: %s, probably we are trying job completion for a different job", job.status)
            return

        self.logger.debug(
            'changed job status to %s', TestJob.STATUS_CHOICES[job.status][1])

        if job.health_check:
            device.last_health_report_job = job
            self.logger.debug("old device health status %s", Device.HEALTH_CHOICES[device.health_status][1])
            if device.health_status != Device.HEALTH_LOOPING:
                if job.status == TestJob.INCOMPLETE:
                    device.health_status = Device.HEALTH_FAIL
                    self.logger.debug(
                        "taking %s offline, failed health check job %s",
                        device.hostname, job_id)
                    device.put_into_maintenance_mode(None, "Health Check Job Failed")
                    # update the local variable to track the effect of the external function call
                    new_device_status = device.status
                    if new_device_status == Device.OFFLINING:
                        new_device_status = Device.OFFLINE  # offlining job is complete.
                elif job.status == TestJob.COMPLETE:
                    device.health_status = Device.HEALTH_PASS
                    if old_device_status == Device.RUNNING:
                        new_device_status = Device.IDLE
                device.save()
            self.logger.debug("new device health status %s", Device.HEALTH_CHOICES[device.health_status][1])

        # A falsy output_dir (None or '') means there is nowhere to look
        # for the result bundle.
        if job.output_dir:
            bundle_file = os.path.join(job.output_dir, 'result-bundle')
            if os.path.exists(bundle_file):
                with open(bundle_file) as f:
                    results_link = f.read().strip()
                job._results_link = results_link
                # The last path component of the link is used as the sha1.
                sha1 = results_link.strip('/').split('/')[-1]
                try:
                    bundle = Bundle.objects.get(content_sha1=sha1)
                except Bundle.DoesNotExist:
                    pass
                else:
                    job._results_bundle = bundle
                    device.device_version = _get_device_version(job.results_bundle)
        else:
            self.logger.warning("[%d] lacked a usable output_dir", job.id)

        self.logger.debug(
            'new device status %s, job state %s',
            Device.STATUS_CHOICES[new_device_status][1],
            TestJob.STATUS_CHOICES[job.status][1])

        job.end_time = timezone.now()

        job.submit_token = None

        device.current_job = None

        msg = "Job %s completed" % job.display_id
        device.state_transition_to(new_device_status, message=msg, job=job)
        self._commit_transaction(src='%s state' % device.hostname)

        device.save()
        job.save()
        self._commit_transaction(src='jobCompleted_impl')
        self.logger.info('job %s completed on %s', job.id, device.hostname)

        if utils.is_master():
            try:
                job.send_summary_mails()
            except Exception:
                # Better to catch all ordinary exceptions here and log them
                # than have this method fail; SystemExit/KeyboardInterrupt
                # are deliberately left to propagate.
                self.logger.exception(
                    'sending job summary mails for job %r failed', job.pk)
        else:
            worker = WorkerData()
            worker.notify_on_incomplete(job.id)
Пример #10
0
    def jobCompleted_impl(self, board_name, exit_code, kill_reason):
        """Record the completion of the current job on ``board_name``.

        Looks up the device by hostname, takes its ``current_job``, derives
        the next device status, maps the job status to its final value,
        stores health-check results, links the result bundle referenced in
        the job's output directory, saves and commits device and job, then
        sends summary mails (when ``utils.is_master()`` is true) or notifies
        through ``WorkerData``.  The job's submit token, if any, is deleted
        last.

        :param board_name: hostname used to look up the ``Device``.
        :param exit_code: ``0`` turns a RUNNING job into COMPLETE, any other
            value into INCOMPLETE.
        :param kill_reason: not used by this method.
        """
        self.logger.debug("marking job as complete on %s", board_name)
        device = Device.objects.get(hostname=board_name)
        old_device_status = device.status
        new_device_status = None
        previous_state = device.previous_state()
        MAX_RETRIES = 3

        if old_device_status == Device.RUNNING:
            new_device_status = previous_state
        elif old_device_status == Device.OFFLINING:
            new_device_status = Device.OFFLINE
        elif old_device_status == Device.RESERVED:
            new_device_status = previous_state
        else:
            self.logger.error("Unexpected device state in jobCompleted: %s", device.status)
            new_device_status = Device.IDLE
        # previous_state may have been None; fall back to IDLE then.
        if new_device_status is None:
            new_device_status = Device.IDLE
        job = device.current_job

        # Temporary devices should be marked as RETIRED once the job is
        # complete or canceled.
        if job.is_vmgroup:
            try:
                if device.temporarydevice:
                    new_device_status = Device.RETIRED
            except TemporaryDevice.DoesNotExist:
                self.logger.debug("%s is not a tmp device", device.hostname)

        # NOTE(review): this reads job.results_bundle before the bundle is
        # attached further down -- confirm that ordering is intended.
        device.device_version = _get_device_version(job.results_bundle)
        device.current_job = None
        if job.status == TestJob.RUNNING:
            if exit_code == 0:
                job.status = TestJob.COMPLETE
            else:
                job.status = TestJob.INCOMPLETE
        elif job.status == TestJob.CANCELING:
            job.status = TestJob.CANCELED
        else:
            self.logger.error("Unexpected job state in jobCompleted: %s", job.status)
            job.status = TestJob.COMPLETE

        msg = "Job %s completed" % job.display_id
        device.state_transition_to(new_device_status, message=msg, job=job)

        if job.health_check:
            device.last_health_report_job = job
            if device.health_status != Device.HEALTH_LOOPING:
                if job.status == TestJob.INCOMPLETE:
                    device.health_status = Device.HEALTH_FAIL
                    device.put_into_maintenance_mode(None, "Health Check Job Failed")
                elif job.status == TestJob.COMPLETE:
                    device.health_status = Device.HEALTH_PASS

        bundle_file = os.path.join(job.output_dir, "result-bundle")
        if os.path.exists(bundle_file):
            with open(bundle_file) as f:
                results_link = f.read().strip()
            job._results_link = results_link
            # The last path component of the link is used as the sha1.
            sha1 = results_link.strip("/").split("/")[-1]
            try:
                bundle = Bundle.objects.get(content_sha1=sha1)
            except Bundle.DoesNotExist:
                pass
            else:
                job._results_bundle = bundle

        job.end_time = datetime.datetime.utcnow()
        # Keep a handle on the token: it is detached from the job now but
        # only deleted after the notifications below.
        token = job.submit_token
        job.submit_token = None
        device.save()
        job.save()
        # notification needs to have the correct status in the database
        for _attempt in range(MAX_RETRIES):
            try:
                transaction.commit()
            except TransactionRollbackError as err:
                self.logger.warning("Retrying %s job completion ... %s", job.id, err)
            else:
                self.logger.debug("%s job completed and status saved", job.id)
                break
        else:
            # All attempts were rolled back; make that visible instead of
            # silently carrying on.
            self.logger.error(
                "%s job completion could not be committed after %d attempts",
                job.id, MAX_RETRIES)
        if utils.is_master():
            try:
                job.send_summary_mails()
            except Exception:
                # Better to catch all ordinary exceptions here and log them
                # than have this method fail; SystemExit/KeyboardInterrupt
                # are deliberately left to propagate.
                self.logger.exception("sending job summary mails for job %r failed", job.pk)
        else:
            worker = WorkerData()
            worker.notify_on_incomplete(job.id)
        # need the token for the XMLRPC; a job without a submit token has
        # nothing to delete.
        if token is not None:
            token.delete()