def _checkJobs(self):
    """Send the worker heartbeat, then fetch the job list and start jobs.

    Records the master scheduler tick and pushes heartbeat data through
    ``WorkerData`` before asking ``self.source`` for its job list.  A
    heartbeat that fails with an XML-RPC fault or protocol error is logged
    and otherwise ignored, so job polling still takes place.

    Returns the result of the ``getJobList()`` call chain, with
    ``self._startJobs`` attached as callback and ``catchall_errback`` as
    errback.
    """
    # Update Worker Heartbeat
    #
    # NOTE: This will reside here until we finalize the scheduler
    # refactoring and a separate module for the worker specific daemon
    # gets created.
    self.logger.debug("Worker heartbeat")
    worker = WorkerData()

    # Record the scheduler tick (timestamp).
    worker.record_master_scheduler_tick()

    try:
        worker.put_heartbeat_data()
    except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
        # Best effort: keep polling for jobs, but record why the
        # heartbeat could not be delivered.
        worker.logger.error("Heartbeat update failed! %s", err)

    self.logger.debug("Refreshing jobs")
    return self.source.getJobList().addCallback(
        self._startJobs).addErrback(catchall_errback(self.logger))
def _startJobs(self, jobs):
    """Record a scheduler tick, send a heartbeat and start every job.

    :param jobs: iterable of job objects; each must expose ``id`` and is
        handed to a new ``JobRunner`` together with the source, dispatcher,
        reactor and daemon options held on ``self``.

    A heartbeat that fails with an XML-RPC fault or protocol error is
    logged and otherwise ignored, so the jobs are started regardless.
    """
    # Record the scheduler tick (timestamp).
    utils.record_scheduler_tick()

    # Update Worker Heartbeat
    #
    # NOTE: This will reside here until we finalize the scheduler
    # refactoring and a separate module for the worker specific daemon
    # gets created.
    worker = WorkerData()
    try:
        worker.put_heartbeat_data()
    except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
        # Best effort: starting jobs matters more than the heartbeat, but
        # keep the failure cause in the log.
        worker.logger.error("Heartbeat update failed! %s", err)

    for job in jobs:
        new_job = JobRunner(self.source, job, self.dispatcher,
                            self.reactor, self.daemon_options)
        # Lazy logging arguments; the rendered message is unchanged.
        self.logger.info("Starting Job: %d ", job.id)
        new_job.start()
def handle(self, *args, **options):
    """Configure and run the scheduler daemon until the reactor stops.

    :param args: positional command arguments; not used here.
    :param options: command options.  ``use_fake`` selects the
        ``fake-dispatcher`` located two directories above the
        ``lava_scheduler_app`` package file; otherwise ``dispatcher`` is
        used as given.  The whole mapping is also passed to
        ``self._configure``.

    A complete worker heartbeat is sent once before the service starts;
    an XML-RPC fault or protocol error there is logged and does not
    prevent the daemon from starting.
    """
    # Imports stay function-local, in their original order.
    import os
    from twisted.internet import reactor
    from lava_scheduler_daemon.service import JobQueue
    from lava_scheduler_daemon.worker import WorkerData
    from lava_scheduler_daemon.dbjobsource import DatabaseJobSource
    import xmlrpclib

    daemon_options = self._configure(options)
    source = DatabaseJobSource()

    if options['use_fake']:
        import lava_scheduler_app
        opd = os.path.dirname
        dispatcher = os.path.join(
            opd(opd(os.path.abspath(lava_scheduler_app.__file__))),
            'fake-dispatcher')
    else:
        dispatcher = options['dispatcher']

    # Update complete worker heartbeat data. This will be run once,
    # on every start/restart of the scheduler daemon.
    worker = WorkerData()
    try:
        worker.put_heartbeat_data(restart=True)
    except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
        # Best effort: the daemon still starts, but the failure cause is
        # kept in the log instead of being discarded.
        worker.logger.error("Complete heartbeat update failed! %s", err)

    # Start scheduler service.
    service = JobQueue(source, dispatcher, reactor,
                       daemon_options=daemon_options)
    reactor.callWhenRunning(service.startService)
    reactor.run()
def handle(self, *args, **options):
    """Configure and run the scheduler daemon until the reactor stops.

    :param args: positional command arguments; not used here.
    :param options: command options.  ``use_fake`` selects the
        ``fake-dispatcher`` located two directories above the
        ``lava_scheduler_app`` package file; otherwise ``dispatcher`` is
        used as given.  The whole mapping is also passed to
        ``self._configure``.

    A complete worker heartbeat is sent once before the service starts;
    an XML-RPC fault or protocol error there is logged and does not
    prevent the daemon from starting.
    """
    # Imports stay function-local, in their original order.
    import os
    from twisted.internet import reactor
    from lava_scheduler_daemon.service import JobQueue
    from lava_scheduler_daemon.worker import WorkerData
    from lava_scheduler_daemon.dbjobsource import DatabaseJobSource
    import xmlrpclib

    daemon_options = self._configure(options)
    source = DatabaseJobSource()

    if options['use_fake']:
        import lava_scheduler_app
        opd = os.path.dirname
        dispatcher = os.path.join(
            opd(opd(os.path.abspath(lava_scheduler_app.__file__))),
            'fake-dispatcher')
    else:
        dispatcher = options['dispatcher']

    # Update complete worker heartbeat data. This will be run once,
    # on every start/restart of the scheduler daemon.
    worker = WorkerData()
    try:
        worker.put_heartbeat_data(restart=True)
    except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
        # Best effort: the daemon still starts, but the failure cause is
        # kept in the log instead of being discarded.
        worker.logger.error("Complete heartbeat update failed! %s", err)

    # Start scheduler service.
    service = JobQueue(
        source, dispatcher, reactor, daemon_options=daemon_options)
    reactor.callWhenRunning(service.startService)
    reactor.run()
def jobCompleted_impl(self, job_id, board_name, exit_code, kill_reason):
    """Mark a job as finished and move its device to the matching state.

    Looks up the TestJob and the Device, derives the device's next status
    from its current one, maps the job status (RUNNING becomes COMPLETE or
    INCOMPLETE depending on ``exit_code``, CANCELING becomes CANCELED),
    updates the health-check bookkeeping, attaches the bundle referenced
    by ``<output_dir>/result-bundle`` when that file exists, saves both
    objects and finally sends the summary mails (when
    ``utils.is_master()`` is true) or notifies through ``WorkerData``.

    Returns early, before anything is saved, when ``job_id`` is falsy or
    when the job is in neither RUNNING nor CANCELING state.

    :param job_id: id of the TestJob that finished.
    :param board_name: hostname of the Device the job ran on.
    :param exit_code: 0 marks a RUNNING job COMPLETE, anything else
        INCOMPLETE.
    :param kill_reason: not referenced by this method.
    """
    if not job_id:
        self.logger.debug('job completion called without a job id on %s',
                          board_name)
        return
    job = TestJob.objects.get(id=job_id)

    self.logger.debug('marking job as complete on %s', board_name)
    device = Device.objects.get(hostname=board_name)
    old_device_status = device.status
    self.logger.debug('old device status %s, job state %s' % (
        Device.STATUS_CHOICES[old_device_status][1],
        TestJob.STATUS_CHOICES[job.status][1]))

    # Every branch assigns a concrete status, so no None fallback is
    # needed afterwards.
    if old_device_status == Device.RUNNING:
        new_device_status = Device.IDLE
    elif old_device_status == Device.OFFLINING:
        new_device_status = Device.OFFLINE
    elif old_device_status == Device.RESERVED:
        new_device_status = Device.IDLE
    else:
        self.logger.error(
            "Unexpected device state in jobCompleted: %s", device.status)
        new_device_status = Device.IDLE

    self.logger.debug('new device status %s, job state %s' % (
        Device.STATUS_CHOICES[new_device_status][1],
        TestJob.STATUS_CHOICES[job.status][1]))

    # Temporary devices should be marked as RETIRED once the job is
    # complete or canceled.
    if job.is_vmgroup:
        try:
            if device.temporarydevice:
                new_device_status = Device.RETIRED
                device.current_job = None
        except TemporaryDevice.DoesNotExist:
            self.logger.debug("%s is not a tmp device", device.hostname)

    if job.status == TestJob.RUNNING:
        if exit_code == 0:
            job.status = TestJob.COMPLETE
        else:
            job.status = TestJob.INCOMPLETE
    elif job.status == TestJob.CANCELING:
        job.status = TestJob.CANCELED
    else:
        self.logger.error(
            "Unexpected job state in jobCompleted: %s, probably we are trying job completion for a different job",
            job.status)
        return

    self.logger.debug('changed job status to %s' % (
        TestJob.STATUS_CHOICES[job.status][1]))

    if job.health_check:
        device.last_health_report_job = job
        self.logger.debug("old device health status %s" %
                          Device.HEALTH_CHOICES[device.health_status][1])
        if device.health_status != Device.HEALTH_LOOPING:
            if job.status == TestJob.INCOMPLETE:
                device.health_status = Device.HEALTH_FAIL
                self.logger.debug(
                    "taking %s offline, failed health check job %s" % (
                        device.hostname, job_id))
                device.put_into_maintenance_mode(
                    None, "Health Check Job Failed")
                # update the local variable to track the effect of the
                # external function call
                new_device_status = device.status
                if new_device_status == Device.OFFLINING:
                    # offlining job is complete.
                    new_device_status = Device.OFFLINE
            elif job.status == TestJob.COMPLETE:
                device.health_status = Device.HEALTH_PASS
                if old_device_status == Device.RUNNING:
                    new_device_status = Device.IDLE
            device.save()
        self.logger.debug("new device health status %s" %
                          Device.HEALTH_CHOICES[device.health_status][1])

    # A truthy output_dir is necessarily a non-empty one.
    if job.output_dir:
        bundle_file = os.path.join(job.output_dir, 'result-bundle')
        if os.path.exists(bundle_file):
            with open(bundle_file) as f:
                results_link = f.read().strip()
            job._results_link = results_link
            # The last path component of the link is used as the
            # bundle's content_sha1.
            sha1 = results_link.strip('/').split('/')[-1]
            try:
                bundle = Bundle.objects.get(content_sha1=sha1)
            except Bundle.DoesNotExist:
                pass
            else:
                job._results_bundle = bundle
                device.device_version = _get_device_version(
                    job.results_bundle)
    else:
        self.logger.warning("[%d] lacked a usable output_dir", job.id)

    self.logger.debug('new device status %s, job state %s' % (
        Device.STATUS_CHOICES[new_device_status][1],
        TestJob.STATUS_CHOICES[job.status][1]))

    job.end_time = timezone.now()
    job.submit_token = None
    device.current_job = None

    msg = "Job %s completed" % job.display_id
    device.state_transition_to(new_device_status, message=msg, job=job)
    self._commit_transaction(src='%s state' % device.hostname)

    device.save()
    job.save()
    self._commit_transaction(src='jobCompleted_impl')
    self.logger.info('job %s completed on %s', job.id, device.hostname)

    if utils.is_master():
        try:
            job.send_summary_mails()
        except Exception:
            # Better to catch all ordinary exceptions here and log them
            # than have this method fail; SystemExit and
            # KeyboardInterrupt are deliberately not swallowed.
            self.logger.exception(
                'sending job summary mails for job %r failed', job.pk)
    else:
        worker = WorkerData()
        worker.notify_on_incomplete(job.id)
def jobCompleted_impl(self, board_name, exit_code, kill_reason):
    """Finish the current job of ``board_name`` and release the device.

    Derives the device's next status from its current one (falling back
    to ``device.previous_state()`` or IDLE), maps the job status (RUNNING
    becomes COMPLETE or INCOMPLETE depending on ``exit_code``, CANCELING
    becomes CANCELED, anything else is logged and forced to COMPLETE),
    records health-check results, attaches the bundle referenced by
    ``<output_dir>/result-bundle`` when that file exists, saves and
    commits, then sends the summary mails (when ``utils.is_master()`` is
    true) or notifies through ``WorkerData``.  The job's submit token is
    deleted last.

    Returns early, without modifying anything, when the device has no
    current job.

    :param board_name: hostname of the Device whose job finished.
    :param exit_code: 0 marks a RUNNING job COMPLETE, anything else
        INCOMPLETE.
    :param kill_reason: not referenced by this method.
    """
    MAX_RETRIES = 3

    self.logger.debug('marking job as complete on %s', board_name)
    device = Device.objects.get(hostname=board_name)
    old_device_status = device.status
    previous_state = device.previous_state()

    if old_device_status == Device.RUNNING:
        new_device_status = previous_state
    elif old_device_status == Device.OFFLINING:
        new_device_status = Device.OFFLINE
    elif old_device_status == Device.RESERVED:
        new_device_status = previous_state
    else:
        self.logger.error(
            "Unexpected device state in jobCompleted: %s" % device.status)
        new_device_status = Device.IDLE
    # previous_state may be None; fall back to IDLE in that case.
    if new_device_status is None:
        new_device_status = Device.IDLE

    job = device.current_job
    if job is None:
        # Nothing to complete; avoid an AttributeError further down.
        self.logger.error(
            'job completion called without a current job on %s',
            board_name)
        return

    # Temporary devices should be marked as RETIRED once the job is
    # complete or canceled.
    if job.is_vmgroup:
        try:
            if device.temporarydevice:
                new_device_status = Device.RETIRED
        except TemporaryDevice.DoesNotExist:
            self.logger.debug("%s is not a tmp device" % device.hostname)

    device.device_version = _get_device_version(job.results_bundle)
    device.current_job = None

    if job.status == TestJob.RUNNING:
        if exit_code == 0:
            job.status = TestJob.COMPLETE
        else:
            job.status = TestJob.INCOMPLETE
    elif job.status == TestJob.CANCELING:
        job.status = TestJob.CANCELED
    else:
        self.logger.error(
            "Unexpected job state in jobCompleted: %s" % job.status)
        job.status = TestJob.COMPLETE

    msg = "Job %s completed" % job.display_id
    device.state_transition_to(new_device_status, message=msg, job=job)

    if job.health_check:
        device.last_health_report_job = job
        if device.health_status != Device.HEALTH_LOOPING:
            if job.status == TestJob.INCOMPLETE:
                device.health_status = Device.HEALTH_FAIL
                device.put_into_maintenance_mode(
                    None, "Health Check Job Failed")
            elif job.status == TestJob.COMPLETE:
                device.health_status = Device.HEALTH_PASS

    bundle_file = os.path.join(job.output_dir, 'result-bundle')
    if os.path.exists(bundle_file):
        with open(bundle_file) as f:
            results_link = f.read().strip()
        job._results_link = results_link
        # The last path component of the link is used as the bundle's
        # content_sha1.
        sha1 = results_link.strip('/').split('/')[-1]
        try:
            bundle = Bundle.objects.get(content_sha1=sha1)
        except Bundle.DoesNotExist:
            pass
        else:
            job._results_bundle = bundle

    job.end_time = datetime.datetime.utcnow()
    # Keep a reference so the token can be deleted after the notification.
    token = job.submit_token
    job.submit_token = None
    device.save()
    job.save()

    # notification needs to have the correct status in the database
    for _ in range(MAX_RETRIES):
        try:
            transaction.commit()
        except TransactionRollbackError as err:
            self.logger.warning(
                'Retrying %s job completion ... %s' % (job.id, err))
        else:
            self.logger.debug(
                '%s job completed and status saved' % job.id)
            break
    else:
        # Every attempt was rolled back; make that visible in the log.
        self.logger.error(
            '%s job completion could not be committed after %d attempts',
            job.id, MAX_RETRIES)

    if utils.is_master():
        try:
            job.send_summary_mails()
        except Exception:
            # Better to catch all ordinary exceptions here and log them
            # than have this method fail; SystemExit and
            # KeyboardInterrupt are deliberately not swallowed.
            self.logger.exception(
                'sending job summary mails for job %r failed', job.pk)
    else:
        worker = WorkerData()
        worker.notify_on_incomplete(job.id)

    # need the token for the XMLRPC, so it is only removed now; a job
    # without a submit token has nothing to delete.
    if token is not None:
        token.delete()
def jobCompleted_impl(self, job_id, board_name, exit_code, kill_reason):
    """Mark a job as finished and move its device to the matching state.

    Looks up the TestJob and the Device, derives the device's next status
    from its current one, maps the job status (RUNNING becomes COMPLETE or
    INCOMPLETE depending on ``exit_code``, CANCELING becomes CANCELED),
    updates the health-check bookkeeping, attaches the bundle referenced
    by ``<output_dir>/result-bundle`` when that file exists, saves both
    objects and finally sends the summary mails (when
    ``utils.is_master()`` is true) or notifies through ``WorkerData``.

    Returns early, before anything is saved, when ``job_id`` is falsy or
    when the job is in neither RUNNING nor CANCELING state.

    :param job_id: id of the TestJob that finished.
    :param board_name: hostname of the Device the job ran on.
    :param exit_code: 0 marks a RUNNING job COMPLETE, anything else
        INCOMPLETE.
    :param kill_reason: not referenced by this method.
    """
    if not job_id:
        self.logger.debug('job completion called without a job id on %s',
                          board_name)
        return
    job = TestJob.objects.get(id=job_id)

    self.logger.debug('marking job as complete on %s', board_name)
    device = Device.objects.get(hostname=board_name)
    old_device_status = device.status
    self.logger.debug('old device status %s, job state %s' % (
        Device.STATUS_CHOICES[old_device_status][1],
        TestJob.STATUS_CHOICES[job.status][1]))

    # Every branch assigns a concrete status, so no None fallback is
    # needed afterwards.
    if old_device_status == Device.RUNNING:
        new_device_status = Device.IDLE
    elif old_device_status == Device.OFFLINING:
        new_device_status = Device.OFFLINE
    elif old_device_status == Device.RESERVED:
        new_device_status = Device.IDLE
    else:
        self.logger.error(
            "Unexpected device state in jobCompleted: %s", device.status)
        new_device_status = Device.IDLE

    self.logger.debug('new device status %s, job state %s' % (
        Device.STATUS_CHOICES[new_device_status][1],
        TestJob.STATUS_CHOICES[job.status][1]))

    # Temporary devices should be marked as RETIRED once the job is
    # complete or canceled.
    if job.is_vmgroup:
        try:
            if device.temporarydevice:
                new_device_status = Device.RETIRED
                device.current_job = None
        except TemporaryDevice.DoesNotExist:
            self.logger.debug("%s is not a tmp device", device.hostname)

    if job.status == TestJob.RUNNING:
        if exit_code == 0:
            job.status = TestJob.COMPLETE
        else:
            job.status = TestJob.INCOMPLETE
    elif job.status == TestJob.CANCELING:
        job.status = TestJob.CANCELED
    else:
        self.logger.error("Unexpected job state in jobCompleted: %s, probably we are trying job completion for a different job", job.status)
        return

    self.logger.debug('changed job status to %s' % (
        TestJob.STATUS_CHOICES[job.status][1]))

    if job.health_check:
        device.last_health_report_job = job
        self.logger.debug("old device health status %s" %
                          Device.HEALTH_CHOICES[device.health_status][1])
        if device.health_status != Device.HEALTH_LOOPING:
            if job.status == TestJob.INCOMPLETE:
                device.health_status = Device.HEALTH_FAIL
                self.logger.debug(
                    "taking %s offline, failed health check job %s" % (
                        device.hostname, job_id))
                device.put_into_maintenance_mode(
                    None, "Health Check Job Failed")
                # update the local variable to track the effect of the
                # external function call
                new_device_status = device.status
                if new_device_status == Device.OFFLINING:
                    # offlining job is complete.
                    new_device_status = Device.OFFLINE
            elif job.status == TestJob.COMPLETE:
                device.health_status = Device.HEALTH_PASS
                if old_device_status == Device.RUNNING:
                    new_device_status = Device.IDLE
            device.save()
        self.logger.debug("new device health status %s" %
                          Device.HEALTH_CHOICES[device.health_status][1])

    # A truthy output_dir is necessarily a non-empty one.
    if job.output_dir:
        bundle_file = os.path.join(job.output_dir, 'result-bundle')
        if os.path.exists(bundle_file):
            with open(bundle_file) as f:
                results_link = f.read().strip()
            job._results_link = results_link
            # The last path component of the link is used as the
            # bundle's content_sha1.
            sha1 = results_link.strip('/').split('/')[-1]
            try:
                bundle = Bundle.objects.get(content_sha1=sha1)
            except Bundle.DoesNotExist:
                pass
            else:
                job._results_bundle = bundle
                device.device_version = _get_device_version(
                    job.results_bundle)
    else:
        self.logger.warning("[%d] lacked a usable output_dir", job.id)

    self.logger.debug('new device status %s, job state %s' % (
        Device.STATUS_CHOICES[new_device_status][1],
        TestJob.STATUS_CHOICES[job.status][1]))

    job.end_time = timezone.now()
    job.submit_token = None
    device.current_job = None

    msg = "Job %s completed" % job.display_id
    device.state_transition_to(new_device_status, message=msg, job=job)
    self._commit_transaction(src='%s state' % device.hostname)

    device.save()
    job.save()
    self._commit_transaction(src='jobCompleted_impl')
    self.logger.info('job %s completed on %s', job.id, device.hostname)

    if utils.is_master():
        try:
            job.send_summary_mails()
        except Exception:
            # Better to catch all ordinary exceptions here and log them
            # than have this method fail; SystemExit and
            # KeyboardInterrupt are deliberately not swallowed.
            self.logger.exception(
                'sending job summary mails for job %r failed', job.pk)
    else:
        worker = WorkerData()
        worker.notify_on_incomplete(job.id)
def jobCompleted_impl(self, board_name, exit_code, kill_reason):
    """Finish the current job of ``board_name`` and release the device.

    Derives the device's next status from its current one (falling back
    to ``device.previous_state()`` or IDLE), maps the job status (RUNNING
    becomes COMPLETE or INCOMPLETE depending on ``exit_code``, CANCELING
    becomes CANCELED, anything else is logged and forced to COMPLETE),
    records health-check results, attaches the bundle referenced by
    ``<output_dir>/result-bundle`` when that file exists, saves and
    commits, then sends the summary mails (when ``utils.is_master()`` is
    true) or notifies through ``WorkerData``.  The job's submit token is
    deleted last.

    Returns early, without modifying anything, when the device has no
    current job.

    :param board_name: hostname of the Device whose job finished.
    :param exit_code: 0 marks a RUNNING job COMPLETE, anything else
        INCOMPLETE.
    :param kill_reason: not referenced by this method.
    """
    MAX_RETRIES = 3

    self.logger.debug("marking job as complete on %s", board_name)
    device = Device.objects.get(hostname=board_name)
    old_device_status = device.status
    previous_state = device.previous_state()

    if old_device_status == Device.RUNNING:
        new_device_status = previous_state
    elif old_device_status == Device.OFFLINING:
        new_device_status = Device.OFFLINE
    elif old_device_status == Device.RESERVED:
        new_device_status = previous_state
    else:
        self.logger.error(
            "Unexpected device state in jobCompleted: %s" % device.status)
        new_device_status = Device.IDLE
    # previous_state may be None; fall back to IDLE in that case.
    if new_device_status is None:
        new_device_status = Device.IDLE

    job = device.current_job
    if job is None:
        # Nothing to complete; avoid an AttributeError further down.
        self.logger.error(
            "job completion called without a current job on %s",
            board_name)
        return

    # Temporary devices should be marked as RETIRED once the job is
    # complete or canceled.
    if job.is_vmgroup:
        try:
            if device.temporarydevice:
                new_device_status = Device.RETIRED
        except TemporaryDevice.DoesNotExist:
            self.logger.debug("%s is not a tmp device" % device.hostname)

    device.device_version = _get_device_version(job.results_bundle)
    device.current_job = None

    if job.status == TestJob.RUNNING:
        if exit_code == 0:
            job.status = TestJob.COMPLETE
        else:
            job.status = TestJob.INCOMPLETE
    elif job.status == TestJob.CANCELING:
        job.status = TestJob.CANCELED
    else:
        self.logger.error(
            "Unexpected job state in jobCompleted: %s" % job.status)
        job.status = TestJob.COMPLETE

    msg = "Job %s completed" % job.display_id
    device.state_transition_to(new_device_status, message=msg, job=job)

    if job.health_check:
        device.last_health_report_job = job
        if device.health_status != Device.HEALTH_LOOPING:
            if job.status == TestJob.INCOMPLETE:
                device.health_status = Device.HEALTH_FAIL
                device.put_into_maintenance_mode(
                    None, "Health Check Job Failed")
            elif job.status == TestJob.COMPLETE:
                device.health_status = Device.HEALTH_PASS

    bundle_file = os.path.join(job.output_dir, "result-bundle")
    if os.path.exists(bundle_file):
        with open(bundle_file) as f:
            results_link = f.read().strip()
        job._results_link = results_link
        # The last path component of the link is used as the bundle's
        # content_sha1.
        sha1 = results_link.strip("/").split("/")[-1]
        try:
            bundle = Bundle.objects.get(content_sha1=sha1)
        except Bundle.DoesNotExist:
            pass
        else:
            job._results_bundle = bundle

    job.end_time = datetime.datetime.utcnow()
    # Keep a reference so the token can be deleted after the notification.
    token = job.submit_token
    job.submit_token = None
    device.save()
    job.save()

    # notification needs to have the correct status in the database
    for _ in range(MAX_RETRIES):
        try:
            transaction.commit()
        except TransactionRollbackError as err:
            self.logger.warning(
                "Retrying %s job completion ... %s" % (job.id, err))
        else:
            self.logger.debug(
                "%s job completed and status saved" % job.id)
            break
    else:
        # Every attempt was rolled back; make that visible in the log.
        self.logger.error(
            "%s job completion could not be committed after %d attempts",
            job.id, MAX_RETRIES)

    if utils.is_master():
        try:
            job.send_summary_mails()
        except Exception:
            # Better to catch all ordinary exceptions here and log them
            # than have this method fail; SystemExit and
            # KeyboardInterrupt are deliberately not swallowed.
            self.logger.exception(
                "sending job summary mails for job %r failed", job.pk)
    else:
        worker = WorkerData()
        worker.notify_on_incomplete(job.id)

    # need the token for the XMLRPC, so it is only removed now; a job
    # without a submit token has nothing to delete.
    if token is not None:
        token.delete()