예제 #1
0
    def getJobList_impl(self):
        """
        Return the list of non-pipeline jobs that are ready to be started.

        This method is called in a loop by the scheduler daemon service.
        Note: handles both old and pipeline jobs but only so far as putting
        devices into a Reserved state. Running pipeline jobs from Reserved
        is the sole concern of the dispatcher-master.

        Returns a list of the SUBMITTED, non-pipeline TestJob objects
        assigned to this worker's devices whose ``is_ready_to_start`` is
        true.
        """
        self._handle_cancelling_jobs()

        # FIXME: to move into the dispatcher-master
        if utils.is_master():
            submit_health_check_jobs()
            assign_jobs()

        # from here on, ignore pipeline jobs.
        my_devices = get_temporary_devices(self.my_devices())
        my_submitted_jobs = TestJob.objects.filter(
            status=TestJob.SUBMITTED,
            actual_device_id__in=my_devices,
            is_pipeline=False
        )

        # Build a real list before committing: filter() returns a lazy,
        # single-use iterator on Python 3, which would only be consumed
        # after the commit below and would not be the documented list.
        my_ready_jobs = [
            job for job in my_submitted_jobs if job.is_ready_to_start
        ]

        # Only commit when not already inside an atomic block.
        if not connection.in_atomic_block:
            self._commit_transaction(src='getJobList_impl')
        return my_ready_jobs
예제 #2
0
    def getJobList_impl(self):
        """
        Return the list of non-pipeline jobs that are ready to be started.

        This method is called in a loop by the scheduler daemon service.
        Note: handles both old and pipeline jobs but only so far as putting
        devices into a Reserved state. Running pipeline jobs from Reserved
        is the sole concern of the dispatcher-master.

        Returns a list of the SUBMITTED, non-pipeline TestJob objects
        assigned to this worker's devices whose ``is_ready_to_start`` is
        true.
        """
        self._handle_cancelling_jobs()

        if utils.is_master():
            # FIXME: move into dispatcher-master
            self._submit_health_check_jobs()
            self._assign_jobs()

        # from here on, ignore pipeline jobs.
        my_devices = get_temporary_devices(self.my_devices())
        my_submitted_jobs = TestJob.objects.filter(
            status=TestJob.SUBMITTED,
            actual_device_id__in=my_devices,
            is_pipeline=False)

        # Build a real list before committing: filter() returns a lazy,
        # single-use iterator on Python 3, which would only be consumed
        # after the commit below and would not be the documented list.
        my_ready_jobs = [
            job for job in my_submitted_jobs if job.is_ready_to_start
        ]

        self._commit_transaction(src='getJobList_impl')
        return my_ready_jobs
예제 #3
0
    def getJobList_impl(self):
        """
        Return the list of jobs that are ready to be started.

        This method is called in a loop by the scheduler daemon service.

        Returns a list of the SUBMITTED TestJob objects assigned to this
        worker's devices whose ``is_ready_to_start`` is true.
        """
        self._handle_cancelling_jobs()

        # Health checks and job assignment are only driven from the master.
        if utils.is_master():
            self._submit_health_check_jobs()
            self._assign_jobs()

        my_devices = get_temporary_devices(self.my_devices())
        my_submitted_jobs = TestJob.objects.filter(
            status=TestJob.SUBMITTED, actual_device_id__in=my_devices)

        # Build a real list before committing: filter() returns a lazy,
        # single-use iterator on Python 3, which would only be consumed
        # after the commit below and would not be the documented list.
        my_ready_jobs = [
            job for job in my_submitted_jobs if job.is_ready_to_start
        ]

        transaction.commit()
        return my_ready_jobs
예제 #4
0
    def getJobList_impl(self):
        """
        Return the list of jobs that are ready to be started.

        This method is called in a loop by the scheduler daemon service.

        Returns a list of the SUBMITTED TestJob objects assigned to this
        worker's devices whose ``is_ready_to_start`` is true.
        """
        self._handle_cancelling_jobs()

        # Health checks and job assignment are only driven from the master.
        if utils.is_master():
            self._submit_health_check_jobs()
            self._assign_jobs()

        my_devices = get_temporary_devices(self.my_devices())
        my_submitted_jobs = TestJob.objects.filter(
            status=TestJob.SUBMITTED,
            actual_device_id__in=my_devices,
        )

        # Build a real list before committing: filter() returns a lazy,
        # single-use iterator on Python 3, which would only be consumed
        # after the commit below and would not be the documented list.
        my_ready_jobs = [
            job for job in my_submitted_jobs if job.is_ready_to_start
        ]

        transaction.commit()
        return my_ready_jobs
예제 #5
0
    def jobCompleted_impl(self, job_id, board_name, exit_code, kill_reason):
        """
        Record the completion of a job and release the device it ran on.

        :param job_id: id of the TestJob that finished; when falsy the call
            is logged and ignored.
        :param board_name: hostname of the Device the job ran on.
        :param exit_code: 0 turns a RUNNING job into COMPLETE, any other
            value into INCOMPLETE.
        :param kill_reason: not used by this method.

        Returns None. Returns early, without saving anything, when the job
        is neither RUNNING nor CANCELING.
        """
        if not job_id:
            self.logger.debug('job completion called without a job id on %s',
                              board_name)
            return
        job = TestJob.objects.get(id=job_id)

        self.logger.debug('marking job as complete on %s', board_name)
        device = Device.objects.get(hostname=board_name)
        old_device_status = device.status
        self.logger.debug('old device status %s, job state %s',
                          Device.STATUS_CHOICES[old_device_status][1],
                          TestJob.STATUS_CHOICES[job.status][1])

        # Every branch assigns a concrete status, so no None fallback is
        # needed afterwards.
        if old_device_status == Device.RUNNING:
            new_device_status = Device.IDLE
        elif old_device_status == Device.OFFLINING:
            new_device_status = Device.OFFLINE
        elif old_device_status == Device.RESERVED:
            new_device_status = Device.IDLE
        else:
            self.logger.error("Unexpected device state in jobCompleted: %s",
                              device.status)
            new_device_status = Device.IDLE

        self.logger.debug('new device status %s, job state %s',
                          Device.STATUS_CHOICES[new_device_status][1],
                          TestJob.STATUS_CHOICES[job.status][1])

        # Temporary devices should be marked as RETIRED once the job is
        # complete or canceled.
        if job.is_vmgroup:
            try:
                if device.temporarydevice:
                    new_device_status = Device.RETIRED
                    device.current_job = None
            except TemporaryDevice.DoesNotExist:
                self.logger.debug("%s is not a tmp device", device.hostname)

        if job.status == TestJob.RUNNING:
            if exit_code == 0:
                job.status = TestJob.COMPLETE
            else:
                job.status = TestJob.INCOMPLETE
        elif job.status == TestJob.CANCELING:
            job.status = TestJob.CANCELED
        else:
            self.logger.error(
                "Unexpected job state in jobCompleted: %s, probably we are trying job completion for a different job",
                job.status)
            return

        self.logger.debug('changed job status to %s',
                          TestJob.STATUS_CHOICES[job.status][1])

        if job.health_check:
            device.last_health_report_job = job
            self.logger.debug("old device health status %s",
                              Device.HEALTH_CHOICES[device.health_status][1])
            if device.health_status != Device.HEALTH_LOOPING:
                if job.status == TestJob.INCOMPLETE:
                    device.health_status = Device.HEALTH_FAIL
                    self.logger.debug(
                        "taking %s offline, failed health check job %s",
                        device.hostname, job_id)
                    device.put_into_maintenance_mode(
                        None, "Health Check Job Failed")
                    # update the local variable to track the effect of the external function call
                    new_device_status = device.status
                    if new_device_status == Device.OFFLINING:
                        new_device_status = Device.OFFLINE  # offlining job is complete.
                elif job.status == TestJob.COMPLETE:
                    device.health_status = Device.HEALTH_PASS
                    # NOTE(review): this also overrides a RETIRED status
                    # chosen above for temporary devices - confirm intended.
                    if old_device_status == Device.RUNNING:
                        new_device_status = Device.IDLE
                device.save()
            self.logger.debug("new device health status %s",
                              Device.HEALTH_CHOICES[device.health_status][1])

        # A None or empty output_dir means there is no bundle to look for.
        if job.output_dir:
            bundle_file = os.path.join(job.output_dir, 'result-bundle')
            if os.path.exists(bundle_file):
                with open(bundle_file) as f:
                    results_link = f.read().strip()
                job._results_link = results_link
                # the last path component of the link is the bundle's sha1
                sha1 = results_link.strip('/').split('/')[-1]
                try:
                    bundle = Bundle.objects.get(content_sha1=sha1)
                except Bundle.DoesNotExist:
                    pass
                else:
                    job._results_bundle = bundle
                    device.device_version = _get_device_version(
                        job.results_bundle)
        else:
            self.logger.warning("[%d] lacked a usable output_dir", job.id)

        self.logger.debug('new device status %s, job state %s',
                          Device.STATUS_CHOICES[new_device_status][1],
                          TestJob.STATUS_CHOICES[job.status][1])

        job.end_time = timezone.now()

        job.submit_token = None

        device.current_job = None

        msg = "Job %s completed" % job.display_id
        device.state_transition_to(new_device_status, message=msg, job=job)
        self._commit_transaction(src='%s state' % device.hostname)

        device.save()
        job.save()
        self._commit_transaction(src='jobCompleted_impl')
        self.logger.info('job %s completed on %s', job.id, device.hostname)

        if utils.is_master():
            try:
                job.send_summary_mails()
            except Exception:
                # Better to catch all ordinary exceptions here and log them
                # than have this method fail; SystemExit and
                # KeyboardInterrupt are deliberately left to propagate.
                self.logger.exception(
                    'sending job summary mails for job %r failed', job.pk)
        else:
            worker = WorkerData()
            worker.notify_on_incomplete(job.id)
예제 #6
0
    def jobCompleted_impl(self, board_name, exit_code, kill_reason):
        """
        Record completion of the job currently assigned to a device.

        :param board_name: hostname of the Device whose ``current_job``
            has finished.
        :param exit_code: 0 turns a RUNNING job into COMPLETE, any other
            value into INCOMPLETE.
        :param kill_reason: not used by this method.

        Returns None.
        """
        self.logger.debug('marking job as complete on %s', board_name)
        device = Device.objects.get(hostname=board_name)
        old_device_status = device.status
        new_device_status = None
        previous_state = device.previous_state()
        MAX_RETRIES = 3

        if old_device_status == Device.RUNNING:
            new_device_status = previous_state
        elif old_device_status == Device.OFFLINING:
            new_device_status = Device.OFFLINE
        elif old_device_status == Device.RESERVED:
            new_device_status = previous_state
        else:
            self.logger.error(
                "Unexpected device state in jobCompleted: %s", device.status)
            new_device_status = Device.IDLE
        if new_device_status is None:
            # previous_state() supplied nothing usable; fall back to IDLE.
            new_device_status = Device.IDLE
        job = device.current_job

        # Temporary devices should be marked as RETIRED once the job is
        # complete or canceled.
        if job.is_vmgroup:
            try:
                if device.temporarydevice:
                    new_device_status = Device.RETIRED
            except TemporaryDevice.DoesNotExist:
                self.logger.debug("%s is not a tmp device", device.hostname)

        # NOTE(review): this reads job.results_bundle before the bundle is
        # looked up further down - confirm the ordering is intended.
        device.device_version = _get_device_version(job.results_bundle)
        device.current_job = None
        if job.status == TestJob.RUNNING:
            if exit_code == 0:
                job.status = TestJob.COMPLETE
            else:
                job.status = TestJob.INCOMPLETE
        elif job.status == TestJob.CANCELING:
            job.status = TestJob.CANCELED
        else:
            self.logger.error(
                "Unexpected job state in jobCompleted: %s", job.status)
            job.status = TestJob.COMPLETE

        msg = "Job %s completed" % job.display_id
        device.state_transition_to(new_device_status, message=msg, job=job)

        if job.health_check:
            device.last_health_report_job = job
            if device.health_status != Device.HEALTH_LOOPING:
                if job.status == TestJob.INCOMPLETE:
                    device.health_status = Device.HEALTH_FAIL
                    device.put_into_maintenance_mode(None, "Health Check Job Failed")
                elif job.status == TestJob.COMPLETE:
                    device.health_status = Device.HEALTH_PASS

        # Skip the bundle lookup when output_dir is None or empty:
        # os.path.join(None, ...) raises, and '' would probe the current
        # working directory instead of the job's output directory.
        if job.output_dir:
            bundle_file = os.path.join(job.output_dir, 'result-bundle')
            if os.path.exists(bundle_file):
                with open(bundle_file) as f:
                    results_link = f.read().strip()
                job._results_link = results_link
                # the last path component of the link is the bundle's sha1
                sha1 = results_link.strip('/').split('/')[-1]
                try:
                    bundle = Bundle.objects.get(content_sha1=sha1)
                except Bundle.DoesNotExist:
                    pass
                else:
                    job._results_bundle = bundle
        else:
            self.logger.warning("[%s] lacked a usable output_dir", job.id)

        job.end_time = datetime.datetime.utcnow()
        # Keep a reference to the token: it is detached from the job here
        # and only deleted after the notifications below.
        token = job.submit_token
        job.submit_token = None
        device.save()
        job.save()
        # notification needs to have the correct status in the database
        for _ in range(MAX_RETRIES):
            try:
                transaction.commit()
            except TransactionRollbackError as err:
                self.logger.warning(
                    'Retrying %s job completion ... %s', job.id, err)
            else:
                self.logger.debug('%s job completed and status saved', job.id)
                break
        else:
            # Every attempt was rolled back; record it instead of carrying
            # on silently.
            self.logger.error(
                '%s job completion could not be committed after %d attempts',
                job.id, MAX_RETRIES)
        if utils.is_master():
            try:
                job.send_summary_mails()
            except Exception:
                # Better to catch all ordinary exceptions here and log them
                # than have this method fail; SystemExit and
                # KeyboardInterrupt are deliberately left to propagate.
                self.logger.exception(
                    'sending job summary mails for job %r failed', job.pk)
        else:
            worker = WorkerData()
            worker.notify_on_incomplete(job.id)
        # need the token for the XMLRPC
        # A job without a submit token has nothing to delete.
        if token is not None:
            token.delete()
예제 #7
0
    def jobCompleted_impl(self, job_id, board_name, exit_code, kill_reason):
        """
        Record the completion of a job and release the device it ran on.

        :param job_id: id of the TestJob that finished; when falsy the call
            is logged and ignored.
        :param board_name: hostname of the Device the job ran on.
        :param exit_code: 0 turns a RUNNING job into COMPLETE, any other
            value into INCOMPLETE.
        :param kill_reason: not used by this method.

        Returns None. Returns early, without saving anything, when the job
        is neither RUNNING nor CANCELING.
        """
        if not job_id:
            self.logger.debug('job completion called without a job id on %s',
                              board_name)
            return
        job = TestJob.objects.get(id=job_id)

        self.logger.debug('marking job as complete on %s', board_name)
        device = Device.objects.get(hostname=board_name)
        old_device_status = device.status
        self.logger.debug(
            'old device status %s, job state %s',
            Device.STATUS_CHOICES[old_device_status][1],
            TestJob.STATUS_CHOICES[job.status][1])

        # Every branch assigns a concrete status, so no None fallback is
        # needed afterwards.
        if old_device_status == Device.RUNNING:
            new_device_status = Device.IDLE
        elif old_device_status == Device.OFFLINING:
            new_device_status = Device.OFFLINE
        elif old_device_status == Device.RESERVED:
            new_device_status = Device.IDLE
        else:
            self.logger.error(
                "Unexpected device state in jobCompleted: %s", device.status)
            new_device_status = Device.IDLE

        self.logger.debug(
            'new device status %s, job state %s',
            Device.STATUS_CHOICES[new_device_status][1],
            TestJob.STATUS_CHOICES[job.status][1])

        # Temporary devices should be marked as RETIRED once the job is
        # complete or canceled.
        if job.is_vmgroup:
            try:
                if device.temporarydevice:
                    new_device_status = Device.RETIRED
                    device.current_job = None
            except TemporaryDevice.DoesNotExist:
                self.logger.debug("%s is not a tmp device", device.hostname)

        if job.status == TestJob.RUNNING:
            if exit_code == 0:
                job.status = TestJob.COMPLETE
            else:
                job.status = TestJob.INCOMPLETE
        elif job.status == TestJob.CANCELING:
            job.status = TestJob.CANCELED
        else:
            self.logger.error("Unexpected job state in jobCompleted: %s, probably we are trying job completion for a different job", job.status)
            return

        self.logger.debug(
            'changed job status to %s', TestJob.STATUS_CHOICES[job.status][1])

        if job.health_check:
            device.last_health_report_job = job
            self.logger.debug(
                "old device health status %s",
                Device.HEALTH_CHOICES[device.health_status][1])
            if device.health_status != Device.HEALTH_LOOPING:
                if job.status == TestJob.INCOMPLETE:
                    device.health_status = Device.HEALTH_FAIL
                    self.logger.debug(
                        "taking %s offline, failed health check job %s",
                        device.hostname, job_id)
                    device.put_into_maintenance_mode(None, "Health Check Job Failed")
                    # update the local variable to track the effect of the external function call
                    new_device_status = device.status
                    if new_device_status == Device.OFFLINING:
                        new_device_status = Device.OFFLINE  # offlining job is complete.
                elif job.status == TestJob.COMPLETE:
                    device.health_status = Device.HEALTH_PASS
                    # NOTE(review): this also overrides a RETIRED status
                    # chosen above for temporary devices - confirm intended.
                    if old_device_status == Device.RUNNING:
                        new_device_status = Device.IDLE
                device.save()
            self.logger.debug(
                "new device health status %s",
                Device.HEALTH_CHOICES[device.health_status][1])

        # A None or empty output_dir means there is no bundle to look for.
        if job.output_dir:
            bundle_file = os.path.join(job.output_dir, 'result-bundle')
            if os.path.exists(bundle_file):
                with open(bundle_file) as f:
                    results_link = f.read().strip()
                job._results_link = results_link
                # the last path component of the link is the bundle's sha1
                sha1 = results_link.strip('/').split('/')[-1]
                try:
                    bundle = Bundle.objects.get(content_sha1=sha1)
                except Bundle.DoesNotExist:
                    pass
                else:
                    job._results_bundle = bundle
                    device.device_version = _get_device_version(job.results_bundle)
        else:
            self.logger.warning("[%d] lacked a usable output_dir", job.id)

        self.logger.debug(
            'new device status %s, job state %s',
            Device.STATUS_CHOICES[new_device_status][1],
            TestJob.STATUS_CHOICES[job.status][1])

        job.end_time = timezone.now()

        job.submit_token = None

        device.current_job = None

        msg = "Job %s completed" % job.display_id
        device.state_transition_to(new_device_status, message=msg, job=job)
        self._commit_transaction(src='%s state' % device.hostname)

        device.save()
        job.save()
        self._commit_transaction(src='jobCompleted_impl')
        self.logger.info('job %s completed on %s', job.id, device.hostname)

        if utils.is_master():
            try:
                job.send_summary_mails()
            except Exception:
                # Better to catch all ordinary exceptions here and log them
                # than have this method fail; SystemExit and
                # KeyboardInterrupt are deliberately left to propagate.
                self.logger.exception(
                    'sending job summary mails for job %r failed', job.pk)
        else:
            worker = WorkerData()
            worker.notify_on_incomplete(job.id)
예제 #8
0
    def jobCompleted_impl(self, board_name, exit_code, kill_reason):
        """
        Record completion of the job currently assigned to a device.

        :param board_name: hostname of the Device whose ``current_job``
            has finished.
        :param exit_code: 0 turns a RUNNING job into COMPLETE, any other
            value into INCOMPLETE.
        :param kill_reason: not used by this method.

        Returns None.
        """
        self.logger.debug("marking job as complete on %s", board_name)
        device = Device.objects.get(hostname=board_name)
        old_device_status = device.status
        new_device_status = None
        previous_state = device.previous_state()
        MAX_RETRIES = 3

        if old_device_status == Device.RUNNING:
            new_device_status = previous_state
        elif old_device_status == Device.OFFLINING:
            new_device_status = Device.OFFLINE
        elif old_device_status == Device.RESERVED:
            new_device_status = previous_state
        else:
            self.logger.error("Unexpected device state in jobCompleted: %s", device.status)
            new_device_status = Device.IDLE
        if new_device_status is None:
            # previous_state() supplied nothing usable; fall back to IDLE.
            new_device_status = Device.IDLE
        job = device.current_job

        # Temporary devices should be marked as RETIRED once the job is
        # complete or canceled.
        if job.is_vmgroup:
            try:
                if device.temporarydevice:
                    new_device_status = Device.RETIRED
            except TemporaryDevice.DoesNotExist:
                self.logger.debug("%s is not a tmp device", device.hostname)

        # NOTE(review): this reads job.results_bundle before the bundle is
        # looked up further down - confirm the ordering is intended.
        device.device_version = _get_device_version(job.results_bundle)
        device.current_job = None
        if job.status == TestJob.RUNNING:
            if exit_code == 0:
                job.status = TestJob.COMPLETE
            else:
                job.status = TestJob.INCOMPLETE
        elif job.status == TestJob.CANCELING:
            job.status = TestJob.CANCELED
        else:
            self.logger.error("Unexpected job state in jobCompleted: %s", job.status)
            job.status = TestJob.COMPLETE

        msg = "Job %s completed" % job.display_id
        device.state_transition_to(new_device_status, message=msg, job=job)

        if job.health_check:
            device.last_health_report_job = job
            if device.health_status != Device.HEALTH_LOOPING:
                if job.status == TestJob.INCOMPLETE:
                    device.health_status = Device.HEALTH_FAIL
                    device.put_into_maintenance_mode(None, "Health Check Job Failed")
                elif job.status == TestJob.COMPLETE:
                    device.health_status = Device.HEALTH_PASS

        # Skip the bundle lookup when output_dir is None or empty:
        # os.path.join(None, ...) raises, and "" would probe the current
        # working directory instead of the job's output directory.
        if job.output_dir:
            bundle_file = os.path.join(job.output_dir, "result-bundle")
            if os.path.exists(bundle_file):
                with open(bundle_file) as f:
                    results_link = f.read().strip()
                job._results_link = results_link
                # the last path component of the link is the bundle's sha1
                sha1 = results_link.strip("/").split("/")[-1]
                try:
                    bundle = Bundle.objects.get(content_sha1=sha1)
                except Bundle.DoesNotExist:
                    pass
                else:
                    job._results_bundle = bundle
        else:
            self.logger.warning("[%s] lacked a usable output_dir", job.id)

        job.end_time = datetime.datetime.utcnow()
        # Keep a reference to the token: it is detached from the job here
        # and only deleted after the notifications below.
        token = job.submit_token
        job.submit_token = None
        device.save()
        job.save()
        # notification needs to have the correct status in the database
        for _ in range(MAX_RETRIES):
            try:
                transaction.commit()
            except TransactionRollbackError as err:
                self.logger.warning("Retrying %s job completion ... %s", job.id, err)
            else:
                self.logger.debug("%s job completed and status saved", job.id)
                break
        else:
            # Every attempt was rolled back; record it instead of carrying
            # on silently.
            self.logger.error(
                "%s job completion could not be committed after %d attempts",
                job.id, MAX_RETRIES)
        if utils.is_master():
            try:
                job.send_summary_mails()
            except Exception:
                # Better to catch all ordinary exceptions here and log them
                # than have this method fail; SystemExit and
                # KeyboardInterrupt are deliberately left to propagate.
                self.logger.exception("sending job summary mails for job %r failed", job.pk)
        else:
            worker = WorkerData()
            worker.notify_on_incomplete(job.id)
        # need the token for the XMLRPC
        # A job without a submit token has nothing to delete.
        if token is not None:
            token.delete()