Example #1
    def _sync_with_database_thread(self):
        '''Runs in a background thread and polls the database for updates to the job executions that are currently
        running in the scheduler. Job executions that have been canceled are killed, and job executions that have
        timed out are killed and failed.
        '''
        throttle = 10  # minimum number of seconds for each sync pass

        logger.info('Scheduler database sync background thread started')

        while self.sync_database_running:
            started = now()

            job_exes = self._get_job_exes()
            job_exe_ids = [job_exe.job_exe_id for job_exe in job_exes]

            try:
                right_now = now()
                job_exes_by_id = {job_exe.job_exe_id: job_exe for job_exe in job_exes}
                for job_exe_model in JobExecution.objects.filter(id__in=job_exe_ids):
                    this_job_exe = job_exes_by_id[job_exe_model.id]
                    kill_task = False
                    delete_job_exe = False
                    if job_exe_model.status == 'CANCELED':
                        # A canceled execution: kill its current task and stop tracking it
                        kill_task = True
                        delete_job_exe = True
                    elif job_exe_model.is_timed_out(right_now):
                        # A timed out execution: kill its current task, fail it, and stop tracking it
                        kill_task = True
                        delete_job_exe = True
                        error = get_timeout_error()
                        Queue.objects.handle_job_failure(job_exe_model.id, right_now, error)
                    if kill_task:
                        task_to_kill_id = this_job_exe.current_task()
                        pb_task_to_kill = mesos_pb2.TaskID()
                        pb_task_to_kill.value = task_to_kill_id
                        logger.info('About to kill task: %s', task_to_kill_id)
                        self.driver.killTask(pb_task_to_kill)
                    if delete_job_exe:
                        self._delete_job_exe(this_job_exe)
            except Exception:
                logger.exception('Error syncing scheduler with database')

            ended = now()
            secs_passed = (ended - started).total_seconds()
            if secs_passed < throttle:
                # Delay until full throttle time reached
                delay = math.ceil(throttle - secs_passed)
                time.sleep(delay)

        logger.info('Scheduler database sync background thread stopped')
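
How this loop is started and stopped is not shown above. The sketch below is a minimal illustration, assuming the scheduler object exposes the sync_database_running flag and the _sync_with_database_thread method exactly as in Example #1; the threading.Thread wiring and the helper function names are assumptions, not the project's actual startup code.

import threading

def start_database_sync(scheduler):
    # Hypothetical helper: launch the sync loop from Example #1 in a daemon thread
    scheduler.sync_database_running = True
    thread = threading.Thread(target=scheduler._sync_with_database_thread,
                              name='scheduler-db-sync')
    thread.daemon = True  # do not block interpreter shutdown
    thread.start()
    return thread

def stop_database_sync(scheduler, thread):
    # Hypothetical helper: the loop re-checks the flag once per pass, so shutdown
    # can take up to one full throttle period (10 seconds) plus the current sync pass
    scheduler.sync_database_running = False
    thread.join()
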
Example #2
    def task_failed(self, task_id, status):
        '''Indicates that a Mesos task for this job execution has failed

        :param task_id: The ID of the task that failed
        :type task_id: str
        :param status: The task status
        :type status: :class:`mesos_pb2.TaskStatus`
        '''

        if self.current_task_id != task_id:
            return

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self.job_exe_id)

        stdout = None
        stderr = None
        node = None
        if status.state != mesos_pb2.TASK_LOST:
            try:
                node = self._cached_node
                task_dir = get_slave_task_directory(node.hostname, node.port, self.current_task_id)
                stdout = get_slave_task_file(node.hostname, node.port, task_dir, 'stdout')
                stderr = get_slave_task_file(node.hostname, node.port, task_dir, 'stderr')
            except Exception:
                logger.exception('Error getting stdout/stderr for %s', self.current_task_id)

        self.failed = True
        error = None
        if status.state == mesos_pb2.TASK_LOST:
            error = get_mesos_error()
        elif status.state == mesos_pb2.TASK_KILLED and self.timed_out:
            error = get_timeout_error()
        when_failed = EPOCH + timedelta(seconds=status.timestamp)

        exit_code = self._parse_exit_code(status)
        if self._is_current_task_pre():
            # Check scale_pre_steps command to see if exit code maps to a specific error
            if exit_code in PRE_EXIT_CODE_DICT:
                error = PRE_EXIT_CODE_DICT[exit_code]()
            JobExecution.objects.pre_steps_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)
        elif self._is_current_task_job():
            # Do error mapping here to determine error
            error = job_exe.get_error_interface().get_error(exit_code)
            JobExecution.objects.job_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)
        elif self._is_current_task_post():
            # Check scale_post_steps command to see if exit code maps to a specific error
            if exit_code in POST_EXIT_CODE_DICT:
                error = POST_EXIT_CODE_DICT[exit_code]()
            JobExecution.objects.post_steps_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)

        if not error:
            error = Error.objects.get_unknown_error()
        Queue.objects.handle_job_failure(self.job_exe_id, when_failed, error)

        # Check for a high number of system errors and decide if we should pause the node
        if (error.category == 'SYSTEM' and job_exe.job.num_exes >= job_exe.job.max_tries and
                node is not None and not node.is_paused):
            # If too many system errors have occurred on this node within the configurable
            # node_error_period (in minutes), pause the node
            node_error_period = Scheduler.objects.first().node_error_period
            if node_error_period > 0:
                check_time = datetime.utcnow() - timedelta(minutes=node_error_period)
                # find out how many jobs have recently failed on this node with a system error
                num_node_errors = JobExecution.objects.select_related('error', 'node').filter(
                    status='FAILED', error__category='SYSTEM', ended__gte=check_time, node=node).distinct('job').count()
                max_node_errors = Scheduler.objects.first().max_node_errors
                if num_node_errors >= max_node_errors:
                    logger.warning('%s failed %d jobs in %d minutes, pausing the host',
                                   node.hostname, num_node_errors, node_error_period)
                    with transaction.atomic():
                        node.is_paused = True
                        node.is_paused_errors = True
                        node.pause_reason = "System Failure Rate Too High"
                        node.save()

        # Remove all remaining tasks
        self.remaining_task_ids = []

        self.current_task_id = None
        self.current_task_stdout_url = None
        self.current_task_stderr_url = None
        JobExecution.objects.set_log_urls(self.job_exe_id, None, None)
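
task_failed() only acts on the task the execution is currently tracking; the dispatch from Mesos is not shown in this example. Below is a minimal sketch, assuming the standard Mesos Python scheduler callback statusUpdate(driver, status); the ExampleScheduler class, the _job_exes_by_task_id lookup, and the exact import path for the bindings are illustrative assumptions, not the project's actual scheduler.

from mesos.interface import Scheduler, mesos_pb2  # import path varies with the Mesos bindings version

class ExampleScheduler(Scheduler):
    '''Illustrative scheduler showing one way to route terminal task statuses to task_failed()'''

    def __init__(self):
        # Hypothetical lookup: task ID -> running job execution object (as in Example #2)
        self._job_exes_by_task_id = {}

    def statusUpdate(self, driver, status):
        task_id = status.task_id.value
        job_exe = self._job_exes_by_task_id.get(task_id)
        if job_exe is None:
            return
        if status.state in (mesos_pb2.TASK_FAILED, mesos_pb2.TASK_LOST, mesos_pb2.TASK_KILLED):
            # Delegate to the method shown above; it ignores statuses for tasks it is not tracking
            job_exe.task_failed(task_id, status)
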