def _sync_with_database_thread(self):
    '''This method is a background thread that polls the database to check for updates to the job
    executions that are currently running in the scheduler. This method kills off job executions
    that have been canceled. It also kills and fails job executions that have timed out.
    '''
    throttle = 10

    logger.info('Scheduler database sync background thread started')

    while self.sync_database_running:
        secs_passed = 0
        started = now()

        job_exes = self._get_job_exes()
        job_exe_ids = []
        for job_exe in job_exes:
            job_exe_ids.append(job_exe.job_exe_id)

        try:
            right_now = now()
            for job_exe_model in JobExecution.objects.filter(id__in=job_exe_ids):
                for job_exe in job_exes:
                    if job_exe.job_exe_id == job_exe_model.id:
                        this_job_exe = job_exe
                        break
                kill_task = False
                delete_job_exe = False
                if job_exe_model.status == 'CANCELED':
                    kill_task = True
                    delete_job_exe = True
                elif job_exe_model.is_timed_out(right_now):
                    kill_task = True
                    delete_job_exe = True
                    error = get_timeout_error()
                    Queue.objects.handle_job_failure(job_exe_model.id, right_now, error)
                if kill_task:
                    task_to_kill_id = this_job_exe.current_task()
                    pb_task_to_kill = mesos_pb2.TaskID()
                    pb_task_to_kill.value = task_to_kill_id
                    logger.info('About to kill task: %s', task_to_kill_id)
                    self.driver.killTask(pb_task_to_kill)
                if delete_job_exe:
                    self._delete_job_exe(this_job_exe)
        except Exception:
            logger.exception('Error syncing scheduler with database')

        ended = now()
        secs_passed = (ended - started).total_seconds()
        if secs_passed < throttle:
            # Delay until full throttle time reached
            delay = math.ceil(throttle - secs_passed)
            time.sleep(delay)

    logger.info('Scheduler database sync background thread stopped')
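
# The sketch below is not from the original source; it illustrates one way the sync loop above
# could be started and shut down from the scheduler's lifecycle hooks. Only
# 'sync_database_running' and '_sync_with_database_thread' come from the code above; the
# 'registered'/'stop' hook points and the '_sync_thread' attribute are assumptions.

import threading

def registered(self, driver, framework_id, master_info):
    # Start the background sync as a daemon thread so it cannot block process exit
    self.sync_database_running = True
    self._sync_thread = threading.Thread(target=self._sync_with_database_thread)
    self._sync_thread.daemon = True
    self._sync_thread.start()

def stop(self):
    # Clearing the flag lets the while-loop in _sync_with_database_thread finish its current
    # pass and exit; join() then waits for the thread to terminate
    self.sync_database_running = False
    self._sync_thread.join()
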
def task_failed(self, task_id, status):
    '''Indicates that a Mesos task for this job execution has failed

    :param task_id: The ID of the task that failed
    :type task_id: str
    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    '''
    if self.current_task_id != task_id:
        return

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self.job_exe_id)

    stdout = None
    stderr = None
    node = None
    if status.state != mesos_pb2.TASK_LOST:
        try:
            node = self._cached_node
            task_dir = get_slave_task_directory(node.hostname, node.port, self.current_task_id)
            stdout = get_slave_task_file(node.hostname, node.port, task_dir, 'stdout')
            stderr = get_slave_task_file(node.hostname, node.port, task_dir, 'stderr')
        except Exception:
            logger.error('Error getting stdout/stderr for %s', self.current_task_id)

    self.failed = True
    error = None
    if status.state == mesos_pb2.TASK_LOST:
        error = get_mesos_error()
    if status.state == mesos_pb2.TASK_KILLED and self.timed_out:
        error = get_timeout_error()
    when_failed = EPOCH + timedelta(seconds=status.timestamp)

    exit_code = self._parse_exit_code(status)
    if self._is_current_task_pre():
        # Check scale_pre_steps command to see if exit code maps to a specific error
        if exit_code in PRE_EXIT_CODE_DICT:
            error = PRE_EXIT_CODE_DICT[exit_code]()
        JobExecution.objects.pre_steps_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)
    elif self._is_current_task_job():
        # Do error mapping here to determine error
        error = job_exe.get_error_interface().get_error(exit_code)
        JobExecution.objects.job_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)
    elif self._is_current_task_post():
        # Check scale_post_steps command to see if exit code maps to a specific error
        if exit_code in POST_EXIT_CODE_DICT:
            error = POST_EXIT_CODE_DICT[exit_code]()
        JobExecution.objects.post_steps_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)

    if not error:
        error = Error.objects.get_unknown_error()
    Queue.objects.handle_job_failure(self.job_exe_id, when_failed, error)

    # Check for a high number of system errors and decide if we should pause the node
    if error.category == 'SYSTEM' and job_exe.job.num_exes >= job_exe.job.max_tries and node is not None and not node.is_paused:
        # Count the system failures on this node within the configurable error period; if the
        # configurable maximum or more have occurred, pause the node
        node_error_period = Scheduler.objects.first().node_error_period
        if node_error_period > 0:
            check_time = datetime.utcnow() - timedelta(minutes=node_error_period)
            # Find out how many jobs have recently failed on this node with a system error
            num_node_errors = JobExecution.objects.select_related('error', 'node').filter(
                status='FAILED', error__category='SYSTEM', ended__gte=check_time, node=node).distinct('job').count()
            max_node_errors = Scheduler.objects.first().max_node_errors
            if num_node_errors >= max_node_errors:
                logger.warning('%s failed %d jobs in %d minutes, pausing the host', node.hostname,
                               num_node_errors, node_error_period)
                with transaction.atomic():
                    node.is_paused = True
                    node.is_paused_errors = True
                    node.pause_reason = 'System Failure Rate Too High'
                    node.save()

    # Remove all remaining tasks
    self.remaining_task_ids = []

    self.current_task_id = None
    self.current_task_stdout_url = None
    self.current_task_stderr_url = None
    JobExecution.objects.set_log_urls(self.job_exe_id, None, None)
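
# A minimal sketch of the assumed shape of the exit-code lookup used above. Because the method
# calls PRE_EXIT_CODE_DICT[exit_code](), each value must be a zero-argument callable that returns
# an error model instance. The specific exit code and the '_get_database_error' helper below are
# hypothetical placeholders, not taken from the source.

from datetime import datetime, timedelta

# Assumed to match the EPOCH referenced above (Unix epoch), since status.timestamp is added to it
# as seconds
EPOCH = datetime.utcfromtimestamp(0)

def _get_database_error():
    # Hypothetical helper: look up a known error by name on the Error model used above
    return Error.objects.get(name='database')

PRE_EXIT_CODE_DICT = {
    2: _get_database_error,  # hypothetical: pre-steps could not reach the database
}

# Mesos reports status.timestamp as seconds since the Unix epoch, so the failure time computed
# above is an absolute datetime, e.g.:
#     status.timestamp == 1400000000.5
#     EPOCH + timedelta(seconds=status.timestamp) -> 2014-05-13 16:53:20.500000
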