def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    task_update.job_exe_id = RunningJobExecution.get_job_exe_id(task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)
    return task_update
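# --- Usage sketch (added for illustration; not in the original module) ---
# A minimal example of how create_task_update_model() might be driven from a
# scheduler callback, assuming TaskUpdate is a Django model (so save() is the
# standard Django persistence call). The helper name below is hypothetical.
def example_record_status_update(status):
    """Builds a TaskUpdate from a Mesos task status and persists it (sketch)."""
    task_update = create_task_update_model(status)
    task_update.save()  # assumption: TaskUpdate is a Django model
    return task_update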
def statusUpdate(self, driver, status):
    """Invoked when the status of a task has changed (e.g., a slave is lost
    and so the task is lost, a task finishes and an executor sends a status
    update saying so, etc.) Note that returning from this callback
    acknowledges receipt of this status update. If for whatever reason the
    scheduler aborts during this callback (or the process exits), another
    status update will be delivered. Note, however, that this is currently
    not true if the slave sending the status update is lost or fails during
    that time.

    See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
    """

    started = now()

    task_id = status.task_id.value
    job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
    logger.info('Status update for task %s: %s', task_id, utils.status_to_string(status.state))

    # Since we have a status update for this task, remove it from the reconciliation set
    self._recon_thread.remove_task_id(task_id)

    try:
        running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

        if running_job_exe:
            results = TaskResults(task_id)
            results.exit_code = utils.parse_exit_code(status)
            results.when = utils.get_status_timestamp(status)
            if status.state in [mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                                mesos_pb2.TASK_KILLED]:
                try:
                    log_start_time = now()
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(hostname, port, task_id)
                    results.stdout = get_slave_task_file(hostname, port, task_dir, 'stdout')
                    results.stderr = get_slave_task_file(hostname, port, task_dir, 'stderr')
                    log_end_time = now()
                    logger.debug('Time to pull logs for task: %s', str(log_end_time - log_start_time))
                except Exception:
                    logger.exception('Error pulling logs for task %s', task_id)

            # Apply status update to running job execution
            if status.state == mesos_pb2.TASK_RUNNING:
                hostname = running_job_exe._node_hostname
                port = running_job_exe._node_port
                task_dir = get_slave_task_directory(hostname, port, task_id)
                stdout_url = get_slave_task_url(hostname, port, task_dir, 'stdout')
                stderr_url = get_slave_task_url(hostname, port, task_dir, 'stderr')
                running_job_exe.task_running(task_id, results.when, stdout_url, stderr_url)
            elif status.state == mesos_pb2.TASK_FINISHED:
                running_job_exe.task_complete(results)
            elif status.state == mesos_pb2.TASK_LOST:
                running_job_exe.task_fail(results, Error.objects.get_builtin_error('mesos-lost'))
            elif status.state in [mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                running_job_exe.task_fail(results)

            # Remove finished job execution
            if running_job_exe.is_finished():
                self._job_exe_manager.remove_job_exe(job_exe_id)
        else:
            # Scheduler doesn't have any knowledge of this job execution
            Queue.objects.handle_job_failure(job_exe_id, now(), Error.objects.get_builtin_error('scheduler-lost'))
    except Exception:
        logger.exception('Error handling status update for job execution: %s', job_exe_id)
        # Error occurred handling the status update, so add the task back for reconciliation
        self._recon_thread.add_task_ids([task_id])

    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
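# --- Sketch (added for illustration; not in the original module) ---
# statusUpdate() above handles TASK_FINISHED, TASK_ERROR, TASK_FAILED,
# TASK_KILLED, and TASK_LOST as the states that end a task. A small helper
# like this could make that terminal set explicit; the names below are
# hypothetical and not part of the original code.
TERMINAL_TASK_STATES = frozenset([mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR,
                                  mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED,
                                  mesos_pb2.TASK_LOST])


def example_is_terminal(state):
    """Returns True if the given Mesos task state indicates no further updates will arrive (sketch)."""
    return state in TERMINAL_TASK_STATES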