Example #1
    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        finished_job_exe = None
        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exe = job_exe
                        # return job_exe

        # TODO: this can be removed once database operations move to messaging backend
        if finished_job_exe:
            self._handle_finished_job_exe_in_database(finished_job_exe)
            return finished_job_exe

        return None
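Example #14 below shows the caller side of this API: the scheduler's status-update callback passes the update to the manager and, when a finished execution comes back, hands it off for cleanup. A condensed view of that call site (the job_exe_mgr and cleanup_mgr names are the manager singletons used in Example #14):

    # Condensed from the pattern in Example #14
    job_exe = job_exe_mgr.handle_task_update(task_update)
    if job_exe and job_exe.is_finished():
        # A finished execution is queued so its node can be cleaned up
        cleanup_mgr.add_job_execution(job_exe)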
Example #2
    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the timeout occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task.id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    # We do not remove the failed job execution at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.
                    job_exe.execution_timed_out(task, when)
Example #3
def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
        task_update.job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)

    return task_update
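Example #14 shows where this helper sits in the update pipeline: the raw Mesos status is converted into the TaskUpdate model, the model is wrapped in a TaskStatusUpdate for the in-memory managers, and the model itself is handed off to be saved. Condensed from that example (same module and manager names):

    # Condensed from Example #14
    model = utils.create_task_update_model(status)
    task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status),
                                   utils.get_status_data(status))
    task_update_mgr.add_task_update(model)    # queued for a database save
    task_mgr.handle_task_update(task_update)  # in-memory task state updated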
Example #4
    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        return job_exe

        return None
Example #5
    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the timeout occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task.id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    try:
                        job_exe.execution_timed_out(task, when)
                    except DatabaseError:
                        logger.exception(
                            'Error failing timed out job execution %i',
                            job_exe_id)
Example #6
def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
        task_update.job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)

    return task_update
Example #7
def create_job_exe(job_type=None,
                   job=None,
                   exe_num=None,
                   node=None,
                   timeout=None,
                   input_file_size=10.0,
                   queued=None,
                   started=None,
                   status='RUNNING',
                   error=None,
                   ended=None,
                   output=None,
                   task_results=None):
    """Creates a job_exe model for unit testing, may also create job_exe_end and job_exe_output models depending on
    status

    :returns: The job_exe model
    :rtype: :class:`job.execution.job_exe.RunningJobExecution`
    """

    when = timezone.now()
    if not job:
        job = create_job(job_type=job_type,
                         status=status,
                         input_file_size=input_file_size)
    job_type = job.job_type

    job_exe = JobExecution()
    job_exe.job = job
    job_exe.job_type = job_type
    if not exe_num:
        exe_num = job.num_exes
    job_exe.exe_num = exe_num
    job_exe.set_cluster_id('1234', job.id, job_exe.exe_num)
    if not node:
        node = node_utils.create_node()
    job_exe.node = node
    if not timeout:
        timeout = job.timeout
    job_exe.timeout = timeout
    job_exe.input_file_size = input_file_size
    job_exe.resources = job.get_resources().get_json().get_dict()
    job_exe.configuration = ExecutionConfiguration().get_dict()
    if not queued:
        queued = when
    job_exe.queued = queued
    if not started:
        started = when + datetime.timedelta(seconds=1)
    job_exe.started = started
    job_exe.save()

    if status in ['COMPLETED', 'FAILED', 'CANCELED']:
        job_exe_end = JobExecutionEnd()
        job_exe_end.job_exe_id = job_exe.id
        job_exe_end.job = job_exe.job
        job_exe_end.job_type = job_exe.job_type
        job_exe_end.exe_num = job_exe.exe_num
        if not task_results:
            task_results = TaskResults()
        job_exe_end.task_results = task_results.get_dict()
        job_exe_end.status = status
        if status == 'FAILED' and not error:
            error = error_test_utils.create_error()
        job_exe_end.error = error
        job_exe_end.node = node
        job_exe_end.queued = queued
        job_exe_end.started = started
        job_exe_end.seed_started = task_results.get_task_started('main')
        job_exe_end.seed_ended = task_results.get_task_ended('main')
        if not ended:
            ended = started + datetime.timedelta(seconds=1)
        job_exe_end.ended = ended
        job_exe_end.save()

    if status == 'COMPLETED' or output:
        job_exe_output = JobExecutionOutput()
        job_exe_output.job_exe_id = job_exe.id
        job_exe_output.job = job_exe.job
        job_exe_output.job_type = job_exe.job_type
        job_exe_output.exe_num = job_exe.exe_num
        if not output:
            output = JobResults()
        job_exe_output.output = output.get_dict()
        job_exe_output.save()

    return job_exe
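A test would normally call this utility with only the fields it cares about and let the defaults fill in the rest. A hypothetical usage sketch (the import paths and the assertion are assumptions for illustration, not taken from the source):

    # Hypothetical test usage; import paths are assumed for illustration.
    from django.test import TransactionTestCase

    from job.models import JobExecutionEnd
    from job.test import utils as job_test_utils


    class TestJobExecutionEnd(TransactionTestCase):

        def test_completed_execution_creates_end_model(self):
            job_exe = job_test_utils.create_job_exe(status='COMPLETED')
            # A COMPLETED status should also have created the job_exe_end row
            self.assertTrue(JobExecutionEnd.objects.filter(job_exe_id=job_exe.id).exists())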
Example #8
    def test_append_stderr_none(self):
        '''Tests skipping append when no error is provided.'''
        job_exe = JobExecution(stderr='initial')
        job_exe.append_stderr(None)

        self.assertEqual(job_exe.stderr, 'initial')
Example #9
    def test_append_stderr_join(self):
        '''Tests appending error to existing error.'''
        job_exe = JobExecution(stderr='initial')
        job_exe.append_stderr('-test1')

        self.assertEqual(job_exe.stderr, 'initial-test1')
Example #10
    def test_append_stderr_initial(self):
        '''Tests appending error for the first time.'''
        job_exe = JobExecution()
        job_exe.append_stderr(stderr='initial')

        self.assertEqual(job_exe.stderr, 'initial')
Example #11
    def test_append_stdout_none(self):
        '''Tests skipping append when no output is provided.'''
        job_exe = JobExecution(stdout='initial')
        job_exe.append_stdout(None)

        self.assertEqual(job_exe.stdout, 'initial')
Example #12
    def create_job_exe_model(self, framework_id, when):
        """Creates and returns a scheduled job execution model

        :param framework_id: The scheduling framework ID
        :type framework_id: string
        :param when: The start time
        :type when: :class:`datetime.datetime`
        :returns: The job execution model
        :rtype: :class:`job.models.JobExecution`
        """

        job_exe = JobExecution()
        job_exe.job_id = self._queue.job_id
        job_exe.job_type_id = self._queue.job_type_id
        job_exe.recipe_id = self._queue.recipe_id
        job_exe.batch_id = self._queue.batch_id
        job_exe.exe_num = self._queue.exe_num
        job_exe.timeout = self._queue.timeout
        job_exe.docker_image = self._queue.docker_image
        job_exe.input_file_size = self._queue.input_file_size
        job_exe.configuration = self.configuration.get_dict()
        job_exe.queued = self._queue.queued

        if self.is_canceled:
            job_exe.node_id = None
            job_exe.resources = NodeResources().get_json().get_dict()
            job_exe.started = None
        else:
            job_exe.node_id = self._scheduled_node_id
            job_exe.resources = self._scheduled_resources.get_json().get_dict()
            job_exe.started = when

        job_exe.set_cluster_id(framework_id, self._queue.job_id, self._queue.exe_num)

        if self.required_resources.gpus > 0:
            if not GPUManager.assign_gpus_for_job(job_exe.node_id, job_exe.job_id, self.required_resources.gpus):
                logger.error('Job %s was unable to assign %s reserved GPUs on node %s. This should not be possible; '
                             'something has gone horribly wrong.', job_exe.job_id, self.required_resources.gpus,
                             job_exe.node_id)

        return job_exe
Example #13
    def test_append_stdout_initial(self):
        '''Tests appending output for the first time.'''
        job_exe = JobExecution()
        job_exe.append_stdout(stdout='initial')

        self.assertEqual(job_exe.stdout, 'initial')
Example #14
    def update(self, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.
        """

        started = now()

        model = utils.create_task_update_model(status)
        mesos_status = model.status
        task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status), utils.get_status_data(status))
        task_id = task_update.task_id
        was_task_finished = task_update.status in TaskStatusUpdate.TERMINAL_STATUSES
        was_job_finished = False

        if mesos_status == 'TASK_ERROR':
            logger.error('Status update for task %s: %s', task_id, mesos_status)
        elif mesos_status == 'TASK_LOST':
            logger.warning('Status update for task %s: %s', task_id, mesos_status)
        else:
            logger.info('Status update for task %s: %s', task_id, mesos_status)

        # Since we have a status update for this task, remove it from reconciliation set
        recon_mgr.remove_task_id(task_id)

        # Hand off task update to be saved in the database
        if task_id.startswith(JOB_TASK_ID_PREFIX):
            # Grab job execution ID from manager
            cluster_id = JobExecution.parse_cluster_id(task_id)
            job_exe = job_exe_mgr.get_running_job_exe(cluster_id)
            if job_exe:
                model.job_exe_id = job_exe.id
        task_update_mgr.add_task_update(model)

        # Update task with latest status
        # This should happen before the job execution or node manager are updated, since they will assume that the task
        # has already been updated
        task_mgr.handle_task_update(task_update)

        if task_id.startswith(JOB_TASK_ID_PREFIX):
            # Job task, so update the job execution
            try:
                job_exe = job_exe_mgr.handle_task_update(task_update)
                if job_exe and job_exe.is_finished():
                    logger.info("job_exe with job id %s and node id %s is finished", job_exe.job_id, job_exe.node_id)
                    was_job_finished = True
                    cleanup_mgr.add_job_execution(job_exe)
                    GPUManager.release_gpus(job_exe.node_id, job_exe.job_id)

            except Exception:
                cluster_id = JobExecution.parse_cluster_id(task_id)
                logger.exception('Error handling status update for job execution: %s', cluster_id)
                # Error handling status update, add task so it can be reconciled
                task = task_mgr.get_task(task_id)
                if task:
                    recon_mgr.add_tasks([task])
        else:
            # Not a job task, so must be either a node or system task
            node_mgr.handle_task_update(task_update)
            system_task_mgr.handle_task_update(task_update)

        scheduler_mgr.add_task_update_counts(was_task_finished, was_job_finished)

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
Example #15
    def statusUpdate(self, driver, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        """

        started = now()

        model = utils.create_task_update_model(status)
        mesos_status = model.status
        task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status))
        task_id = task_update.task_id

        if mesos_status == 'TASK_LOST':
            logger.warning('Status update for task %s: %s', task_id, mesos_status)
        else:
            logger.info('Status update for task %s: %s', task_id, mesos_status)

        # Since we have a status update for this task, remove it from reconciliation set
        recon_mgr.remove_task_id(task_id)

        # Hand off task update to be saved in the database
        task_update_mgr.add_task_update(model)

        if task_id.startswith(CLEANUP_TASK_ID_PREFIX):
            cleanup_mgr.handle_task_update(task_update)
        else:
            job_exe_id = JobExecution.get_job_exe_id(task_id)

            try:
                running_job_exe = running_job_mgr.get_job_exe(job_exe_id)

                if running_job_exe:
                    running_job_exe.task_update(task_update)

                    # Remove finished job execution
                    if running_job_exe.is_finished():
                        running_job_mgr.remove_job_exe(job_exe_id)
                        cleanup_mgr.add_job_execution(running_job_exe)
                else:
                    # Scheduler doesn't have any knowledge of this job execution
                    Queue.objects.handle_job_failure(job_exe_id, now(), [],
                                                     Error.objects.get_builtin_error('scheduler-lost'))
            except Exception:
                logger.exception('Error handling status update for job execution: %s', job_exe_id)
                # Error handling status update, add task so it can be reconciled
                recon_mgr.add_task_ids([task_id])

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
Example #16
def create_running_job_exe(agent_id='agent_1',
                           job_type=None,
                           job=None,
                           node=None,
                           timeout=None,
                           input_file_size=10.0,
                           queued=None,
                           started=None,
                           resources=None,
                           priority=None,
                           num_exes=1):
    """Creates a running job execution for unit testing

    :returns: The running job execution
    :rtype: :class:`job.execution.job_exe.RunningJobExecution`
    """

    when = timezone.now()
    if not job:
        job = create_job(job_type=job_type,
                         status='RUNNING',
                         input_file_size=input_file_size,
                         num_exes=num_exes)
    job_type = job.job_type

    # Configuration that occurs at queue time
    input_files = {}
    input_file_ids = job.get_job_data().get_input_file_ids()
    if input_file_ids:
        for input_file in ScaleFile.objects.get_files_for_queued_jobs(
                input_file_ids):
            input_files[input_file.id] = input_file
    exe_config = QueuedExecutionConfigurator(input_files).configure_queued_job(
        job)

    job_exe = JobExecution()
    job_exe.set_cluster_id('1234', job.id, job.num_exes)
    job_exe.job = job
    job_exe.job_type = job_type
    job_exe.exe_num = job.num_exes
    if not node:
        node = node_utils.create_node()
    job_exe.node = node
    if not timeout:
        timeout = job.timeout
    job_exe.timeout = timeout
    job_exe.input_file_size = input_file_size
    if not resources:
        resources = job.get_resources()
    job_exe.resources = resources.get_json().get_dict()
    job_exe.configuration = exe_config.get_dict()
    if not queued:
        queued = when
    job_exe.queued = queued
    if not started:
        started = when + datetime.timedelta(seconds=1)
    job_exe.started = started
    job_exe.save()

    if not priority:
        priority = job.priority

    # Configuration that occurs at schedule time
    workspaces = {}
    for workspace in Workspace.objects.all():
        workspaces[workspace.name] = workspace
    secret_config = ScheduledExecutionConfigurator(
        workspaces).configure_scheduled_job(job_exe, job_type,
                                            job_type.get_job_interface(),
                                            'INFO')
    return RunningJobExecution(agent_id, job_exe, job_type, secret_config,
                               priority)
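Usage mirrors create_job_exe from Example #7, except the caller gets back the in-memory RunningJobExecution wrapper rather than the database model. A hypothetical call (import path assumed; the final assertion reflects the expected initial state, not a documented guarantee):

    # Hypothetical test usage; the import path is assumed for illustration.
    from job.test import utils as job_test_utils

    running_job_exe = job_test_utils.create_running_job_exe(agent_id='agent_1', num_exes=1)
    # A freshly created execution is expected to still be running
    assert not running_job_exe.is_finished()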
Example #17
    def create_job_exe_model(self, framework_id, when):
        """Creates and returns a scheduled job execution model

        :param framework_id: The scheduling framework ID
        :type framework_id: string
        :param when: The start time
        :type when: :class:`datetime.datetime`
        :returns: The job execution model
        :rtype: :class:`job.models.JobExecution`
        """

        job_exe = JobExecution()
        job_exe.job_id = self._queue.job_id
        job_exe.job_type_id = self._queue.job_type_id
        job_exe.recipe_id = self._queue.recipe_id
        job_exe.batch_id = self._queue.batch_id
        job_exe.exe_num = self._queue.exe_num
        job_exe.timeout = self._queue.timeout
        job_exe.input_file_size = self._queue.input_file_size
        job_exe.configuration = self.configuration.get_dict()
        job_exe.queued = self._queue.queued

        if self.is_canceled:
            job_exe.node_id = None
            job_exe.resources = NodeResources().get_json().get_dict()
            job_exe.started = None
        else:
            job_exe.node_id = self._scheduled_node_id
            job_exe.resources = self._scheduled_resources.get_json().get_dict()
            job_exe.started = when

        job_exe.set_cluster_id(framework_id, self._queue.job_id,
                               self._queue.exe_num)

        return job_exe
Example #18
    def test_append_stdout_join(self):
        '''Tests appending output to existing output.'''
        job_exe = JobExecution(stdout='initial')
        job_exe.append_stdout('-test1')

        self.assertEqual(job_exe.stdout, 'initial-test1')
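Taken together, Examples #8-#11, #13, and #18 pin down the expected behavior of append_stdout/append_stderr: a None argument is a no-op, new text is concatenated onto any existing value, and it is stored directly when nothing has been captured yet. A minimal sketch consistent with those tests (an illustration of the contract, not the project's actual implementation):

    class JobExecutionSketch(object):
        '''Illustrative stand-in for JobExecution, covering only stdout/stderr appending.'''

        def __init__(self, stdout=None, stderr=None):
            self.stdout = stdout
            self.stderr = stderr

        def append_stdout(self, stdout=None):
            if stdout is None:
                return  # nothing to append, leave existing output untouched
            self.stdout = (self.stdout or '') + stdout

        def append_stderr(self, stderr=None):
            if stderr is None:
                return  # nothing to append, leave existing error output untouched
            self.stderr = (self.stderr or '') + stderr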