Пример #1
0
    def statusUpdate(self, driver, status):
        '''
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        The task is first dropped from the reconciliation set (an update for it
        has just arrived). If processing the update then fails, the task is put
        back into the set so that it can be reconciled later.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.

        :param driver: The scheduler driver delivering the update (not used by this method)
        :param status: The task status update; its ``state`` and ``task_id.value`` fields are read
        '''

        if self.debug:
            connect_remote_debug()

        status_str = utils.status_to_string(status.state)
        task_id = status.task_id.value
        job_exe_id = ScaleJobExecution.get_job_exe_id(task_id)
        logger.info('Status update for task %s: %s', task_id, status_str)

        # Got a status update, so remove task from reconciliation set. The lock is used as a context manager so
        # that it is only ever released after having actually been acquired.
        with self.recon_lock:
            self.recon_set.discard(task_id)

        try:
            scale_job_exe = self._get_job_exe(job_exe_id)
            if not scale_job_exe:
                # Scheduler doesn't have any knowledge of this job execution
                error = get_scheduler_error()
                Queue.objects.handle_job_failure(job_exe_id, now(), error)
                return

            if status.state == mesos_pb2.TASK_RUNNING:
                scale_job_exe.task_running(task_id, status)
            elif status.state == mesos_pb2.TASK_FINISHED:
                scale_job_exe.task_completed(task_id, status)
            elif status.state in [mesos_pb2.TASK_LOST, mesos_pb2.TASK_ERROR,
                                  mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                # The task had an error so job execution is failed
                scale_job_exe.task_failed(task_id, status)
            # Any other state leaves the job execution untouched, but it is still checked for completion below
            if scale_job_exe.is_finished():
                # No more tasks so job execution is completed
                self._delete_job_exe(scale_job_exe)
        except Exception:
            # Catch ordinary errors only; interpreter-exit signals (SystemExit, KeyboardInterrupt) propagate
            logger.exception('Error handling status update for job execution: %s', job_exe_id)
            # Error handling status update, add task so it can be reconciled
            with self.recon_lock:
                self.recon_set.add(task_id)
Пример #2
0
    def resourceOffers(self, driver, offers):
        '''
        Invoked when resources have been offered to this framework. A single
        offer will only contain resources from a single slave.  Resources
        associated with an offer will not be re-offered to _this_ framework
        until either (a) this framework has rejected those resources (see
        SchedulerDriver.launchTasks) or (b) those resources have been
        rescinded (see Scheduler.offerRescinded).  Note that resources may be
        concurrently offered to more than one framework at a time (depending
        on the allocator being used).  In that case, the first framework to
        launch tasks using those resources will be able to use them while the
        other frameworks will have those resources rescinded (or if a
        framework has already launched tasks with those resources then those
        tasks will fail with a TASK_LOST status and a message saying as much).

        Processing happens in three passes over the offers: next tasks of job
        executions already tracked for the offering slave are scheduled, new
        job executions are pulled off the queue (only while the master
        scheduler is active), and finally every offer is answered through
        ``driver.launchTasks``. If anything fails unexpectedly, the offers that
        have not been answered yet are answered with an empty task list.

        See documentation for :meth:`mesos_api.mesos.Scheduler.resourceOffers`.

        :param driver: The scheduler driver; ``launchTasks`` is called on it once per offer
        :param offers: The offers received from Mesos, converted via ``self._create_scale_offers``
        '''

        if self.debug:
            connect_remote_debug()

        # Compile a list of all of the offers and register nodes
        scale_offers = self._create_scale_offers(driver, offers)

        try:
            for scale_offer in scale_offers:
                logger.debug('Offer of %f CPUs, %f MiB memory, and %f MiB disk space from %s', scale_offer.cpus,
                             scale_offer.mem, scale_offer.disk, scale_offer.hostname)

            # Schedule any needed tasks for Scale jobs that are currently running even if the scheduler or individual nodes
            # are paused
            for scale_offer in scale_offers:
                slave_id = scale_offer.slave_id

                try:
                    Node.objects.update_last_offer(slave_id)
                except Exception:
                    # Best effort: a failed bookkeeping update must not prevent scheduling on this offer
                    logger.exception('Error updating node last offer for slave_id %s', slave_id)

                with self.current_jobs_lock:
                    current_job_exes = self.current_jobs[slave_id]

                    for scale_job_exe in current_job_exes:
                        # Get updated remaining resources from offer
                        cpus = scale_offer.cpus
                        mem = scale_offer.mem
                        disk = scale_offer.disk
                        if scale_job_exe.is_next_task_ready(cpus, mem, disk):
                            try:
                                # We need to have the current_jobs lock when we do this
                                # and be using the real scale_job_exe not a copy
                                task = scale_job_exe.start_next_task()
                                cpus, mem, disk = scale_job_exe.get_current_task_resources()
                                scale_offer.add_task(task, cpus, mem, disk)
                            except Exception:
                                logger.exception('Error trying to create Mesos task for job execution: %s',
                                                 scale_job_exe.job_exe_id)

            # Schedule jobs off of the queue. If the scheduler is paused, don't add new jobs
            # TODO: discuss into first() instead
            if models.Scheduler.objects.is_master_active():
                for scale_offer in scale_offers:
                    if scale_offer.can_run_new_jobs:
                        try:
                            scheduled_job_exes = Queue.objects.schedule_jobs_on_node(scale_offer.cpus, scale_offer.mem,
                                                                                     scale_offer.disk, scale_offer.node)
                            for job_exe in scheduled_job_exes:
                                scale_job_exe = ScaleJobExecution(job_exe, job_exe.cpus_scheduled, job_exe.mem_scheduled,
                                                                  job_exe.disk_in_scheduled, job_exe.disk_out_scheduled,
                                                                  job_exe.disk_total_scheduled)
                                task = scale_job_exe.start_next_task()
                                cpus, mem, disk = scale_job_exe.get_current_task_resources()
                                self._add_job_exe(scale_offer.slave_id, scale_job_exe)
                                scale_offer.add_task(task, cpus, mem, disk)
                        except Exception:
                            logger.exception('Error trying to schedule a job off of the queue')

            # Tell Mesos to launch tasks! Offers are popped as they are answered so that, should something fail,
            # the handler below only sees the offers that still need an answer.
            while scale_offers:
                scale_offer = scale_offers.pop(0)
                num_tasks = len(scale_offer.tasks)
                if num_tasks > 0:
                    logger.info('Scheduling %i task(s) on node: %s', num_tasks, scale_offer.hostname)
                else:
                    logger.debug('No tasks to schedule on node: %s', scale_offer.hostname)

                driver.launchTasks(scale_offer.offer_id, scale_offer.tasks)
        except Exception:
            # We must accept or decline all offers, so there's a catch-all here to ensure this happens. The failure
            # is logged rather than silently dropped, and each remaining offer is answered independently so that one
            # failing call does not leave the rest unanswered.
            logger.exception('Error handling resource offers, declining remaining offers')
            for scale_offer in scale_offers:
                try:
                    driver.launchTasks(scale_offer.offer_id, [])
                except Exception:
                    logger.exception('Error declining offer from node: %s', scale_offer.hostname)