def create_tasks(MV, available_offers, driver, command, ns):
    """Launch up to `MV` mesos tasks, depending on availability of mesos
    resources.

    `MV` max number of mesos tasks to spin up. Relay chooses this number.
    `available_offers` an iterable of (offer, ntasks) pairs: a mesos offer
        and the number of tasks that offer can support.
        NOTE(review): the original docstring called this a dict, but the
        code iterates it as pairs.
    `driver` a mesos driver instance.
    `command` the shell command the spawned tasks will run.
    `ns` a namespace carrying mesos framework configuration.

    Returns the number of tasks actually launched.
    """
    n_fulfilled = 0
    for offer, ntasks in available_offers:
        if n_fulfilled >= MV:
            # demand already satisfied; give the offer back to mesos
            driver.declineOffer(offer.id)
            continue
        tasks = []
        for ID in range(ntasks):
            if n_fulfilled >= MV:
                break
            n_fulfilled += 1
            # BUGFIX: sys.maxint is Python-2-only; sys.maxsize exists on
            # Python 2.6+ and Python 3 alike
            tid = "%s.%s.%s" % (
                ID, offer.id.value, random.randint(1, sys.maxsize))
            log.debug(
                "Accepting offer to start a task", extra=dict(
                    offer_host=offer.hostname, task_id=tid,
                    mesos_framework_name=ns.mesos_framework_name))
            task = _create_task(tid, offer, command, ns)
            tasks.append(task)
        driver.launchTasks(offer.id, tasks)
    return n_fulfilled
def create_tasks(MV, available_offers, driver, command, ns):
    """Launch up to `MV` mesos tasks, depending on availability of mesos
    resources.

    `MV` max number of mesos tasks to spin up; Relay chooses this number.
    `available_offers` pairs of (mesos offer, num tasks it can support).
        NOTE(review): the original docstring called this a dict, but the
        code iterates it as (offer, ntasks) pairs.
    `driver` a mesos driver instance.
    `command` shell command each launched task executes.
    `ns` namespace carrying mesos framework configuration.

    Returns the number of tasks launched.
    """
    n_fulfilled = 0
    for offer, ntasks in available_offers:
        if n_fulfilled >= MV:
            # no remaining demand: decline so mesos can re-offer elsewhere
            driver.declineOffer(offer.id)
            continue
        tasks = []
        for ID in range(ntasks):
            if n_fulfilled >= MV:
                break
            n_fulfilled += 1
            # BUGFIX: sys.maxint was removed in Python 3; sys.maxsize is
            # available on both Python 2.6+ and Python 3
            tid = "%s.%s.%s" % (ID, offer.id.value,
                                random.randint(1, sys.maxsize))
            log.debug("Accepting offer to start a task",
                      extra=dict(offer_host=offer.hostname, task_id=tid,
                                 mesos_framework_name=ns.mesos_framework_name))
            task = _create_task(tid, offer, command, ns)
            tasks.append(task)
        driver.launchTasks(offer.id, tasks)
    return n_fulfilled
def _get_and_update_relay(self, available_offers): """ Get num tasks I should create and evaluate whether to use Relay's warmer or cooler command. Update the MV with number of commands about to be created. Competes for the MV with these other threads, and will wait indefinitely for it: - other Mesos resourceOffers(...) calls to the Framework scheduler - Relay warmer and cooler functions attempting to ask the Framework to execute more tasks. """ command = None with self.MV.get_lock(): MV, t = self.MV # create tasks that fulfill relay's requests or return if MV == 0: log.debug( 'mesos scheduler has received no requests from relay', extra=dict( mesos_framework_name=self.ns.mesos_framework_name)) else: if MV > 0 and self.ns.warmer: command = self.ns.warmer elif MV < 0 and self.ns.cooler: command = self.ns.cooler if abs(MV) < len(available_offers): self.MV[:] = [0, time.time()] else: new_MV = MV - (MV > 0 or -1) * max(abs(MV), len(available_offers)) self.MV[:] = [new_MV, time.time()] return (MV, command)
def init_relay(ns_relay, mesos_ready, mesos_framework_name):
    """Block until the mesos framework signals readiness, then start Relay.

    `ns_relay` options namespace passed through to relay's main loop.
    `mesos_ready` a Condition notified once the mesos framework registers.
    `mesos_framework_name` used only for log context.
    """
    log.debug(
        'Relay waiting to start until mesos framework is registered',
        extra=dict(mesos_framework_name=mesos_framework_name))
    # BUGFIX: the condition was previously acquire()d and never released,
    # leaving the lock held for the life of the process.  The context
    # manager guarantees release once the wait completes.
    with mesos_ready:
        mesos_ready.wait()
    log.debug(
        'Relay notified that mesos framework is registered',
        extra=dict(mesos_framework_name=mesos_framework_name))
    relay_main(ns_relay)
def offerRescinded(self, driver, offerId):
    """Invoked when an offer is no longer valid (e.g., the slave was lost
    or another framework used resources in the offer).

    If, for whatever reason, an offer is never rescinded (e.g., dropped
    message, failing over framework, etc.), a framework that attempts to
    launch tasks using an invalid offer will receive TASK_LOST status
    updates for those tasks.

    NOTE(review): the original docstring here was copied from the task
    status-update callback and described the wrong event; replaced with
    the rescind semantics from the mesos Scheduler interface.
    """
    log.debug('offer rescinded', extra=dict(
        offer_id=offerId.value,
        mesos_framework_name=self.ns.mesos_framework_name))
def _warmer_cooler_wrapper(n):
    # Inform mesos that it should spin up n tasks of type f, where f is
    # either the warmer or cooler.  Since Relay assumes that the choice of
    # `f` (warmer vs cooler) is determined by the sign of n (positive ->
    # warmer, negative -> cooler), we can too.
    #
    # NOTE: `MV`, `ns` and `log` are free variables captured from the
    # enclosing scope; MV holds a shared [count, timestamp] pair guarded
    # by its own lock.
    log.debug(
        'asking mesos to spawn tasks', extra=dict(
            mesos_framework_name=ns.mesos_framework_name, task_num=n,
            task_type="warmer" if n > 0 else "cooler"))
    t = time.time()
    with MV.get_lock():
        # only overwrite the shared request if ours is newer than the
        # last recorded write (MV[1] is the timestamp of that write)
        if MV[1] < t:
            MV[:] = (n, t)
    log.debug(
        '...finished asking mesos to spawn tasks', extra=dict(
            mesos_framework_name=ns.mesos_framework_name, task_num=n,
            task_type="warmer" if n > 0 else "cooler"))
def _statusUpdate(self, driver, update):
    """Record a task status update and abort once too many tasks fail.

    TASK_FAILED / TASK_LOST increment a running failure counter, while
    TASK_FINISHED / TASK_STARTING decrement it (never below zero).  When
    the counter reaches ns.max_failures the driver is stopped and
    MaxFailuresReached is raised.  A max_failures of -1 disables the
    check entirely.
    """
    log.debug(
        'task status update: %s' % str(update.message),
        extra=dict(
            task_id=update.task_id.value, task_state=update.state,
            slave_id=update.slave_id.value, timestamp=update.timestamp,
            mesos_framework_name=self.ns.mesos_framework_name))
    if self.ns.max_failures == -1:
        # failure tracking disabled: never quit, no matter what happens
        return
    pb = mesos_pb2
    failing_states = (pb.TASK_FAILED, pb.TASK_LOST)
    healthy_states = (pb.TASK_FINISHED, pb.TASK_STARTING)
    if update.state in failing_states:
        self.failures += 1
    elif update.state in healthy_states:
        self.failures = max(self.failures - 1, 0)
    if self.failures >= self.ns.max_failures:
        log.error(
            "Max allowable number of failures reached",
            extra=dict(
                max_failures=self.failures,
                mesos_framework_name=self.ns.mesos_framework_name))
        driver.stop()
        raise MaxFailuresReached(self.failures)
def _statusUpdate(self, driver, update):
    # Mesos Scheduler callback: a task's state changed.  Log the update
    # and track failures so a framework that only ever fails can abort.
    log.debug('task status update: %s' % str(update.message), extra=dict(
        task_id=update.task_id.value,
        task_state=update.state,
        slave_id=update.slave_id.value,
        timestamp=update.timestamp,
        mesos_framework_name=self.ns.mesos_framework_name))
    if self.ns.max_failures == -1:
        return  # don't quit even if you are getting failures
    m = mesos_pb2
    if update.state in [m.TASK_FAILED, m.TASK_LOST]:
        # a task died: count it against the failure budget
        self.failures += 1
    elif update.state in [m.TASK_FINISHED, m.TASK_STARTING]:
        # healthy progress pays the failure budget back down (floor at 0)
        self.failures = max(self.failures - 1, 0)
    if self.failures >= self.ns.max_failures:
        log.error("Max allowable number of failures reached", extra=dict(
            max_failures=self.failures,
            mesos_framework_name=self.ns.mesos_framework_name))
        driver.stop()
        # surface the abort to whatever drives the scheduler loop
        raise MaxFailuresReached(self.failures)
def _resourceOffers(self, driver, offers):
    """Invoked when resources have been offered to this framework. A
    single offer will only contain resources from a single slave.
    Resources associated with an offer will not be re-offered to _this_
    framework until either (a) this framework has rejected those
    resources (see SchedulerDriver.launchTasks) or (b) those resources
    have been rescinded (see Scheduler.offerRescinded).  Note that
    resources may be concurrently offered to more than one framework at a
    time (depending on the allocator being used).  In that case, the
    first framework to launch tasks using those resources will be able to
    use them while the other frameworks will have those resources
    rescinded (or if a framework has already launched tasks with those
    resources then those tasks will fail with a TASK_LOST status and a
    message saying as much).
    """
    log.debug("Got resource offers", extra=dict(
        num_offers=len(offers),
        mesos_framework_name=self.ns.mesos_framework_name))
    # split offers into those that can fit a task given the configured
    # per-task resource requirements, and those that cannot
    available_offers, decline_offers = filter_offers(
        offers, dict(self.ns.mesos_task_resources))
    for offer in decline_offers:
        driver.declineOffer(offer.id)
    if not available_offers:
        log.debug(
            'None of the mesos offers had enough relevant resources',
            extra=dict(mesos_framework_name=self.ns.mesos_framework_name))
        return
    log.debug('Mesos has offers available', extra=dict(
        available_offers=len(available_offers),
        max_runnable_tasks=sum(x[1] for x in available_offers),
        mesos_framework_name=self.ns.mesos_framework_name))
    # ask Relay how many tasks to run and with which command
    MV, command = self._get_and_update_relay(available_offers)
    if command is None:
        # nothing to do: give every usable offer back to mesos
        for offer, _ in available_offers:
            driver.declineOffer(offer.id)
        return
    create_tasks(
        MV=abs(MV), available_offers=available_offers, driver=driver,
        command=command, ns=self.ns)
    driver.reviveOffers()
def _resourceOffers(self, driver, offers):
    """Mesos Scheduler callback: new resource offers are available.

    A single offer only contains resources from a single slave, and its
    resources will not be re-offered to this framework until rejected
    (SchedulerDriver.launchTasks) or rescinded (Scheduler.offerRescinded).
    Offers may simultaneously go to other frameworks; the first framework
    to launch wins and the others see rescinds or TASK_LOST.

    Offers that cannot fit one task are declined immediately; the rest
    are handed to Relay via _get_and_update_relay, which decides how many
    tasks to start and with which command.  If Relay has no work, every
    usable offer is declined as well.
    """
    fw = self.ns.mesos_framework_name
    log.debug("Got resource offers",
              extra=dict(num_offers=len(offers), mesos_framework_name=fw))
    usable, rejected = filter_offers(
        offers, dict(self.ns.mesos_task_resources))
    for bad_offer in rejected:
        driver.declineOffer(bad_offer.id)
    if not usable:
        log.debug('None of the mesos offers had enough relevant resources',
                  extra=dict(mesos_framework_name=fw))
        return
    log.debug('Mesos has offers available', extra=dict(
        available_offers=len(usable),
        max_runnable_tasks=sum(ntasks for _, ntasks in usable),
        mesos_framework_name=fw))
    mv, command = self._get_and_update_relay(usable)
    if command is None:
        for good_offer, _ in usable:
            driver.declineOffer(good_offer.id)
        return
    create_tasks(MV=abs(mv), available_offers=usable, driver=driver,
                 command=command, ns=self.ns)
    driver.reviveOffers()