def resourceOffers(self, driver, offers):
    """Answer a batch of resource offers, launching pending tasks on them.

    Every offer in ``offers`` is answered exactly once:

    * if no tasks are pending, further offers are suppressed and all
      offers are declined;
    * offers from hosts rejected by ``conf.ban`` are declined;
    * offers whose ``group`` attribute does not match
      ``self.options.group`` are declined with ``REFUSE_FILTER``;
    * all other offers are passed to ``driver.launchTasks`` together with
      the (possibly empty) list of tasks placed on them.

    Args:
        driver: scheduler driver providing ``suppressOffers``,
            ``declineOffer`` and ``launchTasks``.
        offers: list of offers; it is shuffled in place.
    """
    tpn = self.options.task_per_node
    random.shuffle(offers)
    self.last_offer_time = time.time()

    if not self.total_tasks:
        # Nothing left to schedule: stop the offer stream and hand back
        # everything we were just given.
        driver.suppressOffers()
        for o in offers:
            driver.declineOffer(o.id)
        return

    for offer in offers:
        try:
            banned = conf.ban(offer.hostname)
        except Exception:
            # A broken user-supplied ban() must not stop scheduling;
            # the host is then treated as not banned.
            logger.exception("bad ban() func in dpark.conf")
            banned = False
        if banned:
            logger.debug("skip offer on banned node: %s", offer.hostname)
            # Decline explicitly so the offer is not left unanswered.
            driver.declineOffer(offer.id)
            continue

        attrs = self.getAttributes(offer)
        group = attrs.get('group', 'None')
        # Groups starting with '_' are only usable when explicitly listed
        # in options.group; a configured options.group restricts all hosts.
        if ((self.options.group or group.startswith('_'))
                and group not in self.options.group):
            driver.declineOffer(offer.id, REFUSE_FILTER)
            continue

        cpus, mem, gpus = self.getResource(offer)
        logger.debug('got resource offer %s: cpus:%s, mem:%s, gpus:%s at %s',
                     offer.id.value, cpus, mem, gpus, offer.hostname)
        sid = offer.agent_id.value
        tasks = []
        # Keep placing tasks while tasks remain, the offer still has room
        # for one more task plus the executor overhead, and the per-agent
        # task limit (0 means unlimited) has not been reached.
        while (self.total_tasks
               and cpus >= self.cpus + EXECUTOR_CPUS
               and mem >= self.mem + EXECUTOR_MEMORY
               and gpus >= self.gpus
               and (tpn == 0
                    or (tpn > 0
                        and len(self.agentTasks.get(sid, set())) < tpn))):
            logger.debug('Accepting slot on agent %s (%s)',
                         offer.agent_id.value, offer.hostname)
            t = self.total_tasks.pop()
            task = self.create_task(offer, t)
            tasks.append(task)
            t.state = 'TASK_STARTING'
            t.state_time = time.time()
            self.task_launched[t.id] = t
            self.agentTasks.setdefault(sid, set()).add(t.id)
            cpus -= self.cpus
            mem -= self.mem
            gpus -= self.gpus

        logger.debug('dispatch %d tasks to agent %s',
                     len(tasks), offer.hostname)
        # Called even when ``tasks`` is empty, so the offer is always
        # answered.
        driver.launchTasks(offer.id, tasks, REFUSE_FILTER)
def resourceOffers(self, driver, offers):
    """Answer a batch of resource offers by dispatching tasks of active tasksets.

    Every offer in ``offers`` is answered exactly once:

    * if there are no active tasksets, further offers are suppressed and
      all offers are declined with a 5 minute refuse filter;
    * offers from hosts rejected by ``conf.ban`` are declined;
    * offers whose ``group`` attribute does not match ``self.group`` are
      declined with ``refuse_seconds=0xFFFFFFFF``;
    * offers from hosts reported unhealthy by ``self.task_host_manager``
      are declined for 1800 seconds;
    * remaining offers are launched with the tasks assigned to them, or
      declined when nothing was assigned.

    Args:
        driver: scheduler driver providing ``suppressOffers``,
            ``declineOffer`` and ``launchTasks``.
        offers: iterable of offers for this round.
    """
    if not self.active_tasksets:
        driver.suppressOffers()
        rf = Dict(refuse_seconds=60 * 5)
        for o in offers:
            driver.declineOffer(o.id, rf)
        return

    start = time.time()
    usable_offers = []
    for o in offers:
        try:
            banned = conf.ban(o.hostname)
        except Exception:
            # A broken user-supplied ban() must not stop scheduling;
            # the host is then treated as not banned.
            logger.exception("bad ban() func in dpark.conf")
            banned = False
        if banned:
            logger.debug("skip offer on banned node: %s", o.hostname)
            # Decline explicitly: banned offers are dropped from the list
            # below and would otherwise never be answered.
            driver.declineOffer(o.id)
            continue

        group = self.getAttribute(o.attributes, 'group') or 'None'
        # Groups starting with '_' are only usable when explicitly listed
        # in self.group; a configured self.group restricts all hosts.
        if (self.group or group.startswith('_')) and group not in self.group:
            driver.declineOffer(o.id, filters=Dict(refuse_seconds=0xFFFFFFFF))
            continue
        if self.task_host_manager.is_unhealthy_host(o.hostname):
            logger.warning('the host %s is unhealthy so skip it', o.hostname)
            driver.declineOffer(o.id, filters=Dict(refuse_seconds=1800))
            continue
        self.task_host_manager.register_host(o.hostname)
        usable_offers.append(o)
    offers = usable_offers

    # Remaining resources per offer, index-aligned with ``offers``.
    cpus = [self.getResource(o.resources, 'cpus') for o in offers]
    gpus = [self.getResource(o.resources, 'gpus') for o in offers]
    # Agents with no dispatched task yet get EXECUTOR_MEMORY reserved up
    # front (presumably for the executor that has to be started there).
    mems = [
        self.getResource(o.resources, 'mem')
        - (EXECUTOR_MEMORY
           if o.agent_id.value not in self.agent_id_to_ttids else 0)
        for o in offers
    ]

    tasks = {}  # offer id value -> list of tasks to launch on that offer
    for taskset in self.active_tasksets.values():
        # Keep offering to this taskset until it takes nothing more.
        while True:
            host_offers = {}
            for i, o in enumerate(offers):
                dispatched = self.agent_id_to_ttids.get(o.agent_id.value, 0)
                if dispatched >= self.task_per_node:
                    logger.debug('the task limit exceeded at host %s',
                                 o.hostname)
                    continue
                if (mems[i] < self.mem + EXECUTOR_MEMORY
                        or cpus[i] < self.cpus + EXECUTOR_CPUS):
                    continue
                # Keyed by hostname: a later offer from the same host
                # replaces an earlier one.
                host_offers[o.hostname] = (i, o)

            assigned_list = taskset.taskOffer(host_offers, cpus, mems, gpus)
            if not assigned_list:
                break

            for i, o, t in assigned_list:
                task = self.createTask(o, t)
                tasks.setdefault(o.id.value, []).append(task)
                logger.debug('dispatch %s into %s', t, o.hostname)

                ttid = task.task_id.value
                agent_id = o.agent_id.value
                taskset.ttids.add(ttid)
                self.ttid_to_agent_id[ttid] = agent_id
                self.agent_id_to_ttids[agent_id] = (
                    self.agent_id_to_ttids.get(agent_id, 0) + 1)

                # Never let the remaining cpu count go negative.
                cpus[i] -= min(cpus[i], t.cpus)
                mems[i] -= t.mem
                gpus[i] -= t.gpus

    used = time.time() - start
    if used > 10:
        logger.error('use too much time in resourceOffers: %.2fs', used)

    for o in offers:
        if o.id.value in tasks:
            driver.launchTasks(o.id, tasks[o.id.value])
        else:
            driver.declineOffer(o.id)
def resourceOffers(self, driver, offers):
    """Answer a batch of resource offers, launching pending tasks on them.

    Every offer in ``offers`` is answered exactly once:

    * if no tasks are pending, further offers are suppressed and all
      offers are declined;
    * offers from hosts rejected by ``conf.ban`` are declined;
    * offers whose ``unavailability`` window starts within
      ``conf.DEFAULT_TASK_TIME`` seconds from now are declined for 600
      seconds;
    * offers whose ``group`` attribute does not match
      ``self.options.group`` are declined with ``REFUSE_FILTER``;
    * all other offers are passed to ``driver.launchTasks`` together with
      the (possibly empty) list of tasks placed on them.

    Args:
        driver: scheduler driver providing ``suppressOffers``,
            ``declineOffer`` and ``launchTasks``.
        offers: list of offers; it is shuffled in place.
    """
    tpn = self.options.task_per_node
    random.shuffle(offers)
    self.last_offer_time = time.time()

    if not self.total_tasks:
        # Nothing left to schedule: stop the offer stream and hand back
        # everything we were just given.
        driver.suppressOffers()
        for o in offers:
            driver.declineOffer(o.id)
        return

    for offer in offers:
        try:
            banned = conf.ban(offer.hostname)
        except Exception:
            # A broken user-supplied ban() must not stop scheduling;
            # the host is then treated as not banned.
            logger.exception("bad ban() func in dpark.conf")
            banned = False
        if banned:
            logger.debug("skip offer on banned node: %s", offer.hostname)
            # Decline explicitly so the offer is not left unanswered.
            driver.declineOffer(offer.id)
            continue

        unavailability = offer.get('unavailability')
        if unavailability is not None:
            # Skip hosts whose unavailability window would begin before a
            # task of the default duration could finish.
            deadline = sec2nanosec(time.time() + conf.DEFAULT_TASK_TIME)
            if deadline >= unavailability['start']['nanoseconds']:
                logger.debug('the host %s plan to maintain, so skip it',
                             offer.hostname)
                driver.declineOffer(offer.id,
                                    filters=Dict(refuse_seconds=600))
                continue

        attrs = self.getAttributes(offer)
        group = attrs.get('group', 'None')
        # Groups starting with '_' are only usable when explicitly listed
        # in options.group; a configured options.group restricts all hosts.
        if ((self.options.group or group.startswith('_'))
                and group not in self.options.group):
            driver.declineOffer(offer.id, REFUSE_FILTER)
            continue

        cpus, mem, gpus = self.getResource(offer)
        logger.debug('got resource offer %s: cpus:%s, mem:%s, gpus:%s at %s',
                     offer.id.value, cpus, mem, gpus, offer.hostname)
        sid = offer.agent_id.value
        tasks = []
        # Keep placing tasks while tasks remain, the offer still has room
        # for one more task plus the executor overhead, and the per-agent
        # task limit (0 means unlimited) has not been reached.
        while (self.total_tasks
               and cpus >= self.cpus + EXECUTOR_CPUS
               and mem >= self.mem + EXECUTOR_MEMORY
               and gpus >= self.gpus
               and (tpn == 0
                    or (tpn > 0
                        and len(self.agentTasks.get(sid, set())) < tpn))):
            logger.debug('Accepting slot on agent %s (%s)',
                         offer.agent_id.value, offer.hostname)
            t = self.total_tasks.pop()
            task = self.create_task(offer, t)
            tasks.append(task)
            t.state = 'TASK_STARTING'
            # The same timestamp is recorded as the submit time and as the
            # time of the state change.
            now = time.time()
            self.stats['submit_times'][t.id] = now
            t.state_time = now
            self.task_launched[t.id] = t
            self.agentTasks.setdefault(sid, set()).add(t.id)
            cpus -= self.cpus
            mem -= self.mem
            gpus -= self.gpus

        logger.debug('dispatch %d tasks to agent %s',
                     len(tasks), offer.hostname)
        # Called even when ``tasks`` is empty, so the offer is always
        # answered.
        driver.launchTasks(offer.id, tasks, REFUSE_FILTER)