def test_send_framework_message(mocker): ID = str(uuid.uuid4()) sched = mocker.Mock() framework = {'id': {'value': ID}} master = mocker.Mock() driver = MesosSchedulerDriver(sched, framework, master) driver._send = mocker.Mock() executor_id = {'value': str(uuid.uuid4())} agent_id = {'value': str(uuid.uuid4())} message = ''.join( random.choice(string.printable) for _ in range(random.randint(1, 100))) message = encode_data(message.encode('utf-8')) driver.sendFrameworkMessage(executor_id, agent_id, message) driver._send.assert_not_called() driver._stream_id = 'a-stream-id' driver.sendFrameworkMessage(executor_id, agent_id, message) driver._send.assert_called_once_with({ 'type': 'MESSAGE', 'framework_id': { 'value': ID }, 'message': { 'agent_id': agent_id, 'executor_id': executor_id, 'data': message, } })
def test_send_framework_message(mocker): ID = str(uuid.uuid4()) sched = mocker.Mock() framework = {'id': {'value': ID}} master = mocker.Mock() driver = MesosSchedulerDriver(sched, framework, master) driver._send = mocker.Mock() executor_id = {'value': str(uuid.uuid4())} agent_id = {'value': str(uuid.uuid4())} message = ''.join(random.choice(string.printable) for _ in range(random.randint(1, 100))) message = encode_data(message.encode('utf-8')) driver.sendFrameworkMessage(executor_id, agent_id, message) driver._send.assert_not_called() driver._stream_id = 'a-stream-id' driver.sendFrameworkMessage(executor_id, agent_id, message) driver._send.assert_called_once_with({ 'type': 'MESSAGE', 'framework_id': { 'value': ID }, 'message': { 'agent_id': agent_id, 'executor_id': executor_id, 'data': message, } })
class ProcScheduler(Scheduler): def __init__(self): self.framework_id = None self.framework = self._init_framework() self.executor = None self.master = str(CONFIG.get("master", os.environ["MESOS_MASTER"])) self.driver = MesosSchedulerDriver(self, self.framework, self.master) self.procs_pending = {} self.procs_launched = {} self.slave_to_proc = {} self._lock = RLock() def _init_framework(self): framework = mesos_pb2.FrameworkInfo() framework.user = getpass.getuser() framework.name = repr(self) framework.hostname = socket.gethostname() return framework def _init_executor(self): executor = mesos_pb2.ExecutorInfo() executor.executor_id.value = "default" executor.command.value = "%s -m %s.executor" % (sys.executable, __package__) mem = executor.resources.add() mem.name = "mem" mem.type = mesos_pb2.Value.SCALAR mem.scalar.value = MIN_MEMORY cpus = executor.resources.add() cpus.name = "cpus" cpus.type = mesos_pb2.Value.SCALAR cpus.scalar.value = MIN_CPUS if "PYTHONPATH" in os.environ: var = executor.command.environment.variables.add() var.name = "PYTHONPATH" var.value = os.environ["PYTHONPATH"] executor.framework_id.value = str(self.framework_id.value) return executor def _init_task(self, proc, offer): task = mesos_pb2.TaskInfo() task.task_id.value = str(proc.id) task.slave_id.value = offer.slave_id.value task.name = repr(proc) task.executor.MergeFrom(self.executor) task.data = pickle.dumps(proc.params) cpus = task.resources.add() cpus.name = "cpus" cpus.type = mesos_pb2.Value.SCALAR cpus.scalar.value = proc.cpus mem = task.resources.add() mem.name = "mem" mem.type = mesos_pb2.Value.SCALAR mem.scalar.value = proc.mem return task def _filters(self, seconds): f = mesos_pb2.Filters() f.refuse_seconds = seconds return f def __repr__(self): return "%s[%s]: %s" % (self.__class__.__name__, os.getpid(), " ".join(sys.argv)) def registered(self, driver, framework_id, master_info): with self._lock: logger.info("Framework registered with id=%s, master=%s" % (framework_id, master_info)) self.framework_id = framework_id self.executor = self._init_executor() def resourceOffers(self, driver, offers): def get_resources(offer): cpus, mem = 0.0, 0.0 for r in offer.resources: if r.name == "cpus": cpus = float(r.scalar.value) elif r.name == "mem": mem = float(r.scalar.value) return cpus, mem with self._lock: random.shuffle(offers) for offer in offers: if not self.procs_pending: logger.debug("Reject offers forever for no pending procs, " "offers=%s" % (offers,)) driver.launchTasks(offer.id, [], self._filters(FOREVER)) continue cpus, mem = get_resources(offer) tasks = [] for proc in self.procs_pending.values(): if cpus >= proc.cpus and mem >= proc.mem: tasks.append(self._init_task(proc, offer)) del self.procs_pending[proc.id] self.procs_launched[proc.id] = proc cpus -= proc.cpus mem -= proc.mem seconds = 5 + random.random() * 5 driver.launchTasks(offer.id, tasks, self._filters(seconds)) if tasks: logger.info( "Accept offer for procs, offer=%s, " "procs=%s, filter_time=%s" % (offer, [int(t.task_id.value) for t in tasks], seconds) ) else: logger.info("Retry offer for procs later, offer=%s, " "filter_time=%s" % (offer, seconds)) def _call_finished(self, proc_id, success, message, data, slave_id=None): with self._lock: proc = self.procs_launched.pop(proc_id) if slave_id is not None: if slave_id in self.slave_to_proc: self.slave_to_proc[slave_id].remove(proc_id) else: for slave_id, procs in self.slave_to_proc.iteritems(): if proc_id in procs: procs.remove(proc_id) proc._finished(success, message, data) def statusUpdate(self, driver, update): with self._lock: proc_id = int(update.task_id.value) logger.info("Status update for proc, id=%s, state=%s" % (proc_id, update.state)) if update.state == mesos_pb2.TASK_RUNNING: if update.slave_id.value in self.slave_to_proc: self.slave_to_proc[update.slave_id.value].add(proc_id) else: self.slave_to_proc[update.slave_id.value] = set([proc_id]) proc = self.procs_launched[proc_id] proc._started() elif update.state >= mesos_pb2.TASK_FINISHED: slave_id = update.slave_id.value success = update.state == mesos_pb2.TASK_FINISHED message = update.message data = update.data and pickle.loads(update.data) self._call_finished(proc_id, success, message, data, slave_id) driver.reviveOffers() def offerRescinded(self, driver, offer_id): with self._lock: if self.procs_pending: logger.info("Revive offers for pending procs") driver.reviveOffers() def slaveLost(self, driver, slave_id): with self._lock: for proc_id in self.slave_to_proc.pop(slave_id, []): self._call_finished(proc_id, False, "Slave lost", None, slave_id) def error(self, driver, message): with self._lock: for proc in self.procs_pending.values(): self._call_finished(proc.id, False, "Stopped", None) for proc in self.procs_launched.values(): self._call_finished(proc.id, False, "Stopped", None) self.stop() def start(self): self.driver.start() def stop(self): assert not self.driver.aborted self.driver.stop() def submit(self, proc): if self.driver.aborted: raise RuntimeError("driver already aborted") with self._lock: if proc.id not in self.procs_pending: logger.info("Try submit proc, id=%s", (proc.id,)) self.procs_pending[proc.id] = proc if len(self.procs_pending) == 1: logger.info("Revive offers for pending procs") self.driver.reviveOffers() else: raise ValueError("Proc with same id already submitted") def cancel(self, proc): if self.driver.aborted: raise RuntimeError("driver already aborted") with self._lock: if proc.id in self.procs_pending: del self.procs_pending[proc.id] elif proc.id in self.procs_launched: del self.procs_launched[proc.id] self.driver.killTask(mesos_pb2.TaskID(value=str(proc.id))) for slave_id, procs in self.slave_to_proc.items(): procs.pop(proc.id) if not procs: del self.slave_to_proc[slave_id] def send_data(self, pid, type, data): if self.driver.aborted: raise RuntimeError("driver already aborted") msg = pickle.dumps((pid, type, data)) for slave_id, procs in self.slave_to_proc.iteritems(): if pid in procs: self.driver.sendFrameworkMessage(self.executor.executor_id, mesos_pb2.SlaveID(value=slave_id), msg) return raise RuntimeError("Cannot find slave for pid %s" % (pid,))
class ProcScheduler(Scheduler): def __init__(self): self.framework_id = None self.framework = self._init_framework() self.executor = None self.master = str(CONFIG.get('master', os.environ['MESOS_MASTER'])) self.driver = MesosSchedulerDriver(self, self.framework, self.master) self.procs_pending = {} self.procs_launched = {} self.slave_to_proc = {} self._lock = RLock() def _init_framework(self): framework = mesos_pb2.FrameworkInfo() framework.user = getpass.getuser() framework.name = repr(self) framework.hostname = socket.gethostname() return framework def _init_executor(self): executor = mesos_pb2.ExecutorInfo() executor.executor_id.value = 'default' executor.command.value = '%s -m %s.executor' % ( sys.executable, __package__) mem = executor.resources.add() mem.name = 'mem' mem.type = mesos_pb2.Value.SCALAR mem.scalar.value = MIN_MEMORY cpus = executor.resources.add() cpus.name = 'cpus' cpus.type = mesos_pb2.Value.SCALAR cpus.scalar.value = MIN_CPUS if 'PYTHONPATH' in os.environ: var = executor.command.environment.variables.add() var.name = 'PYTHONPATH' var.value = os.environ['PYTHONPATH'] executor.framework_id.value = str(self.framework_id.value) return executor def _init_task(self, proc, offer): task = mesos_pb2.TaskInfo() task.task_id.value = str(proc.id) task.slave_id.value = offer.slave_id.value task.name = repr(proc) task.executor.MergeFrom(self.executor) task.data = pickle.dumps(proc.params) cpus = task.resources.add() cpus.name = 'cpus' cpus.type = mesos_pb2.Value.SCALAR cpus.scalar.value = proc.cpus mem = task.resources.add() mem.name = 'mem' mem.type = mesos_pb2.Value.SCALAR mem.scalar.value = proc.mem return task def _filters(self, seconds): f = mesos_pb2.Filters() f.refuse_seconds = seconds return f def __repr__(self): return "%s[%s]: %s" % ( self.__class__.__name__, os.getpid(), ' '.join(sys.argv)) def registered(self, driver, framework_id, master_info): with self._lock: logger.info('Framework registered with id=%s, master=%s' % ( framework_id, master_info)) self.framework_id = framework_id self.executor = self._init_executor() def resourceOffers(self, driver, offers): def get_resources(offer): cpus, mem = 0.0, 0.0 for r in offer.resources: if r.name == 'cpus': cpus = float(r.scalar.value) elif r.name == 'mem': mem = float(r.scalar.value) return cpus, mem with self._lock: random.shuffle(offers) for offer in offers: if not self.procs_pending: logger.debug('Reject offers forever for no pending procs, ' 'offers=%s' % (offers, )) driver.launchTasks(offer.id, [], self._filters(FOREVER)) continue cpus, mem = get_resources(offer) tasks = [] for proc in self.procs_pending.values(): if cpus >= proc.cpus and mem >= proc.mem: tasks.append(self._init_task(proc, offer)) del self.procs_pending[proc.id] self.procs_launched[proc.id] = proc cpus -= proc.cpus mem -= proc.mem seconds = 5 + random.random() * 5 driver.launchTasks(offer.id, tasks, self._filters(seconds)) if tasks: logger.info('Accept offer for procs, offer=%s, ' 'procs=%s, filter_time=%s' % ( offer, [int(t.task_id.value) for t in tasks], seconds)) else: logger.info('Retry offer for procs later, offer=%s, ' 'filter_time=%s' % ( offer, seconds)) def _call_finished(self, proc_id, success, message, data, slave_id=None): with self._lock: proc = self.procs_launched.pop(proc_id) if slave_id is not None: if slave_id in self.slave_to_proc: self.slave_to_proc[slave_id].remove(proc_id) else: for slave_id, procs in self.slave_to_proc.iteritems(): if proc_id in procs: procs.remove(proc_id) proc._finished(success, message, data) def statusUpdate(self, driver, update): with self._lock: proc_id = int(update.task_id.value) logger.info('Status update for proc, id=%s, state=%s' % ( proc_id, update.state)) if update.state == mesos_pb2.TASK_RUNNING: if update.slave_id.value in self.slave_to_proc: self.slave_to_proc[update.slave_id.value].add(proc_id) else: self.slave_to_proc[update.slave_id.value] = set([proc_id]) proc = self.procs_launched[proc_id] proc._started() elif update.state >= mesos_pb2.TASK_FINISHED: slave_id = update.slave_id.value success = (update.state == mesos_pb2.TASK_FINISHED) message = update.message data = update.data and pickle.loads(update.data) self._call_finished(proc_id, success, message, data, slave_id) driver.reviveOffers() def offerRescinded(self, driver, offer_id): with self._lock: if self.procs_pending: logger.info('Revive offers for pending procs') driver.reviveOffers() def slaveLost(self, driver, slave_id): with self._lock: for proc_id in self.slave_to_proc.pop(slave_id, []): self._call_finished( proc_id, False, 'Slave lost', None, slave_id) def error(self, driver, message): with self._lock: for proc in self.procs_pending.values(): self._call_finished(proc.id, False, 'Stopped', None) for proc in self.procs_launched.values(): self._call_finished(proc.id, False, 'Stopped', None) self.stop() def start(self): self.driver.start() def stop(self): assert not self.driver.aborted self.driver.stop() def submit(self, proc): if self.driver.aborted: raise RuntimeError('driver already aborted') with self._lock: if proc.id not in self.procs_pending: logger.info('Try submit proc, id=%s', (proc.id,)) self.procs_pending[proc.id] = proc if len(self.procs_pending) == 1: logger.info('Revive offers for pending procs') self.driver.reviveOffers() else: raise ValueError('Proc with same id already submitted') def cancel(self, proc): if self.driver.aborted: raise RuntimeError('driver already aborted') with self._lock: if proc.id in self.procs_pending: del self.procs_pending[proc.id] elif proc.id in self.procs_launched: del self.procs_launched[proc.id] self.driver.killTask(mesos_pb2.TaskID(value=str(proc.id))) for slave_id, procs in self.slave_to_proc.items(): procs.pop(proc.id) if not procs: del self.slave_to_proc[slave_id] def send_data(self, pid, type, data): if self.driver.aborted: raise RuntimeError('driver already aborted') msg = pickle.dumps((pid, type, data)) for slave_id, procs in self.slave_to_proc.iteritems(): if pid in procs: self.driver.sendFrameworkMessage( self.executor.executor_id, mesos_pb2.SlaveID(value=slave_id), msg) return raise RuntimeError('Cannot find slave for pid %s' % (pid,))
class ProcScheduler(Scheduler): def __init__(self): self.framework_id = None self.framework = self._init_framework() self.executor = None self.master = str(CONFIG.get('master', os.environ['MESOS_MASTER'])) self.driver = MesosSchedulerDriver(self, self.framework, self.master) self.procs_pending = {} self.procs_launched = {} self.agent_to_proc = {} self._lock = RLock() def _init_framework(self): framework = dict( user=getpass.getuser(), name=repr(self), hostname=socket.gethostname(), ) return framework def _init_executor(self): executor = dict( executor_id=dict(value='default'), framework_id=self.framework_id, command=dict(value='%s -m %s.executor' % (sys.executable, __package__)), resources=[ dict( name='mem', type='SCALAR', scalar=dict(value=MIN_MEMORY), ), dict(name='cpus', type='SCALAR', scalar=dict(value=MIN_CPUS)), ], ) if 'PYTHONPATH' in os.environ: executor['command.environment'] = dict(variables=[ dict( name='PYTHONPATH', value=os.environ['PYTHONPATH'], ), ]) return executor def _init_task(self, proc, offer): resources = [ dict( name='cpus', type='SCALAR', scalar=dict(value=proc.cpus), ), dict( name='mem', type='SCALAR', scalar=dict(value=proc.mem), ) ] if proc.gpus > 0: resources.append( dict( name='gpus', type='SCALAR', scalar=dict(value=proc.gpus), )) task = dict( task_id=dict(value=str(proc.id)), name=repr(proc), executor=self.executor, agent_id=offer['agent_id'], data=b2a_base64(pickle.dumps(proc.params)).strip(), resources=resources, ) return task def _filters(self, seconds): f = dict(refuse_seconds=seconds) return f def __repr__(self): return "%s[%s]: %s" % (self.__class__.__name__, os.getpid(), ' '.join( sys.argv)) def registered(self, driver, framework_id, master_info): with self._lock: logger.info('Framework registered with id=%s, master=%s' % (framework_id, master_info)) self.framework_id = framework_id self.executor = self._init_executor() def resourceOffers(self, driver, offers): def get_resources(offer): cpus, mem, gpus = 0.0, 0.0, 0 for r in offer['resources']: if r['name'] == 'cpus': cpus = float(r['scalar']['value']) elif r['name'] == 'mem': mem = float(r['scalar']['value']) elif r['name'] == 'gpus': gpus = int(r['scalar']['value']) return cpus, mem, gpus with self._lock: random.shuffle(offers) for offer in offers: if not self.procs_pending: logger.debug('Reject offers forever for no pending procs, ' 'offers=%s' % (offers, )) driver.declineOffer(offer['id'], self._filters(FOREVER)) continue cpus, mem, gpus = get_resources(offer) tasks = [] for proc in list(self.procs_pending.values()): if (cpus >= proc.cpus + MIN_CPUS and mem >= proc.mem + MIN_MEMORY and gpus >= proc.gpus): tasks.append(self._init_task(proc, offer)) del self.procs_pending[proc.id] self.procs_launched[proc.id] = proc cpus -= proc.cpus mem -= proc.mem gpus -= proc.gpus seconds = 5 + random.random() * 5 if tasks: logger.info( 'Accept offer for procs, offer=%s, ' 'procs=%s, filter_time=%s' % (offer, [int(t['task_id']['value']) for t in tasks], seconds)) driver.launchTasks(offer['id'], tasks, self._filters(seconds)) else: logger.info('Retry offer for procs later, offer=%s, ' 'filter_time=%s' % (offer, seconds)) driver.declineOffer(offer['id'], self._filters(seconds)) def _call_finished(self, proc_id, success, message, data, agent_id=None): with self._lock: proc = self.procs_launched.pop(proc_id) if agent_id is not None: if agent_id in self.agent_to_proc: self.agent_to_proc[agent_id].remove(proc_id) else: for agent_id, procs in list(self.agent_to_proc.items()): if proc_id in procs: procs.remove(proc_id) proc._finished(success, message, data) def statusUpdate(self, driver, update): with self._lock: proc_id = int(update['task_id']['value']) logger.info('Status update for proc, id=%s, state=%s' % (proc_id, update['state'])) agent_id = update['agent_id']['value'] if update['state'] == 'TASK_RUNNING': if agent_id in self.agent_to_proc: self.agent_to_proc[agent_id].add(proc_id) else: self.agent_to_proc[agent_id] = set([proc_id]) proc = self.procs_launched[proc_id] proc._started() elif update['state'] not in { 'TASK_STAGING', 'TASK_STARTING', 'TASK_RUNNING' }: success = (update['state'] == 'TASK_FINISHED') message = update.get('message') data = update.get('data') if data: data = pickle.loads(a2b_base64(data)) self._call_finished(proc_id, success, message, data, agent_id) driver.reviveOffers() def offerRescinded(self, driver, offer_id): with self._lock: if self.procs_pending: logger.info('Revive offers for pending procs') driver.reviveOffers() def executorLost(self, driver, executor_id, agent_id, status): agent_id = agent_id['value'] with self._lock: for proc_id in self.agent_to_proc.pop(agent_id, []): self._call_finished(proc_id, False, 'Executor lost', None, agent_id) def slaveLost(self, driver, agent_id): agent_id = agent_id['value'] with self._lock: for proc_id in self.agent_to_proc.pop(agent_id, []): self._call_finished(proc_id, False, 'Agent lost', None, agent_id) def error(self, driver, message): with self._lock: for proc in list(self.procs_pending.values()): self._call_finished(proc.id, False, message, None) for proc in list(self.procs_launched.values()): self._call_finished(proc.id, False, message, None) self.stop() def start(self): self.driver.start() def stop(self): assert not self.driver.aborted self.driver.stop() def submit(self, proc): if self.driver.aborted: raise RuntimeError('driver already aborted') with self._lock: if proc.id not in self.procs_pending: logger.info('Try submit proc, id=%s', (proc.id, )) self.procs_pending[proc.id] = proc if len(self.procs_pending) == 1: logger.info('Revive offers for pending procs') self.driver.reviveOffers() else: raise ValueError('Proc with same id already submitted') def cancel(self, proc): if self.driver.aborted: raise RuntimeError('driver already aborted') with self._lock: if proc.id in self.procs_pending: del self.procs_pending[proc.id] elif proc.id in self.procs_launched: del self.procs_launched[proc.id] self.driver.killTask(dict(value=str(proc.id))) for agent_id, procs in list(self.agent_to_proc.items()): procs.pop(proc.id) if not procs: del self.agent_to_proc[agent_id] def send_data(self, pid, type, data): if self.driver.aborted: raise RuntimeError('driver already aborted') msg = b2a_base64(pickle.dumps((pid, type, data))) for agent_id, procs in list(self.agent_to_proc.items()): if pid in procs: self.driver.sendFrameworkMessage(self.executor['executor_id'], dict(value=agent_id), msg) return raise RuntimeError('Cannot find agent for pid %s' % (pid, ))