def start_driver(self):
    """Create the Mesos scheduler driver for this scheduler and start it.

    The framework is named ``OpenCluster-<options.name>``; when no name was
    configured a microsecond-resolution timestamp is used as the suffix.
    Names longer than 256 characters are cut to their first 256 characters
    followed by ``'...'``.

    Raises:
        Exception: if the current user (as reported by ``getuser()``) is root.
    """
    suffix = (self.options.name
              or datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"))
    framework_name = "%s-%s" % ('OpenCluster', suffix)
    if len(framework_name) > 256:
        framework_name = framework_name[:256] + '...'

    info = mesos_pb2.FrameworkInfo()
    info.user = getuser()
    if info.user == 'root':
        raise Exception("OpenCluster is not allowed to run as 'root'")
    info.name = framework_name
    info.hostname = socket.gethostname()

    # This object is itself the scheduler handed to the driver.
    self.driver = MesosSchedulerDriver(self, info, self.master)
    self.driver.start()
    logger.debug("Mesos Scheudler driver started")

    # Reset the life-cycle flags for the freshly started driver.
    self.shuttingdown = False
    self.last_finish_time = time.time()
    self.stopped = False
def main(master):
    """Run the example framework against ``master`` until the driver stops.

    The driver runs in a worker thread so that the main thread stays free to
    receive SIGINT, which stops the driver.  The process exit status is 0
    when ``driver.run()`` returned ``DRIVER_STOPPED`` and 1 otherwise.

    Args:
        master: address of the Mesos master, passed to ``MesosSchedulerDriver``.
    """
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s %(levelname)s] %(message)s')

    # Create a new executor
    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = 'ExampleExecutor'
    executor.name = executor.executor_id.value
    executor.command.value = os.path.abspath('./executor-skeleton.py')

    # Create a new framework
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''  # the current user
    framework.name = 'ExampleFramework'
    framework.checkpoint = True

    implicitAcknowledgements = 1

    if os.getenv('EXAMPLE_AUTHENTICATE'):
        logging.info('Enabling framework authentication')

        principal = os.getenv('EXAMPLE_PRINCIPAL')
        secret = os.getenv('EXAMPLE_SECRET')
        # os.getenv returns None for unset variables; fail with a clear
        # message instead of an error raised while assigning the fields.
        if not principal or not secret:
            logging.error('EXAMPLE_PRINCIPAL and EXAMPLE_SECRET must be set '
                          'when EXAMPLE_AUTHENTICATE is enabled')
            sys.exit(1)

        credential = mesos_pb2.Credential()
        credential.principal = principal
        credential.secret = secret
        framework.principal = principal

        driver = MesosSchedulerDriver(
            ExampleScheduler(executor),
            framework,
            master,
            implicitAcknowledgements,
            credential
        )
    else:
        framework.principal = framework.name

        driver = MesosSchedulerDriver(
            ExampleScheduler(executor),
            framework,
            master,
            implicitAcknowledgements
        )

    def signal_handler(signum, frame):
        logging.info('Shutting down')
        driver.stop()

    # Filled in by the worker thread.  Defaults to failure so that a thread
    # that dies before recording a result is not reported as success.
    exit_status = [1]

    # driver.run() blocks, so we run it in a separate thread.
    # This way, we can catch a SIGINT to kill the framework.
    def run_driver_thread():
        # sys.exit() here would only end this thread (SystemExit does not
        # leave the thread), so hand the status to the main thread instead.
        exit_status[0] = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()  # Ensure the driver process terminates

    driver_thread = Thread(target=run_driver_thread, args=())
    driver_thread.start()

    logging.info('Scheduler running, Ctrl-C to exit')
    signal.signal(signal.SIGINT, signal_handler)

    # Block the main thread while the driver thread is alive
    while driver_thread.is_alive():
        time.sleep(1)

    logging.info('Framework finished.')
    sys.exit(exit_status[0])
def handle(self, **options):
    '''See :meth:`django.core.management.base.BaseCommand.handle`.

    This method starts the scheduler: it builds the executor and framework
    descriptions, creates the Mesos scheduler driver (with a credential when
    ``MESOS_AUTHENTICATE`` is set), blocks in ``driver.run()`` and finally
    exits the process.

    The exit status is non-zero when the driver did not end in
    ``DRIVER_STOPPED`` or when ``self._shutdown()`` reports a non-zero
    status.  The process also exits with status 1 when authentication is
    enabled but the principal or secret is missing.
    '''

    # Register a listener to handle clean shutdowns
    signal.signal(signal.SIGTERM, self._onsigterm)

    # TODO: clean this up
    mesos_master = options.get('master')

    logger.info(u'Command starting: scale_scheduler')
    logger.info(u' - Master: %s', mesos_master)

    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = 'scale'
    executor.command.value = '%s %s scale_executor' % (settings.PYTHON_EXECUTABLE, settings.MANAGE_FILE)
    executor.name = 'Scale Executor (Python)'

    self.scheduler = ScaleScheduler(executor)

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''  # Have Mesos fill in the current user.
    framework.name = 'Scale Framework (Python)'

    # TODO(vinod): Make checkpointing the default when it is default on the slave.
    if MESOS_CHECKPOINT:
        logger.info('Enabling checkpoint for the framework')
        framework.checkpoint = True

    if MESOS_AUTHENTICATE:
        logger.info('Enabling authentication for the framework')

        if not DEFAULT_PRINCIPLE:
            logger.error('Expecting authentication principal in the environment')
            sys.exit(1)

        if not DEFAULT_SECRET:
            logger.error('Expecting authentication secret in the environment')
            sys.exit(1)

        credential = mesos_pb2.Credential()
        credential.principal = DEFAULT_PRINCIPLE
        credential.secret = DEFAULT_SECRET

        self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
    else:
        self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

    status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1

    # Perform any required clean up operations like stopping background threads.
    # This must run even when the driver failed: `status or self._shutdown()`
    # would short-circuit and skip the clean-up in exactly that case.
    shutdown_status = self._shutdown()
    status = status or shutdown_status

    logger.info(u'Command completed: scale_scheduler')
    sys.exit(status)
def run_scheduler(self, mesos_master):
    """Run the Scale scheduler against ``mesos_master`` and exit the process.

    Creates and initializes the scheduler, describes the framework (name and
    optional web UI address come from the environment), builds the driver
    (with a credential when ``MESOS_AUTHENTICATE`` is set), blocks in
    ``driver.run()``, then always calls ``self._shutdown()``.

    The process exits with 0 only if the driver ended in ``DRIVER_STOPPED``
    and the shutdown reported a falsy status.
    """
    logger.info("I am the leader")
    self.scheduler = ScaleScheduler()
    self.scheduler.initialize()
    scheduler_mgr.hostname = socket.getfqdn()

    info = mesos_pb2.FrameworkInfo()
    info.user = ''  # Have Mesos fill in the current user.
    info.name = os.getenv('DCOS_PACKAGE_FRAMEWORK_NAME', 'Scale')
    webui = os.getenv('SCALE_WEBSERVER_ADDRESS')
    if webui:
        info.webui_url = webui

    logger.info('Connecting to Mesos master at %s', mesos_master)

    # TODO(vinod): Make checkpointing the default when it is default on the slave.
    if MESOS_CHECKPOINT:
        logger.info('Enabling checkpoint for the framework')
        info.checkpoint = True

    # Positional arguments for the driver; the credential is appended only
    # when authentication is enabled.
    driver_args = [self.scheduler, info, mesos_master]
    if MESOS_AUTHENTICATE:
        logger.info('Enabling authentication for the framework')

        required = (
            (DEFAULT_PRINCIPLE,
             'Expecting authentication principal in the environment'),
            (DEFAULT_SECRET,
             'Expecting authentication secret in the environment'),
        )
        for value, complaint in required:
            if not value:
                logger.error(complaint)
                sys.exit(1)

        credential = mesos_pb2.Credential()
        credential.principal = DEFAULT_PRINCIPLE
        credential.secret = DEFAULT_SECRET
        driver_args.append(credential)

    self.driver = MesosSchedulerDriver(*driver_args)

    try:
        if self.driver.run() == mesos_pb2.DRIVER_STOPPED:
            status = 0
        else:
            status = 1
    except:
        # Anything raised while running the driver counts as a failed run.
        status = 1
        logger.exception('Mesos Scheduler Driver returned an exception')

    # Perform a shut down and return any non-zero status
    shutdown_status = self._shutdown()
    if not status:
        status = shutdown_status

    logger.info('Exiting...')
    sys.exit(status)
def __init__(self, scheduler, name, user='',
             master=os.getenv('MESOS_MASTER'),
             implicit_acknowledge=1, *args, **kwargs):
    """Build the Mesos driver for ``scheduler`` and install shutdown hooks.

    Args:
        scheduler: scheduler object; it is wrapped in ``SchedulerProxy``.
        name: framework name passed to ``FrameworkInfo``.
        user: framework user passed to ``FrameworkInfo``.
        master: Mesos master address.  The default is the value of the
            ``MESOS_MASTER`` environment variable at the time this function
            was defined.
        implicit_acknowledge: forwarded to ``MesosSchedulerDriver``.
        *args, **kwargs: forwarded to ``FrameworkInfo``.

    SIGINT, SIGTERM and interpreter exit all call ``self.stop()``.
    """
    framework_info = FrameworkInfo(name=name, user=user, *args, **kwargs)
    proxy = SchedulerProxy(scheduler)
    self.driver = MesosSchedulerDriver(proxy, encode(framework_info),
                                       master, implicit_acknowledge)

    def shutdown(signum, frame):
        self.stop()

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, shutdown)
    atexit.register(self.stop)
def run_scheduler(self, mesos_master):
    """Run the Scale scheduler against ``mesos_master`` and exit the process.

    Describes the framework (name and optional web UI address come from the
    environment), builds the driver (with a credential when
    ``MESOS_AUTHENTICATE`` is set), blocks in ``driver.run()``, then always
    calls ``self._shutdown()``.

    The process exits with 0 only if the driver ended in ``DRIVER_STOPPED``
    and the shutdown reported a falsy status.  It exits with 1 early when
    authentication is enabled but the principal or secret is missing.
    """
    logger.info("I am the leader")
    self.scheduler = ScaleScheduler()

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''  # Have Mesos fill in the current user.
    framework.name = os.getenv('DCOS_PACKAGE_FRAMEWORK_NAME', 'Scale')
    webserver_address = os.getenv('SCALE_WEBSERVER_ADDRESS')

    if webserver_address:
        framework.webui_url = webserver_address

    logger.info('Connecting to Mesos master at %s', mesos_master)

    # TODO(vinod): Make checkpointing the default when it is default on the slave.
    if MESOS_CHECKPOINT:
        logger.info('Enabling checkpoint for the framework')
        framework.checkpoint = True

    if MESOS_AUTHENTICATE:
        logger.info('Enabling authentication for the framework')

        if not DEFAULT_PRINCIPLE:
            logger.error('Expecting authentication principal in the environment')
            sys.exit(1)

        if not DEFAULT_SECRET:
            logger.error('Expecting authentication secret in the environment')
            sys.exit(1)

        credential = mesos_pb2.Credential()
        credential.principal = DEFAULT_PRINCIPLE
        credential.secret = DEFAULT_SECRET

        self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
    else:
        self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

    try:
        status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
    except:
        # Deliberately broad: any failure while running the driver must
        # still fall through to the shutdown below.
        status = 1
        logger.exception('Mesos Scheduler Driver returned an exception')

    # Perform a shut down and return any non-zero status.
    # The method has to be *called*: assigning the bound method itself
    # skipped the shutdown and handed a method object to sys.exit().
    shutdown_status = self._shutdown()
    status = status or shutdown_status

    logger.info('Exiting...')
    sys.exit(status)
def create_driver(framework_name, scheduler, system_paasta_config, implicit_acks=False):
    """Build (but do not start) an authenticated Mesos scheduler driver.

    Args:
        framework_name: name given to the framework; also the key used by
            ``find_existing_id_if_exists_or_gen_new`` to pick the framework id.
        scheduler: scheduler object handed to the driver.
        system_paasta_config: object whose ``get_paasta_native_config()``
            returns a mapping with ``'principal'`` and ``'secret'`` entries.
        implicit_acks: forwarded to ``MesosSchedulerDriver``.

    Returns:
        The constructed ``MesosSchedulerDriver``.
    """
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = framework_name
    framework.failover_timeout = 604800  # 7 days, in seconds
    framework.id.value = find_existing_id_if_exists_or_gen_new(framework.name)
    framework.checkpoint = True

    # Read the native config once so principal and secret are guaranteed to
    # come from the same snapshot (it used to be fetched three times).
    native_config = system_paasta_config.get_paasta_native_config()
    principal = native_config['principal']

    credential = mesos_pb2.Credential()
    credential.principal = principal
    credential.secret = native_config['secret']

    framework.principal = principal

    master = '%s:%d' % (mesos_tools.get_mesos_leader(), mesos_tools.MESOS_MASTER_PORT)
    driver = MesosSchedulerDriver(
        scheduler,
        framework,
        master,
        implicit_acks,
        credential
    )
    return driver
def __init__(self, scheduler, name, user='',
             master=os.getenv('MESOS_MASTER'),
             implicit_acknowledge=1, *args, **kwargs):
    """Build the Mesos driver for ``scheduler`` and install shutdown hooks.

    Args:
        scheduler: scheduler object; it is wrapped in ``SchedulerProxy``.
        name: framework name passed to ``FrameworkInfo``.
        user: framework user passed to ``FrameworkInfo``.
        master: Mesos master address.  The default is the value of the
            ``MESOS_MASTER`` environment variable at the time this function
            was defined.
        implicit_acknowledge: forwarded to ``MesosSchedulerDriver``.
        *args, **kwargs: forwarded to ``FrameworkInfo``.

    SIGINT and SIGTERM stop the current ``self.driver``; interpreter exit
    stops the driver created here.
    """
    wrapped = SchedulerProxy(scheduler)
    description = FrameworkInfo(name=name, user=user, *args, **kwargs)
    self.driver = MesosSchedulerDriver(wrapped, encode(description),
                                       master, implicit_acknowledge)

    def shutdown(signum, frame):
        self.driver.stop()

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, shutdown)
    atexit.register(self.driver.stop)
class Running(object):
    """Wrapper around a ``MesosSchedulerDriver`` usable as a context manager.

    Entering the context starts the driver; leaving it stops and joins the
    driver.  SIGINT, SIGTERM and interpreter exit all call :meth:`stop`.
    """

    def __init__(self, scheduler, name, user='',
                 master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1, *args, **kwargs):
        """Create the driver and install the shutdown hooks.

        Args:
            scheduler: scheduler object; it is wrapped in ``SchedulerProxy``.
            name: framework name passed to ``FrameworkInfo``.
            user: framework user passed to ``FrameworkInfo``.
            master: Mesos master address.  The default is the value of the
                ``MESOS_MASTER`` environment variable when the class was
                defined.
            implicit_acknowledge: forwarded to ``MesosSchedulerDriver``.
            *args, **kwargs: forwarded to ``FrameworkInfo``.
        """
        scheduler = SchedulerProxy(scheduler)
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signum, frame):
            self.stop()

        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.stop)

    def run(self):
        """Delegate to ``driver.run()`` and return its result."""
        return self.driver.run()

    def start(self):
        """Start the driver and return its status.

        Asserts that the returned status is ``DRIVER_RUNNING``.
        """
        status = self.driver.start()
        assert status == mesos_pb2.DRIVER_RUNNING
        return status

    def stop(self):
        """Delegate to ``driver.stop()`` and return its result."""
        return self.driver.stop()

    def join(self):
        """Delegate to ``driver.join()`` and return its result."""
        return self.driver.join()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop and join the driver; never suppress an in-flight exception."""
        self.stop()
        self.join()
        # Returning None (falsy) makes the `with` statement re-raise any
        # in-flight exception with its original traceback.  The explicit
        # three-argument `raise` used before was Python 2-only syntax.
class Running(object):
    """Wrapper around a ``MesosSchedulerDriver`` usable as a context manager.

    Entering the context starts the driver; leaving it stops and joins the
    driver.  SIGINT, SIGTERM and interpreter exit all call :meth:`stop`.
    """

    def __init__(self, scheduler, name, user='',
                 master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1, *args, **kwargs):
        """Create the driver and install the shutdown hooks.

        Args:
            scheduler: scheduler object; it is wrapped in ``SchedulerProxy``.
            name: framework name passed to ``FrameworkInfo``.
            user: framework user passed to ``FrameworkInfo``.
            master: Mesos master address.  The default is the value of the
                ``MESOS_MASTER`` environment variable when the class was
                defined.
            implicit_acknowledge: forwarded to ``MesosSchedulerDriver``.
            *args, **kwargs: forwarded to ``FrameworkInfo``.
        """
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        scheduler = SchedulerProxy(scheduler)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signum, frame):
            self.stop()

        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.stop)

    def run(self):
        """Delegate to ``driver.run()`` and return its result."""
        return self.driver.run()

    def start(self):
        """Start the driver and return its status.

        Asserts that the returned status is ``DRIVER_RUNNING``.
        """
        status = self.driver.start()
        assert status == mesos_pb2.DRIVER_RUNNING
        return status

    def stop(self):
        """Delegate to ``driver.stop()`` and return its result."""
        return self.driver.stop()

    def join(self):
        """Delegate to ``driver.join()`` and return its result."""
        return self.driver.join()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop and join the driver; never suppress an in-flight exception."""
        self.stop()
        self.join()
        # A falsy return value is enough for the `with` statement to
        # propagate the pending exception unchanged, so no explicit re-raise
        # is needed (the old three-argument `raise` does not parse on
        # Python 3).
class Running(object):
    """Wrapper around a ``MesosSchedulerDriver`` usable as a context manager.

    Entering the context starts the driver; leaving it calls :meth:`stop`,
    which stops the driver, waits for it and raises if it did not end in
    ``DRIVER_STOPPED``.
    """

    def __init__(self, scheduler, name, user='',
                 master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1, *args, **kwargs):
        """Create the driver and install the shutdown hooks.

        Args:
            scheduler: scheduler object; it is wrapped in ``SchedulerProxy``.
            name: framework name passed to ``FrameworkInfo``.
            user: framework user passed to ``FrameworkInfo``.
            master: Mesos master address.  The default is the value of the
                ``MESOS_MASTER`` environment variable when the class was
                defined.
            implicit_acknowledge: forwarded to ``MesosSchedulerDriver``.
            *args, **kwargs: forwarded to ``FrameworkInfo``.

        SIGINT, SIGTERM and interpreter exit only call ``driver.stop()``;
        they do not go through :meth:`stop` and therefore never join.
        """
        scheduler = SchedulerProxy(scheduler)
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signum, frame):
            self.driver.stop()

        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.driver.stop)

    def run(self):
        """Delegate to ``driver.run()`` and return its result."""
        return self.driver.run()

    def start(self):
        """Start the driver and return its status.

        Asserts that the returned status is ``DRIVER_RUNNING``.
        """
        status = self.driver.start()
        assert status == mesos_pb2.DRIVER_RUNNING
        return status

    def stop(self):
        """Stop the driver and wait for it to finish.

        Raises:
            RuntimeError: if ``driver.join()`` returns anything other than
                ``DRIVER_STOPPED``.
        """
        logging.info("Stopping Mesos driver")
        self.driver.stop()
        logging.info("Joining Mesos driver")
        result = self.driver.join()
        logging.info("Joined Mesos driver")
        if result != mesos_pb2.DRIVER_STOPPED:
            # Interpolate here: exceptions do not apply %-formatting to
            # their arguments the way logging calls do.
            raise RuntimeError("Mesos driver failed with %i" % result)

    def join(self):
        """Delegate to ``driver.join()`` and return its result."""
        return self.driver.join()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        self.stop()
def main(master):
    """Run the minimal framework against ``master`` until the driver stops.

    The driver runs in a worker thread so that the main thread stays free to
    receive SIGINT, which stops the driver.  The process exit status is 0
    when ``driver.run()`` returned ``DRIVER_STOPPED`` and 1 otherwise.

    Args:
        master: address of the Mesos master, passed to ``MesosSchedulerDriver``.
    """
    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = 'MinimalExecutor'
    executor.name = executor.executor_id.value
    executor.command.value = os.path.abspath('./executor-minimal.py')

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''  # the current user
    framework.name = 'MinimalFramework'
    framework.checkpoint = True
    framework.principal = framework.name

    implicitAcknowledgements = 1

    driver = MesosSchedulerDriver(
        MinimalScheduler(executor),
        framework,
        master,
        implicitAcknowledgements
    )

    def signal_handler(signum, frame):
        driver.stop()

    # Filled in by the worker thread.  Defaults to failure so that a thread
    # that dies before recording a result is not reported as success.
    exit_status = [1]

    def run_driver_thread():
        # sys.exit() here would only end this thread (SystemExit does not
        # leave the thread), so hand the status to the main thread instead.
        exit_status[0] = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()

    driver_thread = Thread(target=run_driver_thread, args=())
    driver_thread.start()

    print('Scheduler running, Ctrl-C to quit.')
    signal.signal(signal.SIGINT, signal_handler)

    # Sleep in short steps so the main thread can handle SIGINT promptly.
    while driver_thread.is_alive():
        time.sleep(1)

    sys.exit(exit_status[0])
def start_driver(self):
    """Register this scheduler with Mesos by creating and starting a driver.

    The framework name is ``OpenCluster-`` followed by ``options.name`` or,
    when that is empty, by a timestamp with microseconds.  A name longer
    than 256 characters is replaced by its first 256 characters plus
    ``'...'``.

    Raises:
        Exception: if ``getuser()`` reports ``root``.
    """
    if self.options.name:
        tag = self.options.name
    else:
        tag = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    label = "%s-%s" % ('OpenCluster', tag)
    too_long = len(label) > 256
    if too_long:
        label = label[:256] + '...'

    fw = mesos_pb2.FrameworkInfo()
    fw.user = getuser()
    if fw.user == 'root':
        raise Exception("OpenCluster is not allowed to run as 'root'")
    fw.name = label
    fw.hostname = socket.gethostname()

    # The scheduler instance itself receives the driver callbacks.
    driver = MesosSchedulerDriver(self, fw, self.master)
    self.driver = driver
    driver.start()
    logger.debug("Mesos Scheudler driver started")

    self.shuttingdown = False
    self.last_finish_time = time.time()
    self.stopped = False
class MesosScheduler(Scheduler):
    """Scheduler that launches OpenCluster tasks through a Mesos driver.

    The instance is handed to ``MesosSchedulerDriver`` as the scheduler
    object, so the Mesos callbacks (``registered``, ``resourceOffers``,
    ``statusUpdate``, ...) are methods of this class.

    Bookkeeping:
        task_waiting:  list of tasks not yet handed to Mesos.
        task_launched: dict of task id -> task handed to Mesos.
        slaveTasks:    dict of slave id -> set of task ids launched there.

    NOTE(review): ``@safe`` is defined elsewhere; ``submitTasks`` releases
    and re-acquires ``self.lock`` while waiting, so it presumably runs the
    decorated method with ``self.lock`` held - confirm.
    """

    def __init__(self, manager, master, options):
        """Store the per-task resource requirements and reset all state.

        Args:
            manager: forwarded to ``Scheduler.__init__``.
            master: Mesos master address used when the driver is created.
            options: object providing ``cpus``, ``mem``, ``gpus``,
                ``parallel``, ``group`` (and ``name`` / ``retry``, read by
                other methods).
        """
        Scheduler.__init__(self, manager)
        self.master = master
        self.cpus = options.cpus
        self.mem = parse_mem(options.mem)
        self.gpus = options.gpus
        self.task_per_node = options.parallel or multiprocessing.cpu_count()
        self.options = options
        self.group = options.group
        self.last_finish_time = 0
        self.executor = None
        self.driver = None
        self.lock = threading.RLock()
        self.task_waiting = []
        self.task_launched = {}
        self.slaveTasks = {}
        self.starting = False

    def start_driver(self):
        """Create the Mesos scheduler driver for this scheduler and start it.

        The framework is named ``OpenCluster-<options.name>``, or
        ``OpenCluster-<timestamp>`` when no name is configured; names longer
        than 256 characters are cut to 256 characters plus ``'...'``.

        Raises:
            Exception: if the current user is root.
        """
        name = 'OpenCluster'
        if self.options.name:
            name = "%s-%s" % (name, self.options.name)
        else:
            name = "%s-%s" % (
                name, datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"))

        if len(name) > 256:
            name = name[:256] + '...'

        framework = mesos_pb2.FrameworkInfo()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception("OpenCluster is not allowed to run as 'root'")
        framework.name = name
        framework.hostname = socket.gethostname()

        self.driver = MesosSchedulerDriver(self, framework, self.master)
        self.driver.start()
        logger.debug("Mesos Scheudler driver started")

        self.shuttingdown = False
        self.last_finish_time = time.time()
        self.stopped = False
        # NOTE: an idle watchdog (shut down after MAX_IDLE_TIME without
        # waiting tasks) used to be spawned here; it is currently disabled.

    @safe
    def registered(self, driver, frameworkId, masterInfo):
        """Mesos callback: mark the scheduler started and build the executor."""
        self.started = True
        logger.debug("connect to master %s:%s(%s), registered as %s",
                     int2ip(masterInfo.ip), masterInfo.port, masterInfo.id,
                     frameworkId.value)
        self.executor = self.getExecutorInfo(str(frameworkId.value))

    @safe
    def reregistered(self, driver, masterInfo):
        """Mesos callback: only logs the reconnect."""
        logger.warning("re-connect to mesos master %s:%s(%s)",
                       int2ip(masterInfo.ip), masterInfo.port,
                       masterInfo.id)

    @safe
    def disconnected(self, driver):
        """Mesos callback: only logs the disconnect."""
        logger.debug("framework is disconnected")

    @safe
    def getExecutorInfo(self, framework_id):
        """Build the ``ExecutorInfo`` shared by every task of this framework.

        The executor command runs ``simpleexecutor.py`` (located next to
        this module) with the current interpreter.  The executor data is a
        marshalled tuple of (script path, cwd, ``sys.path``, environment,
        tasks per node, ``env.environ``).
        """
        execInfo = mesos_pb2.ExecutorInfo()
        execInfo.executor_id.value = "multiframework"
        execInfo.command.value = '%s %s' % (
            sys.executable,  # /usr/bin/python.exe or .../python
            os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'simpleexecutor.py')))

        # Expose the submitting user's uid/gid to the executor process.
        v = execInfo.command.environment.variables.add()
        v.name = 'UID'
        v.value = str(os.getuid())
        v = execInfo.command.environment.variables.add()
        v.name = 'GID'
        v.value = str(os.getgid())

        # Not every protobuf version has these fields.
        if hasattr(execInfo, 'framework_id'):
            execInfo.framework_id.value = str(framework_id)

        script = os.path.realpath(sys.argv[0])
        if hasattr(execInfo, 'name'):
            execInfo.name = script

        execInfo.data = marshal.dumps(
            (script, os.getcwd(), sys.path, dict(os.environ),
             self.task_per_node, env.environ))
        return execInfo

    @safe
    def clearCache(self):
        """Forget every launched task and every slave-to-task mapping."""
        self.task_launched.clear()
        self.slaveTasks.clear()

    @safe
    def submitTasks(self, tasks):
        """Queue ``tasks`` and make sure the driver is running.

        Does nothing for an empty ``tasks``.  Otherwise waits for the
        completion-event queue to drain, clears the launch bookkeeping,
        appends the tasks to ``task_waiting``, starts the driver on first
        use, waits until the framework is registered and asks Mesos for
        offers.
        """
        if not tasks:
            return

        # Blocks until all items in the events queue have been gotten and
        # processed.
        self.completionEvents.join()
        self.clearCache()
        self.task_waiting.extend(tasks)
        self.taskNum = self.taskNum + len(tasks)
        logger.debug("Got job with %d tasks", len(tasks))

        if not self.started and not self.starting:
            self.starting = True
            self.start_driver()

        # Give up the lock while polling so the `registered` callback can
        # run and set `started`.
        while not self.started:
            self.lock.release()
            time.sleep(0.01)
            self.lock.acquire()

        self.requestMoreResources()
        self.manager.statusUpdate()

    def requestMoreResources(self):
        """Ask Mesos to resend offers; no-op until the scheduler is started."""
        if self.started:
            self.driver.reviveOffers()

    @safe
    def resourceOffers(self, driver, offers):
        """Mesos callback: launch waiting tasks on the offered resources.

        With nothing waiting, every offer is returned with a 5 second
        refuse filter.  While shutting down, offers are declined.  Offers
        whose ``group`` attribute is not in ``options.group`` (when set)
        are returned unused.  For every other offer, tasks are taken from
        ``task_waiting`` while the offer still has ``self.cpus`` CPUs and
        ``self.mem`` memory left (and, when GPUs are requested, the offer
        has a ``gpus`` attribute).
        """
        rf = mesos_pb2.Filters()
        if not self.task_waiting:
            rf.refuse_seconds = 5
            for o in offers:
                driver.launchTasks(o.id, [], rf)
            return

        random.shuffle(offers)
        self.last_offer_time = time.time()
        for offer in offers:
            if self.shuttingdown:
                # Parenthesised form works as a statement on Python 2 and as
                # a function call on Python 3, with identical output.
                print("Shutting down: declining offer on [%s]" % offer.hostname)
                driver.declineOffer(offer.id)
                continue

            attrs = self.getAttributes(offer)
            if self.options.group and attrs.get(
                    'group', 'None') not in self.options.group:
                driver.launchTasks(offer.id, [], rf)
                continue

            cpus, mem, gpus = self.getResources(offer)
            logger.debug(
                "got resource offer %s: cpus:%s, mem:%s, gpus:%s at %s",
                offer.id.value, cpus, mem, gpus, offer.hostname)
            logger.debug("attributes,gpus:%s", attrs.get('gpus', None))

            sid = offer.slave_id.value
            tasks = []
            while (len(self.task_waiting) > 0 and cpus >= self.cpus and
                   mem >= self.mem and
                   (self.gpus == 0 or attrs.get('gpus', None) is not None)):
                logger.debug("Accepting resource on slave %s (%s)",
                             offer.slave_id.value, offer.hostname)
                t = self.task_waiting.pop()
                t.state = mesos_pb2.TASK_STARTING
                t.state_time = time.time()

                task = self.create_task(offer, t, cpus)
                tasks.append(task)

                self.task_launched[t.id] = t
                self.slaveTasks.setdefault(sid, set()).add(t.id)

                cpus -= self.cpus
                mem -= self.mem
                # gpus -= self.gpus

            operation = mesos_pb2.Offer.Operation()
            operation.type = mesos_pb2.Offer.Operation.LAUNCH
            operation.launch.task_infos.extend(tasks)
            driver.acceptOffers([offer.id], [operation])

    @safe
    def offerRescinded(self, driver, offer_id):
        """Mesos callback: request new offers if tasks are still waiting."""
        logger.debug("rescinded offer: %s", offer_id)
        if self.task_waiting:
            self.requestMoreResources()

    def getResources(self, offer):
        """Return ``(cpus, mem, gpus)`` offered, each as float (0 if absent)."""
        cpus, mem, gpus = 0, 0, 0
        for r in offer.resources:
            if r.name == 'gpus':
                gpus = float(r.scalar.value)
            elif r.name == 'cpus':
                cpus = float(r.scalar.value)
            elif r.name == 'mem':
                mem = float(r.scalar.value)
        return cpus, mem, gpus

    def getResource(self, res, name):
        """Return the scalar value of the first resource called ``name``, else 0."""
        for r in res:
            if r.name == name:
                return r.scalar.value
        return 0

    def getAttribute(self, attrs, name):
        """Return the scalar value of the first attribute called ``name``, else None."""
        for r in attrs:
            if r.name == name:
                return r.scalar.value

    def getAttributes(self, offer):
        """Return the offer's attributes as a ``{name: scalar value}`` dict."""
        attrs = {}
        for a in offer.attributes:
            attrs[a.name] = a.scalar.value
        return attrs

    def create_task(self, offer, t, cpus):
        """Build the ``TaskInfo`` that runs task ``t`` on ``offer``'s slave.

        Args:
            offer: the offer the task will be launched on.
            t: the task; ``(t, t.tried)`` is pickled and compressed into the
                task data.
            cpus: CPUs still available in the offer; the task asks for
                ``min(self.cpus, cpus)``.
        """
        task = mesos_pb2.TaskInfo()
        task.task_id.value = t.id
        task.slave_id.value = offer.slave_id.value
        task.name = "task(%s/%d)" % (t.id, self.taskNum)
        task.executor.MergeFrom(self.executor)

        task.data = compress(cPickle.dumps((t, t.tried), -1))

        cpu = task.resources.add()
        cpu.name = "cpus"
        cpu.type = 0  # mesos_pb2.Value.SCALAR
        cpu.scalar.value = min(self.cpus, cpus)

        mem = task.resources.add()
        mem.name = "mem"
        mem.type = 0  # mesos_pb2.Value.SCALAR
        mem.scalar.value = self.mem

        # GPUs are matched through offer attributes only; no "gpus" resource
        # is requested for the task.
        return task

    @safe
    def statusUpdate(self, driver, update):
        """Mesos callback: record a task's new state and react to it.

        Lost, failed and killed tasks are re-queued until ``options.retry``
        attempts are used up, after which the task is reported through
        ``taskEnded`` as a failure.  ``TASK_ERROR`` reports the failure,
        aborts the driver and shuts the scheduler down.
        """
        logger.debug("Task %s in state [%s]", update.task_id.value,
                     mesos_pb2.TaskState.Name(update.state))
        tid = str(update.task_id.value)

        if tid not in self.task_launched:
            # check failed after launched
            for t in self.task_waiting:
                if t.id == tid:
                    self.task_launched[tid] = t
                    self.task_waiting.remove(t)
                    break
            else:
                logger.debug("Task %s is finished, ignore it", tid)
                return

        t = self.task_launched[tid]
        t.state = update.state
        t.state_time = time.time()
        self.last_finish_time = t.state_time

        if update.state == mesos_pb2.TASK_RUNNING:
            self.started = True
            # to do task timeout handler
        elif update.state == mesos_pb2.TASK_LOST:
            self.task_launched.pop(tid)

            if t.tried < self.options.retry:
                t.tried += 1
                # Two placeholders, two arguments: the extra `update.state`
                # argument made the logging call fail to format.
                logger.warning("task %s lost, retry %s", t.id, t.tried)
                self.task_waiting.append(t)  # try again
            else:
                self.taskEnded(
                    t, OtherFailure("task lost,exception:" + str(update.data)),
                    "task lost")
        elif update.state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_FAILED,
                              mesos_pb2.TASK_ERROR, mesos_pb2.TASK_KILLED):
            self.task_launched.pop(tid)

            # Find (and forget) the slave the task was running on.
            slave = None
            for s in self.slaveTasks:
                if tid in self.slaveTasks[s]:
                    slave = s
                    self.slaveTasks[s].remove(tid)
                    break

            if update.state == mesos_pb2.TASK_FINISHED:
                self.taskEnded(t, Success(), update.data)

            if update.state == mesos_pb2.TASK_ERROR:
                logger.error(update.message)
                self.taskEnded(t, OtherFailure(update.message), update.message)
                driver.abort()
                self.shutdown()

            # TASK_LOST cannot reach this branch; it is handled above.
            if update.state in (mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED):
                if t.tried < self.options.retry:
                    t.tried += 1
                    logger.warning("task %s failed with %s, retry %s", t.id,
                                   update.state, t.tried)
                    self.task_waiting.append(t)  # try again
                else:
                    self.taskEnded(
                        t, OtherFailure("exception:" + str(update.data)), None)
                    logger.error("task %s failed on %s", t.id, slave)

        # NOTE(review): this revives offers when *nothing* is waiting, which
        # looks inverted - confirm the intended condition before changing it.
        if not self.task_waiting:
            self.requestMoreResources()  # request more offers again

    @safe
    def check(self, driver):
        """Re-queue tasks stuck in ``TASK_STARTING`` for more than 30 seconds."""
        now = time.time()
        # Iterate over a snapshot: entries are popped inside the loop.
        for tid, t in list(self.task_launched.items()):
            if t.state == mesos_pb2.TASK_STARTING and t.state_time + 30 < now:
                logger.warning("task %s lauched failed, assign again", tid)
                # The waiting list is about to become non-empty again, so
                # ask for offers if it was empty.
                if not self.task_waiting:
                    self.requestMoreResources()
                t.tried += 1
                t.state = -1
                self.task_launched.pop(tid)
                self.task_waiting.append(t)
            # TODO: check run time

    @safe
    def shutdown(self):
        """Wait for launched tasks (up to ``SHUTDOWN_TIMEOUT``), then stop the driver.

        Does nothing when the scheduler was never started.
        """
        if not self.started:
            return

        wait_started = datetime.datetime.now()
        while (len(self.task_launched) > 0) and \
                (SHUTDOWN_TIMEOUT > (datetime.datetime.now() - wait_started).seconds):
            time.sleep(1)

        logger.debug("total:%d, task finished: %d,task failed: %d",
                     self.taskNum, self.finished_count, self.fail_count)

        self.shuttingdown = True
        # The driver is not joined before stop(): join() blocks until the
        # driver has been stopped or aborted.
        self.driver.stop(False)
        logger.debug("scheduler stop!!!")
        self.stopped = True
        self.started = False

    @safe
    def error(self, driver, code):
        """Mesos callback: only logs the error."""
        logger.warning("Mesos error message: %s", code)

    def defaultParallelism(self):
        """Return the default parallelism, fixed at 16."""
        return 16

    def frameworkMessage(self, driver, executor, slave, data):
        """Mesos callback: log a message sent by an executor."""
        logger.warning("[slave %s] %s", slave.value, data)

    def executorLost(self, driver, executorId, slaveId, status):
        """Mesos callback: forget the tasks recorded for the executor's slave."""
        logger.warning("executor at %s %s lost: %s", slaveId.value,
                       executorId.value, status)
        self.slaveTasks.pop(slaveId.value, None)

    def slaveLost(self, driver, slaveId):
        """Mesos callback: forget the tasks recorded for the lost slave."""
        logger.warning("slave %s lost", slaveId.value)
        self.slaveTasks.pop(slaveId.value, None)

    def killTask(self, job_id, task_id, tried):
        """Ask the driver to kill the task with id ``job_id:task_id:tried``."""
        tid = mesos_pb2.TaskID()
        tid.value = "%s:%s:%s" % (job_id, task_id, tried)
        self.driver.killTask(tid)
logging.info("task {task} status is {status}".format( task=update.task_id.value, status=mesos_pb2.TaskState.Name(update.state))) if update.state == mesos_pb2.TASK_FINISHED: self.finished_task += 1 if self.finished_task == len(self.numbers): logging.info("all task has finished") driver.stop() if __name__ == '__main__': # make us a framework framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "hello-world" executor = mesos_pb2.ExecutorInfo() executor.executor_id.value = "default" executor.command.value = "python {}".format( os.path.abspath("./Executor.py")) executor.name = "Test Executor(Python)" executor.source = "python test" driver = MesosSchedulerDriver( MyScheduler(executor), framework, "192.168.12.179:5050" # assumes running on the master ) driver.run()
hello_executor.name = "Hello" hello_executor.command.value = "python hello_executor.py" uri_proto = hello_executor.command.uris.add() uri_proto.value = "http://kit-mesos-master:9000/hello_executor.py" uri_proto.extract = False goodbye_executor = mesos_pb2.ExecutorInfo() goodbye_executor.executor_id.value = "goodbye-executor" goodbye_executor.name = "GoodBye" goodbye_executor.command.value = "python goodbye_executor.py" uri_proto = goodbye_executor.command.uris.add() uri_proto.value = "http://kit-mesos-master:9000/goodbye_executor.py" uri_proto.extract = False framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "hello-world" helloWorldScheduler = HelloWorldScheduler(hello_executor, goodbye_executor) driver = MesosSchedulerDriver(helloWorldScheduler, framework, "kit-mesos-master:5050") driver.start() logging.info("Listening for Ctrl-C") signal.signal(signal.SIGINT, graceful_shutdown) while True: time.sleep(5) sys.exit(0)
class MesosScheduler(Scheduler):
    """Scheduler that launches waiting tasks on Mesos resource offers.

    The instance is itself the callback object handed to
    ``MesosSchedulerDriver``.  Bookkeeping lives in three containers:

    * ``task_waiting``  -- list of tasks not yet handed to Mesos
    * ``task_launched`` -- dict ``task id -> task`` for tasks given to Mesos
    * ``slaveTasks``    -- dict ``slave id -> set(task id)``

    NOTE(review): most callbacks are wrapped in ``@safe``; ``submitTasks``
    releases/re-acquires ``self.lock`` while waiting, so ``@safe`` presumably
    runs the method while holding ``self.lock`` -- confirm against its
    definition.
    """

    def __init__(self, manager, master, options):
        """Store the Mesos master address and per-task resource requirements.

        :param manager: passed through to ``Scheduler.__init__``.
        :param master: master address given to ``MesosSchedulerDriver``.
        :param options: option object; ``cpus``, ``mem``, ``gpus``,
            ``parallel``, ``group``, ``name`` and ``retry`` are read from it
            by this class.
        """
        Scheduler.__init__(self, manager)
        self.master = master
        self.cpus = options.cpus
        self.mem = parse_mem(options.mem)
        self.gpus = options.gpus
        self.task_per_node = options.parallel or multiprocessing.cpu_count()
        self.options = options
        self.group = options.group
        self.last_finish_time = 0
        self.executor = None
        self.driver = None
        self.lock = threading.RLock()
        self.task_waiting = []
        self.task_launched = {}
        self.slaveTasks = {}
        self.starting = False

    def start_driver(self):
        """Build the ``FrameworkInfo`` and start a new scheduler driver.

        :raises Exception: when the current user is ``root``.
        """
        name = 'OpenCluster'
        if self.options.name:
            name = "%s-%s" % (name, self.options.name)
        else:
            # No explicit name: make it unique with a timestamp.
            name = "%s-%s" % (name, datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"))
        if len(name) > 256:
            name = name[:256] + '...'

        framework = mesos_pb2.FrameworkInfo()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception("OpenCluster is not allowed to run as 'root'")
        framework.name = name
        framework.hostname = socket.gethostname()

        self.driver = MesosSchedulerDriver(self, framework, self.master)
        self.driver.start()
        logger.debug("Mesos Scheudler driver started")

        self.shuttingdown = False
        self.last_finish_time = time.time()
        self.stopped = False
        #
        # def check():
        #     while self.started:
        #         now = time.time()
        #         if not self.task_waiting and now - self.last_finish_time > MAX_IDLE_TIME:
        #             logger.info("stop mesos scheduler after %d seconds idle", now - self.last_finish_time)
        #             self.shutdown()
        #             break
        #         time.sleep(1)
        #
        #         if len(self.task_success()) + len(self.task_failed) == self.taskNum:
        #             self.shutdown()
        # spawn(check)

    @safe
    def registered(self, driver, frameworkId, masterInfo):
        """Driver callback: mark the scheduler started and build the executor."""
        self.started = True
        logger.debug("connect to master %s:%s(%s), registered as %s",
                     int2ip(masterInfo.ip), masterInfo.port, masterInfo.id,
                     frameworkId.value)
        self.executor = self.getExecutorInfo(str(frameworkId.value))

    @safe
    def reregistered(self, driver, masterInfo):
        """Driver callback: log that the master connection was re-established."""
        logger.warning("re-connect to mesos master %s:%s(%s)",
                       int2ip(masterInfo.ip), masterInfo.port, masterInfo.id)

    @safe
    def disconnected(self, driver):
        """Driver callback: log the disconnection."""
        logger.debug("framework is disconnected")

    @safe
    def getExecutorInfo(self, framework_id):
        """Return the ``ExecutorInfo`` that runs ``simpleexecutor.py``.

        The executor command is the current interpreter plus the
        ``simpleexecutor.py`` located next to this module.  The current
        uid/gid are exported as ``UID``/``GID`` environment variables and the
        marshalled launch context is stored in ``data``.
        """
        execInfo = mesos_pb2.ExecutorInfo()
        execInfo.executor_id.value = "multiframework"
        execInfo.command.value = '%s %s' % (
            sys.executable,  # /usr/bin/python.exe or .../python
            os.path.abspath(os.path.join(os.path.dirname(__file__), 'simpleexecutor.py'))
        )

        v = execInfo.command.environment.variables.add()
        v.name = 'UID'
        v.value = str(os.getuid())
        v = execInfo.command.environment.variables.add()
        v.name = 'GID'
        v.value = str(os.getgid())

        # Guarded with hasattr: the fields are only set when the message has them.
        if hasattr(execInfo, 'framework_id'):
            execInfo.framework_id.value = str(framework_id)
        Script = os.path.realpath(sys.argv[0])
        if hasattr(execInfo, 'name'):
            execInfo.name = Script

        execInfo.data = marshal.dumps((Script, os.getcwd(), sys.path, dict(os.environ),
                                       self.task_per_node, env.environ))
        return execInfo

    @safe
    def clearCache(self):
        """Forget all launched-task and per-slave bookkeeping."""
        self.task_launched.clear()
        self.slaveTasks.clear()

    @safe
    def submitTasks(self, tasks):
        """Queue ``tasks``, starting the driver first if it is not running.

        Returns immediately for an empty ``tasks``.  Otherwise blocks until
        the driver has registered (``self.started`` is true).
        """
        if not tasks:
            return
        self.completionEvents.join()  # Blocks until all items in the events queue have been gotten and processed.
        self.clearCache()
        self.task_waiting.extend(tasks)
        self.taskNum = self.taskNum + len(tasks)
        logger.debug("Got job with %d tasks", len(tasks))

        if not self.started and not self.starting:
            self.starting = True
            self.start_driver()
        while not self.started:
            # Drop the lock while sleeping so driver callbacks (which set
            # ``started``) are able to run.
            self.lock.release()
            time.sleep(0.01)
            self.lock.acquire()

        self.requestMoreResources()
        self.manager.statusUpdate()

    def requestMoreResources(self):
        """Ask Mesos to re-send offers, but only once the driver has started."""
        if self.started:
            self.driver.reviveOffers()

    @safe
    def resourceOffers(self, driver, offers):
        """Driver callback: place waiting tasks on the received offers.

        With nothing waiting, every offer is returned with a 5 second refuse
        filter.  Otherwise each offer is filled with as many waiting tasks as
        its cpus/mem allow and accepted with a single LAUNCH operation.
        """
        rf = mesos_pb2.Filters()
        if not self.task_waiting:
            rf.refuse_seconds = 5
            for o in offers:
                driver.launchTasks(o.id, [], rf)
            return

        random.shuffle(offers)
        self.last_offer_time = time.time()
        for offer in offers:
            if self.shuttingdown:
                # Single-argument call: same output as the former print statement.
                print("Shutting down: declining offer on [%s]" % offer.hostname)
                driver.declineOffer(offer.id)
                continue

            attrs = self.getAttributes(offer)
            if self.options.group and attrs.get('group', 'None') not in self.options.group:
                driver.launchTasks(offer.id, [], rf)
                continue

            cpus, mem, gpus = self.getResources(offer)
            logger.debug("got resource offer %s: cpus:%s, mem:%s, gpus:%s at %s",
                         offer.id.value, cpus, mem, gpus, offer.hostname)
            logger.debug("attributes,gpus:%s", attrs.get('gpus', None))

            sid = offer.slave_id.value
            tasks = []
            # GPU tasks only need the offer to advertise a 'gpus' attribute;
            # the gpu amount itself is not accounted for.
            while (len(self.task_waiting) > 0 and cpus >= self.cpus and mem >= self.mem
                   and (self.gpus == 0 or attrs.get('gpus', None) is not None)):
                logger.debug("Accepting resource on slave %s (%s)",
                             offer.slave_id.value, offer.hostname)
                t = self.task_waiting.pop()
                t.state = mesos_pb2.TASK_STARTING
                t.state_time = time.time()

                task = self.create_task(offer, t, cpus)
                tasks.append(task)

                self.task_launched[t.id] = t
                self.slaveTasks.setdefault(sid, set()).add(t.id)

                cpus -= self.cpus
                mem -= self.mem
                # gpus -= self.gpus

            operation = mesos_pb2.Offer.Operation()
            operation.type = mesos_pb2.Offer.Operation.LAUNCH
            operation.launch.task_infos.extend(tasks)
            driver.acceptOffers([offer.id], [operation])

    @safe
    def offerRescinded(self, driver, offer_id):
        """Driver callback: ask for new offers if tasks are still waiting."""
        logger.debug("rescinded offer: %s", offer_id)
        if self.task_waiting:
            self.requestMoreResources()

    def getResources(self, offer):
        """Return ``(cpus, mem, gpus)`` scalar amounts of ``offer`` (0 if absent)."""
        cpus, mem, gpus = 0, 0, 0
        for r in offer.resources:
            if r.name == 'gpus':
                gpus = float(r.scalar.value)
            elif r.name == 'cpus':
                cpus = float(r.scalar.value)
            elif r.name == 'mem':
                mem = float(r.scalar.value)
        return cpus, mem, gpus

    def getResource(self, res, name):
        """Return the scalar value of the resource called ``name``, or 0."""
        for r in res:
            if r.name == name:
                return r.scalar.value
        return 0

    def getAttribute(self, attrs, name):
        """Return the scalar value of attribute ``name``, or ``None`` if absent."""
        for r in attrs:
            if r.name == name:
                return r.scalar.value

    def getAttributes(self, offer):
        """Return ``{attribute name: scalar value}`` for ``offer``.

        NOTE(review): only ``scalar.value`` is read; if 'group' is published
        as a text attribute the group filter in ``resourceOffers`` would not
        see its text -- confirm the attribute types used by the cluster.
        """
        attrs = {}
        for a in offer.attributes:
            attrs[a.name] = a.scalar.value
        return attrs

    def create_task(self, offer, t, cpus):
        """Build the ``TaskInfo`` for task ``t`` on ``offer``.

        :param cpus: cpus still unallocated on the offer; the task requests
            ``min(self.cpus, cpus)``.
        """
        task = mesos_pb2.TaskInfo()
        task.task_id.value = t.id
        task.slave_id.value = offer.slave_id.value
        task.name = "task(%s/%d)" % (t.id, self.taskNum)
        task.executor.MergeFrom(self.executor)
        task.data = compress(cPickle.dumps((t, t.tried), -1))

        cpu = task.resources.add()
        cpu.name = "cpus"
        cpu.type = 0  # mesos_pb2.Value.SCALAR
        cpu.scalar.value = min(self.cpus, cpus)

        mem = task.resources.add()
        mem.name = "mem"
        mem.type = 0  # mesos_pb2.Value.SCALAR
        mem.scalar.value = self.mem
        #
        # gpu = task.resources.add()
        # gpu.name = "gpus"
        # gpu.type = 0  # mesos_pb2.Value.SCALAR
        # gpu.scalar.value = self.gpus
        return task

    @safe
    def statusUpdate(self, driver, update):
        """Driver callback: record a task state change and retry or finish it."""
        logger.debug("Task %s in state [%s]" % (update.task_id.value,
                                                mesos_pb2.TaskState.Name(update.state)))
        tid = str(update.task_id.value)
        if tid not in self.task_launched:
            # check failed after launched
            for t in self.task_waiting:
                if t.id == tid:
                    self.task_launched[tid] = t
                    self.task_waiting.remove(t)
                    break
            else:
                logger.debug("Task %s is finished, ignore it", tid)
                return

        t = self.task_launched[tid]
        t.state = update.state
        t.state_time = time.time()
        self.last_finish_time = t.state_time

        if update.state == mesos_pb2.TASK_RUNNING:
            self.started = True
            # to do task timeout handler
        elif update.state == mesos_pb2.TASK_LOST:
            self.task_launched.pop(tid)
            if t.tried < self.options.retry:
                t.tried += 1
                # Two placeholders, two arguments (the former call passed a
                # third argument, which made logging fail to format the record).
                logger.warning("task %s lost, retry %s", t.id, t.tried)
                self.task_waiting.append(t)  # try again
            else:
                self.taskEnded(t, OtherFailure("task lost,exception:" + str(update.data)),
                               "task lost")
        elif update.state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_FAILED,
                              mesos_pb2.TASK_ERROR, mesos_pb2.TASK_KILLED):
            self.task_launched.pop(tid)

            slave = None
            for s in self.slaveTasks:
                if tid in self.slaveTasks[s]:
                    slave = s
                    self.slaveTasks[s].remove(tid)
                    break

            if update.state == mesos_pb2.TASK_FINISHED:
                self.taskEnded(t, Success(), update.data)

            if update.state == mesos_pb2.TASK_ERROR:
                logger.error(update.message)
                self.taskEnded(t, OtherFailure(update.message), update.message)
                driver.abort()
                self.shutdown()

            # TASK_LOST cannot reach this branch (handled above), so only
            # FAILED and KILLED are tested here.
            if update.state in (mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED):
                if t.tried < self.options.retry:
                    t.tried += 1
                    logger.warning("task %s failed with %s, retry %s",
                                   t.id, update.state, t.tried)
                    self.task_waiting.append(t)  # try again
                else:
                    self.taskEnded(t, OtherFailure("exception:" + str(update.data)), None)
                    logger.error("task %s failed on %s", t.id, slave)

            if not self.task_waiting:
                self.requestMoreResources()  # request more offers again

    @safe
    def check(self, driver):
        """Re-queue tasks stuck in TASK_STARTING for more than 30 seconds."""
        now = time.time()
        # Snapshot the items: entries are popped from the dict inside the loop.
        for tid, t in list(self.task_launched.items()):
            if t.state == mesos_pb2.TASK_STARTING and t.state_time + 30 < now:
                logger.warning("task %s lauched failed, assign again", tid)
                if not self.task_waiting:
                    self.requestMoreResources()
                t.tried += 1
                t.state = -1
                self.task_launched.pop(tid)
                self.task_waiting.append(t)
        # TODO: check run time

    @safe
    def shutdown(self):
        """Wait (up to ``SHUTDOWN_TIMEOUT`` seconds) for launched tasks, then stop the driver.

        Does nothing when the scheduler is not started.
        """
        if not self.started:
            return

        wait_started = datetime.datetime.now()
        while (len(self.task_launched) > 0) and \
                (SHUTDOWN_TIMEOUT > (datetime.datetime.now() - wait_started).seconds):
            time.sleep(1)

        logger.debug("total:%d, task finished: %d,task failed: %d",
                     self.taskNum, self.finished_count, self.fail_count)

        self.shuttingdown = True
        # self.driver.join()
        self.driver.stop(False)
        # self.driver = None
        logger.debug("scheduler stop!!!")
        self.stopped = True
        self.started = False
        # Allow submitTasks() to start a fresh driver; with ``starting`` left
        # set it would skip start_driver() and wait for ``started`` forever.
        self.starting = False

    @safe
    def error(self, driver, code):
        """Driver callback: log the error reported by Mesos."""
        logger.warning("Mesos error message: %s", code)

    def defaultParallelism(self):
        """Return the default parallelism (fixed at 16)."""
        return 16

    def frameworkMessage(self, driver, executor, slave, data):
        """Driver callback: log a message sent by an executor."""
        logger.warning("[slave %s] %s", slave.value, data)

    def executorLost(self, driver, executorId, slaveId, status):
        """Driver callback: log the loss and drop the slave's task set."""
        logger.warning("executor at %s %s lost: %s", slaveId.value, executorId.value, status)
        self.slaveTasks.pop(slaveId.value, None)

    def slaveLost(self, driver, slaveId):
        """Driver callback: log the loss and drop the slave's task set."""
        logger.warning("slave %s lost", slaveId.value)
        self.slaveTasks.pop(slaveId.value, None)

    def killTask(self, job_id, task_id, tried):
        """Ask the driver to kill the task with id ``job_id:task_id:tried``."""
        tid = mesos_pb2.TaskID()
        tid.value = "%s:%s:%s" % (job_id, task_id, tried)
        self.driver.killTask(tid)
print "NASEL FRAMEWORK" reuse_framework_id = framework['id'] #exit(1) #exit(1) except ZookeeperNoMaster as ex: logging.error("Could not find any Mesos Master: {}".format(ex.message)) exit(1) framework = mesos_pb2.FrameworkInfo() framework.user = "******" # Have Mesos fill in the current user. framework.name = "vizceral" framework.failover_timeout=300 #print framework if reuse_framework_id!="": mesos_framework = mesos_pb2.FrameworkID() mesos_framework.value=reuse_framework_id framework.id.MergeFrom(mesos_framework) logging.error("re-registring existing framework "+reuse_framework_id) driver = MesosSchedulerDriver( VizCeralScheduler(), framework, ZK_HOSTS ) driver.run()
goodbye_executor.name = "GoodBye" goodbye_executor.command.value = "python goodbye_executor.py" uri_proto = goodbye_executor.command.uris.add() uri_proto.value = "http://kit-mesos-master:9000/goodbye_executor.py" uri_proto.extract = False framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "hello-world" httpd = SocketServer.TCPServer(("", 9000), SimpleHTTPServer.SimpleHTTPRequestHandler) def create_web_server(): print "serving at port", 9000 httpd.serve_forever() web_thread = threading.Thread(target=create_web_server) web_thread.start() driver = MesosSchedulerDriver( HelloWorldScheduler(hello_executor, goodbye_executor), framework, "zk://localhost:2181/mesos") driver.start() logging.info("Listening for Ctrl-C") signal.signal(signal.SIGINT, shutdown) while True: time.sleep(5) sys.exit(0)
def run(api_url, mesos_master, user, config_dir, state_file, changes_request_limit,
        http_port, stats=None):
    """Run the Changes Mesos scheduler until it is shut down, then exit.

    Builds the scheduler, executor and framework descriptions, installs
    SIGINT/SIGTERM handlers, starts the driver and a Flask thread serving
    ``/api/state_json`` on ``http_port``, polls Changes until shutdown and
    finally calls ``sys.exit`` with 0 when the driver stopped cleanly and 1
    otherwise.  This function does not return.

    :param api_url: URL passed to ``ChangesAPI``.
    :param mesos_master: master address passed to ``MesosSchedulerDriver``.
    :param user: value stored in ``FrameworkInfo.user``.
    :param config_dir: directory containing the ``blacklist`` file.
    :param state_file: scheduler state file, passed to ``ChangesScheduler``.
    :param changes_request_limit: passed through to ``ChangesScheduler``.
    :param http_port: port for the Flask state endpoint.
    :param stats: optional stats object passed to ``ChangesScheduler``.
    """
    scheduler = ChangesScheduler(state_file, api=ChangesAPI(api_url), stats=stats,
                                 blacklist=FileBlacklist(os.path.join(config_dir, 'blacklist')),
                                 changes_request_limit=changes_request_limit)

    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = "default"
    executor.command.value = os.path.abspath("./executor.py")
    executor.name = "Changes Executor"
    executor.source = "changes"

    framework = mesos_pb2.FrameworkInfo()
    framework.user = user
    framework.name = "Changes Scheduler"
    framework.principal = "changes"
    # Give the scheduler 1 week to restart before mesos cancels the tasks.
    # this is the setting recommended by the docs.
    framework.failover_timeout = 3600 * 24 * 7

    if scheduler.framework_id:
        # Reuse the previously known framework id.
        framework.id.value = scheduler.framework_id
        executor.framework_id.value = scheduler.framework_id

    driver = MesosSchedulerDriver(scheduler, framework, mesos_master)

    stopped = threading.Event()

    def handle_interrupt(signal, frame):
        """SIGINT: wait for active tasks to drain, then stop the driver."""
        stopped.set()
        logging.info("Received interrupt, shutting down")
        logging.warning(
            "Not saving state. Will wait for running tasks to finish.")
        scheduler.shuttingDown.set()
        while scheduler.activeTasks > 0:
            logging.info("Waiting for %d tasks to finish running",
                         scheduler.activeTasks)
            sleep(5)
        driver.stop()

    def handle_sigterm(signal, frame):
        """SIGTERM: save state (when configured) and stop the driver."""
        # TODO: Avoid save_state race conditions by having handle_sigterm()
        # only set shuttingDown, then do the actual save-state and driver.stop()
        # in the main thread after all other threads are join()ed.
        # Also, stopped doesn't appear to be used.
        stopped.set()
        logging.info("Received sigterm, shutting down")
        scheduler.shuttingDown.set()
        if scheduler.state_file:
            try:
                scheduler.save_state()
                logging.info("Successfully saved state to %s.", state_file)
            except Exception:
                logging.exception("Failed to save state")
                driver.stop()
                return

            # With `failover` set to true, we do not tell Mesos to stop the existing tasks
            # started by this framework. Instead, the tasks will run for
            # `fail_timeout` more seconds set above or we start a scheduler with
            # the same framework id.
            driver.stop(True)
        else:
            logging.warning(
                "State file location not set. Not saving state. Existing builds will be cancelled."
            )
            driver.stop()

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_sigterm)

    driver.start()
    logging.info("Driver started")

    app = Flask("Changes Mesos Scheduler")
    app.add_url_rule('/api/state_json', 'state_json',
                     json_handler(scheduler.state_json))
    http_thread = threading.Thread(target=app.run, kwargs={'port': http_port})
    # threading.Thread has no terminate(); run the HTTP server as a daemon
    # thread so it ends with the process instead of keeping it alive.
    http_thread.daemon = True
    http_thread.start()

    scheduler.poll_changes_until_shutdown(driver, 5)

    status = 0
    if driver.join() == mesos_pb2.DRIVER_STOPPED:
        logging.info("Driver stopped cleanly.")
    else:
        # Ensure that the driver process terminates.
        status = 1
        logging.info("Stopping driver forcibly.")
        driver.stop()

    # The daemon HTTP thread is torn down by the interpreter on exit.
    logging.info("Stopping HTTP server.")
    logging.info("Clean shutdown complete. Exiting status %d.", status)
    sys.exit(status)
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    help = 'Launches the Scale scheduler'

    def add_arguments(self, parser):
        """Register the ``-m/--master`` option (defaults to ``settings.MESOS_MASTER``)."""
        parser.add_argument('-m', '--master', action='store', default=settings.MESOS_MASTER,
                            help='The master to connect to')

    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info('Scale Scheduler %s', settings.VERSION)

        # SCHEDULER_ZK is optional; a missing setting disables leader election.
        scheduler_zk = getattr(settings, 'SCHEDULER_ZK', None)

        if scheduler_zk is not None:
            import socket
            from scheduler import cluster_utils
            my_id = socket.gethostname()
            cluster_utils.wait_for_leader(scheduler_zk, my_id, self.run_scheduler, mesos_master)
        else:
            # leader election is disabled
            self.run_scheduler(mesos_master)

    def run_scheduler(self, mesos_master):
        """Create the scheduler and driver, run the driver, then exit the process.

        Exits with 0 only when the driver stopped cleanly and the shutdown
        was clean; otherwise exits with 1.

        :param mesos_master: master address passed to ``MesosSchedulerDriver``.
        """
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = os.getenv('DCOS_PACKAGE_FRAMEWORK_NAME', 'Scale')
        webserver_address = os.getenv('SCALE_WEBSERVER_ADDRESS')

        if webserver_address:
            framework.webui_url = webserver_address

        logger.info('Connecting to Mesos master at %s', mesos_master)

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error(
                    'Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error(
                    'Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        try:
            status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        except:  # deliberately catch-all so the shutdown below always runs
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        # Perform a shut down and return any non-zero status.  _shutdown must
        # be *called*: the bare attribute is a bound method, which skipped the
        # cleanup and turned every exit status into a non-zero one.
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        # The attributes only exist once run_scheduler() has created them; a
        # signal may arrive earlier (e.g. while waiting for leadership).
        scheduler = getattr(self, 'scheduler', None)
        driver = getattr(self, 'driver', None)

        try:
            if scheduler:
                scheduler.shutdown()
        except Exception:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        try:
            if driver:
                driver.stop()
        except Exception:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1

        return status
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    option_list = BaseCommand.option_list + (
        make_option('-m', '--master', action='store', type='str', default=settings.MESOS_MASTER,
                    help=('The master to connect to')),
    )

    help = 'Launches the Scale scheduler'

    def handle(self, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info('Scale Scheduler %s', settings.VERSION)

        # SCHEDULER_ZK is optional; a missing setting disables leader election.
        scheduler_zk = getattr(settings, 'SCHEDULER_ZK', None)

        if scheduler_zk is not None:
            import socket
            from scheduler import cluster_utils
            my_id = socket.gethostname()
            cluster_utils.wait_for_leader(scheduler_zk, my_id, self.run_scheduler, mesos_master)
        else:
            # leader election is disabled
            self.run_scheduler(mesos_master)

    def run_scheduler(self, mesos_master):
        """Create the scheduler and driver, run the driver, then exit the process.

        Exits with 0 only when the driver stopped cleanly and the shutdown
        was clean; otherwise exits with 1.

        :param mesos_master: master address passed to ``MesosSchedulerDriver``.
        """
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = 'Scale'

        logger.info('Connecting to Mesos master at %s', mesos_master)

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error('Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error('Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        try:
            status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        except:  # deliberately catch-all so the shutdown below always runs
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        # Perform a shut down and return any non-zero status.  _shutdown must
        # be *called*: the bare attribute is a bound method, which skipped the
        # cleanup and turned every exit status into a non-zero one.
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        # The attributes only exist once run_scheduler() has created them; a
        # signal may arrive earlier (e.g. while waiting for leadership).
        scheduler = getattr(self, 'scheduler', None)
        driver = getattr(self, 'driver', None)

        try:
            if scheduler:
                scheduler.shutdown()
        except Exception:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        try:
            if driver:
                driver.stop()
        except Exception:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1

        return status
uri_proto.extract = False vilfredo = VilfredoMesosScheduler(paretoExecutor, sys.argv[2]) # Use credentials for authentication if they are provided. credential = None if len(sys.argv) > 3: credentialsPath = sys.argv[3] with open(credentialsPath) as f: words = f.readline().split() credential = mesos_pb2.Credential() credential.principal = words[0] credential.secret = words[1] if credential is None: driver = MesosSchedulerDriver(vilfredo, framework, sys.argv[1]) else: driver = MesosSchedulerDriver(vilfredo, framework, sys.argv[1], credential) # driver.run() blocks; we run it in a separate thread. def run_driver_async(): status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1 driver.stop() sys.exit(status) framework_thread = threading.Thread(target=run_driver_async) framework_thread.start() print "(Listening for Ctrl-C)" signal.signal(signal.SIGINT, hard_shutdown)
raise ValueError('ALL DONE') logging.info("Recieved resource offers: {}".format( [o.id.value for o in offers])) # whenever we get an offer, we accept it and use it to launch a task that # just echos hello world to stdout for offer in offers: step = steps.pop() task = new_task(offer) task.command.value = "echo Running step {}".format(step) logging.info("Launching task {task} " "using offer {offer}.".format(task=task.task_id.value, offer=offer.id.value)) tasks = [task] driver.launchTasks(offer.id, tasks) if not steps: break if __name__ == '__main__': framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "runner" scheduler = HelloWorldScheduler() scheduler.define_tasks() driver = MesosSchedulerDriver( scheduler, framework, "zk://localhost:2181/mesos" # assumes running on the master ) driver.run()
mem.type = mesos_pb2.Value.SCALAR mem.scalar.value = 1 time.sleep(2) logging.info("Launching task {task} " "using offer {offer}.".format(task=task.task_id.value, offer=offer.id.value)) tasks = [task] driver.launchTasks(offer.id, tasks) else: driver.stop() if __name__ == '__main__': # make us a framework framework = mesos_pb2.FrameworkInfo() framework.user = "******" # Have Mesos fill in the current user. framework.name = "hello-world" framework.checkpoint = True driver = MesosSchedulerDriver( HelloWorldScheduler(), framework, "127.0.0.1:5050/" # assumes running on the master ) sys.exit(0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1)
uri_proto.extract = False renderExecutor.name = "Renderer" framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "RENDLER" try: maxRenderTasks = int(sys.argv[3]) except: maxRenderTasks = 0 rendler = RenderingCrawler(sys.argv[1], maxRenderTasks, crawlExecutor, renderExecutor) driver = MesosSchedulerDriver(rendler, framework, sys.argv[2]) # driver.run() blocks; we run it in a separate thread def run_driver_async(): status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1 driver.stop() sys.exit(status) framework_thread = Thread(target=run_driver_async, args=()) framework_thread.start() print "(Listening for Ctrl-C)" signal.signal(signal.SIGINT, graceful_shutdown) while framework_thread.is_alive(): time.sleep(1)
def run(api_url, mesos_master, user, config_dir, state_file, stats=None):
    """Run the Changes Mesos scheduler until a signal stops it, then exit.

    Starts the driver, blocks until SIGINT or SIGTERM sets the ``stopped``
    event, joins the driver and calls ``sys.exit`` with 0 when the driver
    reports ``DRIVER_STOPPED`` and 1 otherwise.  This function does not
    return.
    """
    changes_api = ChangesAPI(api_url)
    scheduler = ChangesScheduler(config_dir, state_file, api=changes_api, stats=stats)

    # Description of the executor launched for each task.
    executor = mesos_pb2.ExecutorInfo()
    executor.name = "Changes Executor"
    executor.source = "changes"
    executor.executor_id.value = "default"
    executor.command.value = os.path.abspath("./executor.py")

    # Description of this framework as registered with the master.
    framework = mesos_pb2.FrameworkInfo()
    framework.name = "Changes Scheduler"
    framework.principal = "changes"
    framework.user = user
    # Give the scheduler 30s to restart before mesos cancels the tasks.
    framework.failover_timeout = 30

    known_framework_id = scheduler.framework_id
    if known_framework_id:
        framework.id.value = scheduler.framework_id
        executor.framework_id.value = scheduler.framework_id

    driver = MesosSchedulerDriver(scheduler, framework, mesos_master)
    stopped = threading.Event()

    def on_sigint(signum, frame):
        """Drain running tasks without saving state, then stop the driver."""
        stopped.set()
        logging.info("Received interrupt, shutting down")
        logging.warning("Not saving state. Will wait for running tasks to finish.")
        scheduler.shuttingDown.set()
        while scheduler.activeTasks > 0:
            logging.info("Waiting for %d tasks to finish running", scheduler.activeTasks)
            sleep(5)
        driver.stop()

    def on_sigterm(signum, frame):
        """Save state when a state file is configured, then stop the driver."""
        stopped.set()
        logging.info("Received sigterm, shutting down")
        scheduler.shuttingDown.set()

        if not scheduler.state_file:
            logging.warning("State file location not set. Not saving state. Existing builds will be cancelled.")
            driver.stop()
            return

        try:
            scheduler.save_state()
            logging.info("Successfully saved state to %s.", state_file)
        except Exception:
            logging.exception("Failed to save state")
            driver.stop()
            return

        # With `failover` set to true, we do not tell Mesos to stop the existing tasks
        # started by this framework. Instead, the tasks will run for
        # `fail_timeout` more seconds set above or we start a scheduler with
        # the same framework id.
        driver.stop(True)

    signal.signal(signal.SIGINT, on_sigint)
    signal.signal(signal.SIGTERM, on_sigterm)

    driver.start()
    logging.info("Driver started")

    # Park the main thread until one of the handlers sets the event.
    while not stopped.is_set():
        stopped.wait(3)

    if driver.join() == mesos_pb2.DRIVER_STOPPED:
        status = 0
    else:
        status = 1
        # Ensure that the driver process terminates.
        driver.stop()
    sys.exit(status)
task.command.value = "python install_nuage_cni.py %s %s" % ( sys.argv[2], sys.argv[3]) time.sleep(2) logging.info( "Launching task {task} " "using offer {offer}.".format(task=task.task_id.value, offer=offer.id.value)) tasks = [task] driver.launchTasks(offer.id, tasks) def statusUpdate(self, driver, update): print "Task %s is in state %d" % (update.task_id.value, update.state) if update.state == mesos_pb2.TASK_FINISHED: print "Task successfully executed on agent node" self.serviced_agents += 1 if self.serviced_agents == len(agent_list): driver.stop() if __name__ == '__main__': # Create a Mesos framework for CNI installation framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "install-cni" driver = MesosSchedulerDriver( InstallCNIScheduler(), framework, sys.argv[1] + ":5050" # assumes running on the master ) driver.run()
elif update.task_id.value == "1": if update.message == "Command terminated with signal Killed: 9": print "#### Passed forced shutdown" else: print "!!!! Failed forced shutdown" self.tasksFinished += 1 if self.tasksFinished == TOTAL_TASKS: driver.stop() if __name__ == "__main__": if len(sys.argv) != 2: print "Usage: %s master" % sys.argv[0] sys.exit(1) framework = mesos_pb2.FrameworkInfo() framework.user = "" framework.name = "Escalation Framework (Python)" driver = MesosSchedulerDriver( EscalationScheduler(), framework, sys.argv[1]) status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1 # Ensure that the driver process terminates. driver.stop(); sys.exit(status)
def hard_shutdown(signal, frame): print "Shutting down..." driver.stop() if __name__ == "__main__": if len(sys.argv) != 3: print "Usage: %s master command" % sys.argv[0] sys.exit(1) framework = mesos_pb2.FrameworkInfo() framework.user = "" framework.name = "Lanceur (Simple Launcher Framework in python)" driver = MesosSchedulerDriver(LaunchScheduler(sys.argv[2]), framework, sys.argv[1]) print "Starting Mesos driver" # driver.run() blocks; we run it in a separate thread def run_driver_async(): status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1 driver.stop() sys.exit(status) framework_thread = threading.Thread(target=run_driver_async) framework_thread.start() print "(Listening for Ctrl-C)" signal.signal(signal.SIGINT, hard_shutdown) while framework_thread.is_alive():
class Command(BaseCommand):
    '''Command that launches the Scale scheduler
    '''

    option_list = BaseCommand.option_list + (
        make_option('-m', '--master', action='store', type='str', default=settings.MESOS_MASTER,
                    help=('The master to connect to')),
    )

    help = 'Launches the Scale scheduler'

    def handle(self, **options):
        '''See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.  It runs the Mesos driver until it
        stops and then exits the process: status 0 only when the driver
        stopped cleanly and the shutdown was clean, 1 otherwise.
        '''

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info(u'Command starting: scale_scheduler')
        logger.info(u' - Master: %s', mesos_master)

        executor = mesos_pb2.ExecutorInfo()
        executor.executor_id.value = 'scale'
        executor.command.value = '%s %s scale_executor' % (settings.PYTHON_EXECUTABLE,
                                                           settings.MANAGE_FILE)
        executor.name = 'Scale Executor (Python)'

        self.scheduler = ScaleScheduler(executor)

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = 'Scale Framework (Python)'

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error('Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error('Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1

        # Perform any required clean up operations like stopping background threads.
        # Run the cleanup unconditionally: ``status or self._shutdown()``
        # short-circuits and skipped it whenever the driver had failed.
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info(u'Command completed: scale_scheduler')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        '''See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        '''
        logger.info(u'Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        '''Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        '''
        status = 0

        # The attributes only exist once handle() has created them; a signal
        # may arrive before that point.
        scheduler = getattr(self, 'scheduler', None)
        driver = getattr(self, 'driver', None)

        try:
            if scheduler:
                scheduler.shutdown()
        except Exception:
            logger.exception('Failed to properly shutdown scale scheduler.')
            status = 1

        try:
            if driver:
                driver.stop()
        except Exception:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1

        return status
framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "hello-world" helloWorldScheduler = HelloWorldScheduler(hello_executor, goodbye_executor) httpd = SocketServer.TCPServer( ("", 9000), SimpleHTTPServer.SimpleHTTPRequestHandler) def create_web_server(): print "serving at port", 9000 httpd.serve_forever() thread = threading.Thread(target=create_web_server) thread.start() driver = MesosSchedulerDriver( helloWorldScheduler, framework, "zk://localhost:2181/mesos" ) driver.start() logging.info("Listening for Ctrl-C") signal.signal(signal.SIGINT, graceful_shutdown) while True: time.sleep(5) sys.exit(0)
except Exception as e: raise e finally: mesos_lock.release() db.rhino_tasks.update({'name': task['name']}, {'$set': {'state': 'KILLED'}}) kill_those_that_depend_on(task['name']) kill_those_that_depend_on(doc['name']) db.rhino_tasks.update({'mesos_id': status.task_id.value}, {'$set': {'state': state, 'retCode': ret_code, 'message': message}}) if __name__ == '__main__': try: web_server_thread = threading.Thread(target=web_server, args=()) web_server_thread.start() framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "rhino" mesos_driver = MesosSchedulerDriver( AppsomaRhinoScheduler(), framework, "zk://" + config['zookeeper_hosts'] + "/mesos" ) mesos_driver.run() except KeyboardInterrupt: print "KeyboardInterrupt" os.kill(os.getpid(), 9)
def start_factory_mesos():
    """Command-line entry point: start the Mesos scheduler driver and feed it tasks.

    Parses the options, fills unset ones from ``Conf``, creates a
    ``FactoryMesos`` scheduler together with its ``MesosSchedulerDriver`` and
    spawns the task fetchers: a single MySQL poller when the warehouse address
    has more than two comma-separated fields, otherwise one Kafka consumer per
    scheduler priority.  The calling thread then polls the scheduler until it
    reports ``stopped`` (or Ctrl-C is pressed), stops the driver and exits the
    process with ``sched.status``.

    NOTE(review): the original indentation of this function was lost; the
    nesting below is reconstructed from the syntax and should be confirmed.
    """
    global pyroLoopCondition

    parser = OptionParser(
        usage="Usage: python factorymesos.py [options] <command>")
    parser.allow_interspersed_args = False
    parser.add_option("-s", "--master", type="string", default="",
                      help="url of master (mesos://172.31.252.180:5050)")
    parser.add_option("-f", "--factory", type="string", default="",
                      help="host:port of master (172.31.252.180:6666)")
    parser.add_option(
        "-w", "--warehouse_addr", type="string", default="",
        help="kafka-172.31.252.182:9092|mysql-172.31.254.25:3306,db,username,password")
    parser.add_option("-p", "--task_per_node", type="int", default=0,
                      help="max number of tasks on one node (default: 0)")
    parser.add_option("-I", "--image", type="string",
                      help="image name for Docker")
    parser.add_option("-V", "--volumes", type="string",
                      help="volumes to mount into Docker")
    parser.add_option("-r", "--retry", type="int", default=0,
                      help="retry times when failed (default: 0)")
    parser.add_option(
        "-e", "--config", type="string",
        default="/work/opencluster/config.ini",
        help="absolute path of configuration file(default:/work/opencluster/config.ini)")
    parser.add_option("-g", "--group", type="string", default='',
                      help="which group to run (default: ''")
    parser.add_option("-q", "--quiet", action="store_true", help="be quiet")
    parser.add_option("-v", "--verbose", action="store_true",
                      help="show more useful log")

    (options, command) = parser.parse_args()
    # NOTE(review): parse_args() returns an options object that looks always
    # truthy, so this branch appears unreachable; the intent may have been
    # `if not command:` — confirm before changing.
    if not options:
        parser.print_help()
        sys.exit(2)

    # Command-line values win; anything left empty falls back to the config.
    if options.config:
        Conf.setConfigFile(options.config)
    options.master = options.master or Conf.getMesosMaster()
    options.warehouse_addr = options.warehouse_addr or Conf.getWareHouseAddr()

    # NOTE(review): `servs` / `server` are not used further down in this
    # function; kept because the lookups may be relied on as validation.
    servers = options.factory or Conf.getFactoryServers()
    servs = servers.split(",")
    server = servs[0].split(":")

    # --quiet takes precedence over --verbose.
    if options.quiet:
        options.logLevel = logging.ERROR
    elif options.verbose:
        options.logLevel = logging.DEBUG
    else:
        options.logLevel = logging.INFO
    setLogger(Conf.getFactoryServiceName(), "MESOS", options.logLevel)

    # Implicit acknowledgements stay on unless the environment opts out.
    implicitAcknowledgements = (
        0 if os.getenv("MESOS_EXPLICIT_ACKNOWLEDGEMENTS") else 1)

    sched = FactoryMesos(options, command, implicitAcknowledgements)
    driver = MesosSchedulerDriver(sched, sched.framework, options.master,
                                  implicitAcknowledgements)
    driver.start()
    logger.debug("Mesos Scheudler driver started")

    warehouse_addrs = options.warehouse_addr.split(",")

    def fetchTasksFromMySQL():
        """Poll ``t_task`` for pending rows and hand them to the scheduler."""
        global pyroLoopCondition
        mysqlIpAndPort = warehouse_addrs[0].split(":")
        last_data_time = time.time()

        while pyroLoopCondition:
            # A fresh connection is opened for every polling round.
            db = MySQLdb.connect(host=mysqlIpAndPort[0],
                                 port=int(mysqlIpAndPort[1]),
                                 db=warehouse_addrs[1],
                                 user=warehouse_addrs[2],
                                 passwd=warehouse_addrs[3])
            try:
                cur = db.cursor()
                curUpt = db.cursor()
                dataResults = cur.execute(
                    "select task_id,task_desc,task_start_time,status from t_task where status=0 order by priority asc limit 200"
                )
                results = cur.fetchmany(dataResults)

                for r in results:
                    # SECURITY NOTE: unpickling runs arbitrary code if the
                    # stored payload is attacker-controlled; the task table
                    # must only be writable by trusted producers.
                    sched.append_task(cPickle.loads(r[1]))
                    # Parameterized so that ids of any type, or ids containing
                    # quotes, are escaped by the database driver instead of
                    # being concatenated into the statement.
                    curUpt.execute(
                        "update t_task set task_start_time=now(),status=1 "
                        "where task_id=%s", (r[0],))

                if len(results) > 0:
                    db.commit()
                    last_data_time = time.time()
                    driver.reviveOffers()

                # Back off while the scheduler backlog is large ...
                if sched.tasks_total_len() > MAX_WAITING_TASK:
                    time.sleep(2)
                # ... and back off longer when no rows arrived for a while.
                if time.time() - last_data_time > MAX_EMPTY_TASK_PERIOD:
                    time.sleep(10)

                if cur:
                    cur.close()
                if curUpt:
                    curUpt.close()
            finally:
                db.close()

    def fetchTasksFromKafka(priority):
        """Consume the ``OpenCluster<priority>`` topic and queue its tasks."""
        global pyroLoopCondition
        consumer = KafkaConsumer('OpenCluster%s' % priority,
                                 bootstrap_servers=[options.warehouse_addr],
                                 group_id="cnlab",
                                 auto_commit_enable=True,
                                 auto_commit_interval_ms=30 * 1000,
                                 auto_offset_reset='smallest')
        last_data_time = time.time()

        while pyroLoopCondition:
            for message in consumer.fetch_messages():
                logger.error("%s:%s:%s: key=%s " %
                             (message.topic, message.partition,
                              message.offset, message.key))
                # SECURITY NOTE: see fetchTasksFromMySQL — the payload is
                # unpickled, so the topic must only carry trusted data.
                sched.append_task(cPickle.loads(message.value))
                consumer.task_done(message)
                last_data_time = time.time()

            if sched.tasks_len(priority) > MAX_WAITING_TASK:
                time.sleep(2)
            if time.time() - last_data_time > MAX_EMPTY_TASK_PERIOD:
                time.sleep(10)

    # More than two comma-separated fields means the MySQL form of the
    # warehouse address; otherwise one Kafka consumer per priority is started.
    if len(warehouse_addrs) > 2:
        spawn(fetchTasksFromMySQL)
    else:
        for i in range(1, sched.priority_size + 1):
            spawn(fetchTasksFromKafka, i)

    def handler(signm, frame):
        """Signal callback: ask the scheduler to stop."""
        logger.warning("got signal %d, exit now", signm)
        sched.stop(3)

    signal.signal(signal.SIGTERM, handler)
    signal.signal(signal.SIGABRT, handler)

    try:
        while not sched.stopped:
            time.sleep(0.5)
            sched.check(driver)

            # Ask for offers again when none arrived for 60s plus 0-5s of
            # random jitter.
            now = time.time()
            if now > sched.last_offer_time + 60 + random.randint(0, 5):
                logger.warning("too long to get offer, reviving...")
                sched.last_offer_time = now
                driver.reviveOffers()
    except KeyboardInterrupt:
        logger.warning(
            'stopped by KeyboardInterrupt. The Program is exiting gracefully! Please wait...'
        )
        sched.stop(4)

    # Tell the fetcher loops to finish and give them a moment before the
    # driver is stopped.
    pyroLoopCondition = False
    time.sleep(5)

    driver.stop(False)
    sys.exit(sched.status)
# NOTE(review): the statements down to `return task` are the tail of a
# task-building function whose `def` line (and its `offer` parameter) lies
# above this excerpt; re-indent them under it when restoring the file layout.
task = mesos_pb2.TaskInfo()
# A random UUID serves as the task id and is also embedded in the task name.
# NOTE(review): this local shadows the builtin id().
id = uuid.uuid4()
task.task_id.value = str(id)
# The task is bound to the slave that made the offer.
task.slave_id.value = offer.slave_id.value
task.name = "task {}".format(str(id))

# Scalar resource request: 0.1 "cpus".
cpus = task.resources.add()
cpus.name = "cpus"
cpus.type = mesos_pb2.Value.SCALAR
cpus.scalar.value = 0.1

# Scalar resource request: 32 "mem" (presumably megabytes — TODO confirm).
mem = task.resources.add()
mem.name = "mem"
mem.type = mesos_pb2.Value.SCALAR
mem.scalar.value = 32

return task


if __name__ == '__main__':
    log("XXX framework started")

    framework = mesos_pb2.FrameworkInfo()
    # NOTE(review): the user value looks redacted in this copy — confirm.
    framework.user = "******"
    framework.name = "nixos-test-framework"
    # The master address is the first command-line argument with port 5050
    # appended.
    driver = MesosSchedulerDriver(
        NixosTestScheduler(),
        framework,
        sys.argv[1] + ":5050"
    )
    driver.run()
def main():
    """Run the Mesos framework and throttle resource offers around the workload.

    The scheduler driver runs on a worker thread.  While that thread is alive,
    this function suppresses offers when the job limit is reached or when the
    scheduler has no work, revives them once work is available again, and on a
    shutdown request suppresses offers and waits for the worker thread to end.

    NOTE(review): the original indentation was lost; the nesting here is
    reconstructed from the syntax and should be confirmed.
    """
    global shutdown
    global accept_offers
    global driver

    cfg = get_configuration()

    # Configure logging
    setup_logging(cfg)

    def trace(message):
        # Debug output is gated on the configured debug level.
        if cfg.debug > 0:
            logger.debug(message)

    framework = mesos_framework(cfg)
    #credentials = mesos_credentials(cfg)

    mesos_scheduler = OurJobScheduler(cfg)
    #driver = MesosSchedulerDriver(mesos_scheduler, framework,
    #                              cfg.mesos.master, cfg.mesos.imp_ack,
    #                              credentials)
    driver = MesosSchedulerDriver(mesos_scheduler, framework,
                                  cfg.mesos.master, cfg.mesos.imp_ack)

    shutdown = Shutdown()
    accept_offers = AcceptOffers()

    def run_driver_async():
        # driver.run() blocks, which is why it lives on its own thread.
        if driver.run() == MesosPb2.DRIVER_STOPPED:
            status = 0
        else:
            status = 1
        trace('Stopping Driver')
        driver.stop()
        logger.info('Terminating Framework')
        sys.exit(status)

    def at_capacity():
        # True while the number of launched tasks equals the configured limit.
        return mesos_scheduler.tasks_launched == cfg.mesos.max_jobs

    def wait_for_work_then_revive():
        # Sleep until more processing is requested, then ask for offers again
        # unless a shutdown came in meanwhile.
        while not shutdown.flag and not mesos_scheduler.have_work():
            trace('Waiting for more work')
            sleep(5)
        if not shutdown.flag:
            trace('Reviving Offers')
            driver.reviveOffers()

    framework_thread = Thread(target=run_driver_async, args=())
    framework_thread.start()

    logger.info('Beginning Processing')

    while framework_thread.is_alive():
        # A requested shutdown: stop taking offers and wait for the framework
        # thread to complete.
        if shutdown.flag:
            trace('Suppressing Offers')
            driver.suppressOffers()
            while framework_thread.is_alive():
                logger.debug('Child Thread Still Alive')
                sleep(5)
            break

        # The max number of jobs is already running: stop taking offers and
        # wait for some jobs to finish.
        if at_capacity():
            driver.suppressOffers()
            trace('Suppressing Offers')

            # Sleep until we have room for more tasks
            while not shutdown.flag and at_capacity():
                trace('Waiting for more available tasks')
                sleep(5)

            wait_for_work_then_revive()

        # No new work to be done: keep offers suppressed until there is some.
        if not shutdown.flag and not mesos_scheduler.have_work():
            driver.suppressOffers()
            wait_for_work_then_revive()

        # Sleep for a second, so that we are not flying through the loop
        sleep(1)

    logger.info('Terminated Processing')
# NOTE(review): the first two statements are the tail of a method whose
# beginning (including the opening of this logging call) lies above this
# excerpt. `statusUpdate` takes `self` and is presumably a method of the
# scheduler class; re-indent when restoring the file layout.
"using offer %s.", task.task_id.value, offer.id.value)
driver.launchTasks(offer.id, [task])


def statusUpdate(self, driver, update):
    '''
    Log the state carried by a task status update.

    When a task is started, over, killed or lost (slave crash, ....), this
    method will be triggered with a status message.

    :param driver: the scheduler driver (unused here).
    :param update: the status message; its task id and state are logged.
    '''
    # TaskState.Name() turns the numeric state into its enum name for the log.
    logging.info("Task %s is in state %s" %
                 (update.task_id.value, mesos_pb2.TaskState.Name(update.state)))


if __name__ == '__main__':
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "hello-world"
    driver = MesosSchedulerDriver(
        HelloWorldScheduler(),
        framework,
        "zk://localhost:2181/mesos"  # assumes running on the master
    )
    driver.start()
    logging.info("Listening for Ctrl-C")
    # `shutdown` is the SIGINT handler; it is defined outside this excerpt.
    signal.signal(signal.SIGINT, shutdown)
    # Keep the main thread alive while waiting for the signal.
    while True:
        time.sleep(5)
    # NOTE(review): assumed to sit after the loop (unreachable unless the
    # handler raises); confirm against the original layout.
    sys.exit(0)
# NOTE(review): `registered` and `resourceOffers` take `self` and are
# presumably methods of the scheduler class whose header lies above this
# excerpt; re-indent them under it when restoring the file layout.
def registered(self, driver, framework_id, master_info):
    """Scheduler callback: log the framework id received at registration."""
    message = "Registered with framework id: {}".format(framework_id)
    logging.info(message)


def resourceOffers(self, driver, offers):
    """Scheduler callback: launch one "echo hello world" task per offer.

    Every offer is accepted and used for a single task that just echoes
    hello world to stdout; there is a two second pause before each launch.
    """
    offer_ids = [offer.id.value for offer in offers]
    logging.info("Recieved resource offers: {}".format(offer_ids))

    for offer in offers:
        hello_task = new_task(offer)
        hello_task.command.value = "echo hello world"
        time.sleep(2)

        launch_message = "Launching task {task} using offer {offer}.".format(
            task=hello_task.task_id.value, offer=offer.id.value)
        logging.info(launch_message)

        driver.launchTasks(offer.id, [hello_task])


if __name__ == '__main__':
    # Describe the framework, then hand control to the driver.
    framework = mesos_pb2.FrameworkInfo()
    framework.name = "hello-world"
    framework.user = ""  # Have Mesos fill in the current user.

    driver = MesosSchedulerDriver(
        HelloWorldScheduler(),
        framework,
        "zk://34.215.39.254/mesos",  # assumes running on the master
    )
    driver.run()