def kill_children(signal, frame):
    """Signal handler: terminate the mesos and relay children, then exit.

    Uses ``mesos``, ``relay``, ``ns``, ``log`` and ``sys`` from the
    surrounding scope.  Each child is terminated on a best-effort basis: a
    failure to terminate one child is logged and does not prevent the
    attempt on the other.  The process always exits with status 1.

    :param signal: the signal number delivered to this process.  Note that
        this name shadows the ``signal`` module inside the handler.
    :param frame: the interrupted stack frame (unused).
    """
    log.error(
        'Received a signal that is trying to terminate this process.'
        ' Terminating mesos and relay child processes!',
        extra=dict(
            mesos_framework_name=ns.mesos_framework_name,
            signal=signal))

    # (child process, success message, failure message)
    children = (
        (mesos,
         'terminated mesos scheduler',
         'could not terminate mesos scheduler'),
        (relay,
         'terminated relay',
         'could not terminate relay'),
    )
    for child, ok_msg, fail_msg in children:
        try:
            child.terminate()
            log.info(
                ok_msg,
                extra=dict(mesos_framework_name=ns.mesos_framework_name))
        except Exception:
            # Catch Exception rather than everything: a bare ``except:``
            # would also swallow SystemExit / KeyboardInterrupt and report
            # them as a failed termination.
            log.exception(
                fail_msg,
                extra=dict(mesos_framework_name=ns.mesos_framework_name))
    sys.exit(1)
def reregistered(self, driver, masterInfo):
    """Log the master's details when the framework re-registers.

    :param driver: the scheduler driver (not used here).
    :param masterInfo: object describing the master; its ``pid``,
        ``hostname``, ``id``, ``ip`` and ``port`` attributes are logged.
    """
    master_details = {
        "master_pid": masterInfo.pid,
        "master_hostname": masterInfo.hostname,
        "master_id": masterInfo.id,
        "master_ip": masterInfo.ip,
        "master_port": masterInfo.port,
        "mesos_framework_name": self.ns.mesos_framework_name,
    }
    log.info("Re-registered with master", extra=master_details)
def reregistered(self, driver, masterInfo):
    """Emit an info log entry describing the master we re-registered with.

    :param driver: the scheduler driver (not used here).
    :param masterInfo: object describing the master; its ``pid``,
        ``hostname``, ``id``, ``ip`` and ``port`` attributes are logged
        under ``master_<attribute>`` keys.
    """
    logged_attrs = ("pid", "hostname", "id", "ip", "port")
    fields = {
        "master_%s" % attr: getattr(masterInfo, attr)
        for attr in logged_attrs
    }
    fields["mesos_framework_name"] = self.ns.mesos_framework_name
    log.info("Re-registered with master", extra=fields)
def _registered(self, driver, frameworkId, masterInfo):
    """Signal that mesos is ready, then log the registration details.

    Notifies one waiter on the ``self.mesos_ready`` condition and logs the
    framework id together with the master's details.

    :param driver: the scheduler driver (not used here).
    :param frameworkId: object whose ``value`` attribute is logged as the
        framework id.
    :param masterInfo: object describing the master; its ``pid``,
        ``hostname``, ``id``, ``ip`` and ``port`` attributes are logged.
    """
    self.mesos_ready.acquire()
    try:
        self.mesos_ready.notify()
    finally:
        # Always release, so a failing notify() cannot leave the
        # condition's lock held.
        self.mesos_ready.release()
    log.info(
        "Registered with master",
        extra=dict(
            framework_id=frameworkId.value,
            master_pid=masterInfo.pid,
            master_hostname=masterInfo.hostname,
            master_id=masterInfo.id,
            master_ip=masterInfo.ip,
            master_port=masterInfo.port,
            mesos_framework_name=self.ns.mesos_framework_name,
        ))
def _registered(self, driver, frameworkId, masterInfo):
    """Wake a waiter on ``self.mesos_ready`` and log the registration.

    :param driver: the scheduler driver (not used here).
    :param frameworkId: object whose ``value`` attribute is logged as the
        framework id.
    :param masterInfo: object describing the master; its ``pid``,
        ``hostname``, ``id``, ``ip`` and ``port`` attributes are logged.
    """
    ready = self.mesos_ready
    ready.acquire()
    try:
        ready.notify()
    finally:
        # Release even if notify() raises, so the lock is never leaked.
        ready.release()

    registration = dict(
        framework_id=frameworkId.value,
        master_pid=masterInfo.pid,
        master_hostname=masterInfo.hostname,
        master_id=masterInfo.id,
        master_ip=masterInfo.ip,
        master_port=masterInfo.port,
        mesos_framework_name=self.ns.mesos_framework_name,
    )
    log.info("Registered with master", extra=registration)
def init_mesos_scheduler(ns, MV, exception_sender, mesos_ready):
    """Build the Mesos scheduler driver, run it, and exit the process.

    Imports the mesos bindings lazily, builds a ``FrameworkInfo`` from the
    ``ns`` options, runs a ``MesosSchedulerDriver`` wrapping a ``Scheduler``
    and finally calls ``sys.exit`` with status 0 if the driver reported
    ``DRIVER_STOPPED`` and 1 otherwise.

    :param ns: options namespace; reads ``mesos_framework_name``,
        ``mesos_framework_principal``, ``mesos_framework_role``,
        ``mesos_checkpoint`` and ``mesos_master``.
    :param MV: passed through to ``Scheduler``.
    :param exception_sender: passed through to ``Scheduler``.
    :param mesos_ready: passed through to ``Scheduler``.
    :raises ImportError: re-raised (after logging) when ``mesos.native``
        cannot be imported.
    """
    import mesos.interface
    from mesos.interface import mesos_pb2
    try:
        import mesos.native
    except ImportError:
        log.error(
            "Oops! Mesos native bindings are not installed. You can download"
            " these binaries from mesosphere.",
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
        raise

    log.info(
        'starting mesos scheduler',
        extra=dict(mesos_framework_name=ns.mesos_framework_name))

    # describe this framework to the master
    framework_info = mesos_pb2.FrameworkInfo()
    framework_info.user = ""  # Have Mesos fill in the current user.
    framework_info.name = "Relay.Mesos: %s" % ns.mesos_framework_name
    if ns.mesos_framework_principal:
        framework_info.principal = ns.mesos_framework_principal
    if ns.mesos_framework_role:
        framework_info.role = ns.mesos_framework_role
    if ns.mesos_checkpoint:
        framework_info.checkpoint = True

    # wire the scheduler into a driver
    scheduler = Scheduler(
        MV=MV,
        exception_sender=exception_sender,
        mesos_ready=mesos_ready,
        ns=ns)
    sched_driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework_info, ns.mesos_master)
    atexit.register(sched_driver.stop)

    # block until the driver finishes, then translate its result
    if sched_driver.run() == mesos_pb2.DRIVER_STOPPED:
        exit_status = 0
    else:
        exit_status = 1
    sched_driver.stop()  # Ensure that the driver process terminates.
    sys.exit(exit_status)
def main(ns):
    """
    Run Relay as a Mesos framework.

    Relay's event loop and the Mesos scheduler each run in separate processes
    and communicate through a multiprocessing.Pipe.

    These two processes bounce control back and forth between mesos
    resourceOffers and Relay's warmer/cooler functions.
    Relay warmer/cooler functions request that mesos tasks get spun up, but
    those requests are only filled if the mesos scheduler receives enough
    relevant offers.  Relay's requests don't build up: only the largest
    request since the last fulfilled request is fulfilled at the moment
    enough mesos resources are available.

    :param ns: options namespace; reads ``mesos_master``,
        ``mesos_task_resources``, ``mesos_framework_name``, ``warmer``,
        ``cooler`` and ``delay``.

    This function does not return normally.  It calls ``sys.exit(1)`` when
    ``--mesos_master`` is missing, and again after the monitoring loop
    stops because a child process died or reported an exception.
    """
    if ns.mesos_master is None:
        log.error(
            "Oops!  You didn't define --mesos_master",
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
        build_arg_parser().print_usage()
        sys.exit(1)
    if not ns.mesos_task_resources:
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        log.warning(
            "You didn't define '--mesos_task_resources'."
            "  Tasks may not start on slaves",
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
    log.info(
        "Starting Relay Mesos!",
        extra={k: str(v) for k, v in ns.__dict__.items()})

    # a distributed value storing the num and type of tasks mesos scheduler
    # should create at any given moment in time.
    # Sign of MV determines task type: warmer or cooler
    # ie. A positive value of n means n warmer tasks
    MV = mp.Array('d', [0, 0])  # typecode 'd': shared array of 2 C doubles
    # store exceptions that may be raised
    exception_receiver, exception_sender = mp.Pipe(False)
    # notify relay when mesos framework is ready
    mesos_ready = mp.Condition()

    # copy and then override warmer and cooler
    ns_relay = ns.__class__(**vars(ns))
    if ns.warmer:
        ns_relay.warmer = warmer_cooler_wrapper(MV, ns)
    if ns.cooler:
        ns_relay.cooler = warmer_cooler_wrapper(MV, ns)

    mesos_name = "Relay.Mesos Scheduler"
    mesos = mp.Process(
        target=catch(init_mesos_scheduler, exception_sender),
        kwargs=dict(ns=ns, MV=MV, exception_sender=exception_sender,
                    mesos_ready=mesos_ready),
        name=mesos_name)
    relay_name = "Relay.Runner Event Loop"
    relay = mp.Process(
        target=catch(init_relay, exception_sender),
        args=(ns_relay, mesos_ready, ns.mesos_framework_name),
        name=relay_name)
    mesos.start()  # start mesos framework
    relay.start()  # start relay's loop
    set_signals(mesos, relay, ns)

    # Supervise both children until one of them fails.
    while True:
        if exception_receiver.poll():
            # Drain the pipe; the payload itself is not inspected here.
            exception_receiver.recv()
            log.error(
                'Terminating child processes because one of them raised'
                ' an exception', extra=dict(
                    is_relay_alive=relay.is_alive(),
                    is_mesos_alive=mesos.is_alive(),
                    mesos_framework_name=ns.mesos_framework_name))
            break
        if not relay.is_alive():
            log.error(
                "Relay died.  Check logs to see why.",
                extra=dict(mesos_framework_name=ns.mesos_framework_name))
            break
        if not mesos.is_alive():
            log.error(
                "Mesos Scheduler died and didn't notify me of its exception."
                "  This may be a code bug.  Check logs.",
                extra=dict(mesos_framework_name=ns.mesos_framework_name))
            break
        # save cpu cycles by checking for subprocess failures less often:
        # poll every ns.delay seconds, but never wait longer than 5 seconds
        time.sleep(min(ns.delay, 5))

    relay.terminate()
    mesos.terminate()
    sys.exit(1)