def start_driver(self):
    """Register a Mesos framework for this scheduler and start its driver.

    The framework name is '[dpark] ' followed by the absolute path of the
    running script and its command-line arguments; names longer than 256
    characters are cut to 256 characters with '...' appended.

    After the driver is started, a background ``check`` task is handed to
    ``spawn``. It polls once per second while ``self.started`` is true and
    calls ``self.stop()`` once there are no active jobs and more than
    ``MAX_IDLE_TIME`` seconds have passed since ``self.last_finish_time``.

    Raises:
        Exception: if the current user (as returned by ``getuser()``) is
            'root'.
    """
    script_path = os.path.abspath(sys.argv[0])
    script_args = ' '.join(sys.argv[1:])
    title = '[dpark] ' + script_path + ' ' + script_args
    if len(title) > 256:
        # Keep the first 256 characters and mark the truncation.
        title = title[:256] + '...'

    info = mesos_pb2.FrameworkInfo()
    info.user = getuser()
    if info.user == 'root':
        raise Exception("dpark is not allowed to run as 'root'")
    info.name = title
    info.hostname = socket.gethostname()

    self.driver = mesos.MesosSchedulerDriver(self, info, self.master)
    self.driver.start()
    logger.debug("Mesos Scheudler driver started")

    self.started = True
    self.last_finish_time = time.time()

    def check():
        # Idle watchdog: keep sleeping while jobs are active or the idle
        # window has not elapsed; otherwise stop the scheduler and exit.
        while self.started:
            current = time.time()
            if self.activeJobs or current - self.last_finish_time <= MAX_IDLE_TIME:
                time.sleep(1)
                continue
            logger.info("stop mesos scheduler after %d seconds idle",
                        current - self.last_finish_time)
            self.stop()
            break

    spawn(check)
# Launcher fragment (log prefix "[drun]"). This span begins mid-script and is
# cut off inside the trailing `try:`, so the code line below is left untouched.
# Visible steps, in order:
#   1. Configure logging: level is ERROR if options.quiet, else DEBUG if
#      options.verbose, else WARNING.
#   2. Pick the scheduler: MPIScheduler(options, command) when options.mpi is
#      set (if options.retry > 0 an error is logged and options.retry is
#      reset to 0 first), otherwise SubmitScheduler(options, command).
#   3. Build mesos.MesosSchedulerDriver(sched, sched.framework, options.master)
#      and start it.
#   4. Install `handler` for SIGTERM, SIGHUP, SIGABRT and SIGQUIT; the handler
#      logs the signal number and calls sched.stop(3).
#   5. Import rfoo's rconsole and call rconsole.spawn_server(locals(), 0).
#      NOTE(review): the `except`/`finally` clause of this `try:` is not
#      visible here — confirm how a failed import is handled.
logging.basicConfig( format='[drun] %(threadName)s %(asctime)-15s %(message)s', level=options.quiet and logging.ERROR or options.verbose and logging.DEBUG or logging.WARNING) if options.mpi: if options.retry > 0: logger.error("MPI application can not retry") options.retry = 0 sched = MPIScheduler(options, command) else: sched = SubmitScheduler(options, command) logger.debug("Connecting to mesos master %s", options.master) driver = mesos.MesosSchedulerDriver(sched, sched.framework, options.master) driver.start() def handler(signm, frame): logger.warning("got signal %d, exit now", signm) sched.stop(3) signal.signal(signal.SIGTERM, handler) signal.signal(signal.SIGHUP, handler) signal.signal(signal.SIGABRT, handler) signal.signal(signal.SIGQUIT, handler) try: from rfoo.utils import rconsole rconsole.spawn_server(locals(), 0)
# main(): entry point that wires together and runs the SDP product controller.
# The source formatting is collapsed, so the code lines below are left untouched.
# Steps, in order:
#   1. parse_args() yields (parser, args); prepare_env(args) and the
#      katsdpservices logging/restart setup run; if args.log_level is given,
#      the root logger level is set to its upper-cased value.
#   2. Log the katcp host/port and the HTTP URL.
#   3. Create an aiokatcp.Client for args.master_controller (host, port) and
#      wrap it in product_controller.KatcpImageLookup.
#   4. Build the image resolver factory; a ValueError is reported through
#      parser.error(str(exc)).
#      NOTE(review): presumably parser.error() exits (argparse convention);
#      otherwise image_resolver_factory would be unbound below — confirm.
#   5. Fill an addict.Dict describing the framework: user, name
#      (args.subarray_product_id), checkpoint=True, principal, roles
#      [realtime_role, batch_role], capabilities MULTI_ROLE and
#      TASK_KILLING_STATE.
#   6. Create scheduler.Scheduler and register GET handlers for '/metrics'
#      and '/health' on sched.app.router.
#   7. Create pymesos.MesosSchedulerDriver (use_addict=True,
#      implicit_acknowledgements=False), attach it with sched.set_driver(),
#      and start it.
#   8. dashboard_url starts as args.dashboard_url; when it is None and
#      args.dashboard_port != 0 it is built with yarl.URL.build(scheme='http',
#      host=args.external_hostname, port=args.dashboard_port,
#      path='/gui/<subarray_product_id>/product/dashboard/').
#   9. Create product_controller.DeviceServer; s3_config falls back to {} when
#      args.s3_config is None. If args.dashboard_port != 0, call
#      init_dashboard(server, args, dashboard_path).
#  10. Under katsdpservices.start_aiomonitor(loop, args, locals()), run
#      run(sched, server) to completion on the event loop, then close the loop.
# NOTE(review): locals() is handed to start_aiomonitor, so the local variable
# names in this function are presumably visible through the monitor — confirm
# before renaming or adding locals.
def main() -> None: parser, args = parse_args() prepare_env(args) katsdpservices.setup_logging() katsdpservices.setup_restart() if args.log_level is not None: logging.root.setLevel(args.log_level.upper()) logger = logging.getLogger('katsdpcontroller') logger.info("Starting SDP product controller...") logger.info('katcp: %s:%d', args.host, args.port) logger.info('http: %s', args.http_url) master_controller = aiokatcp.Client(args.master_controller.host, args.master_controller.port) image_lookup = product_controller.KatcpImageLookup(master_controller) try: image_resolver_factory = make_image_resolver_factory( image_lookup, args) except ValueError as exc: parser.error(str(exc)) framework_info = addict.Dict() framework_info.user = args.user framework_info.name = args.subarray_product_id framework_info.checkpoint = True framework_info.principal = args.principal framework_info.roles = [args.realtime_role, args.batch_role] framework_info.capabilities = [{ 'type': 'MULTI_ROLE' }, { 'type': 'TASK_KILLING_STATE' }] loop = asyncio.get_event_loop() sched = scheduler.Scheduler( args.realtime_role, args.host, args.http_port, args.http_url, task_stats=product_controller.TaskStats(), runner_kwargs=dict(access_log_class=web_utils.AccessLogger)) sched.app.router.add_get('/metrics', web_utils.prometheus_handler) sched.app.router.add_get('/health', web_utils.health_handler) driver = pymesos.MesosSchedulerDriver(sched, framework_info, args.mesos_master, use_addict=True, implicit_acknowledgements=False) sched.set_driver(driver) driver.start() dashboard_path = f'/gui/{args.subarray_product_id}/product/dashboard/' dashboard_url: Optional[str] = args.dashboard_url if args.dashboard_port != 0 and dashboard_url is None: dashboard_url = str( yarl.URL.build(scheme='http', host=args.external_hostname, port=args.dashboard_port, path=dashboard_path)) server = product_controller.DeviceServer( args.host, args.port, master_controller, args.subarray_product_id, sched, batch_role=args.batch_role, 
interface_mode=False, localhost=args.localhost, image_resolver_factory=image_resolver_factory, s3_config=args.s3_config if args.s3_config is not None else {}, graph_dir=args.write_graphs, dashboard_url=dashboard_url) if args.dashboard_port != 0: init_dashboard(server, args, dashboard_path) with katsdpservices.start_aiomonitor(loop, args, locals()): loop.run_until_complete(run(sched, server)) loop.close()