def run(self):
    """
    if role is not defined in the configuration (which it shouldn't)
    infer it from the name of the machine
    """
    #read configuration file
    state = StateInfo()
    resInfo = ResInfo(state)
    setFromConf(self.instance,resInfo)

    logger.info(" ")
    logger.info(" ")
    logger.info("[[[[ ---- hltd start : instance " + self.instance + " ---- ]]]]")
    logger.info(" ")

    if conf.enabled==False:
        logger.warning("Service is currently disabled.")
        sys.exit(1)

    nsslock = threading.Lock()
    resource_lock = threading.Lock()

    mm = MountManager(conf)

    if conf.role == 'fu':
        """
        cleanup resources
        """
        res_in_cloud = len(os.listdir(resInfo.cloud))>0
        while True:
            #switch to cloud mode if cloud files are found (e.g. machine rebooted while in cloud)
            if res_in_cloud:
                logger.warning('found cores in cloud. this session will start in the cloud mode')
                try:
                    resInfo.move_resources_to_cloud()
                except Exception as ex:
                    logger.warning(str(ex))
                state.cloud_mode=True
                #TODO:what if cloud mode switch fails?
                cloud_st = state.cloud_status()
                if not cloud_st: #cloud off, switch on
                    result = state.ignite_cloud()
                    break
                elif cloud_st==1: #cloud is already on
                    break
                elif cloud_st>1: #error, try to switch off cloud and switch HLT mode
                    time.sleep(1)
                    logger.warning("cloud status returned error. going to try to retry check")
                    cloud_st = state.cloud_status()
                    if not cloud_st: #cloud off, switch on
                        result = state.ignite_cloud()
                        break
                    elif cloud_st==1: #cloud is already on
                        break
                    elif cloud_st>1: #error, try to switch off cloud and switch HLT mode
                        logger.warning("cloud status returned error. going to try to stop cloud")
                        stop_st = state.extinguish_cloud(repeat=True)
                        #trusting the extinguish function return code
                        if not stop_st:
                            logger.error("failed deactivating cloud")
                            #script error, leaving cores in cloud mode
                            break
                        state.cloud_mode=False

            if resInfo.cleanup_resources()==True: break
            time.sleep(0.1)
            logger.warning("retrying cleanup_resources")

        resInfo.calculate_threadnumber()

    #ensure that working directory is ready
    try:os.makedirs(conf.watch_directory)
    except:pass

    #run class init
    runList = RunList()

    #start monitor thread to get fu-box-status docs inserted early in case of mount problems
    boxInfo = BoxInfo()
    num_cpus_initial=-1

    if conf.role == 'fu':
        """
        recheck mount points
        """
        #switch to cloud mode if active, but hltd did not have cores in cloud directory in the last session
        if not res_in_cloud and state.cloud_script_available():
            cl_status = state.cloud_status()
            cnt = 5
            while not (cl_status == 1 or cl_status == 0 or cl_status==66) and cnt>0:
                time.sleep(1)
                cnt-=1
                cl_status = state.cloud_status()
            if cl_status>0:
                if cl_status==66:
                    logger.warning('cloud status code 66 (no NOVA stack). Will run in HLT mode')
                else:
                    if cl_status > 1:
                        logger.error('cloud status script returns error exit code (status:'+str(cl_status)+') after 5 attempts. Trying to deactivate cloud')
                        stop_st = state.extinguish_cloud(repeat=True)
                        if stop_st==True:
                            #cloud was stopped, can continue in HLT mode
                            pass
                        else:
                            logger.error('cloud deactivate failed. HLT mode will be disabled')
                            resInfo.move_resources_to_cloud()
                            state.cloud_mode=True
                    else:
                        logger.warning("cloud services are running on this host at hltd startup, switching to cloud mode")
                        resInfo.move_resources_to_cloud()
                        state.cloud_mode=True

        if conf.watch_directory.startswith('/fff/'):
            p = subprocess.Popen("rm -rf " + conf.watch_directory+'/*',shell=True)
            p.wait()

        if not mm.cleanup_mountpoints(nsslock):
            logger.fatal("error mounting - terminating service")
            os._exit(10)

        #recursively remove any stale run data and other commands in the FU watch directory
        #if conf.watch_directory.strip()!='/':
        #    p = subprocess.Popen("rm -rf " + conf.watch_directory.strip()+'/{run*,end*,quarantined*,exclude,include,suspend*,populationcontrol,herod,logrestart,emu*}',shell=True)
        #    p.wait()

        #count core files
        if conf.dynamic_resources:
            num_cpus_initial = resInfo.count_resources()

    #start monitor after all state checks/migration have finished
    sm = SystemMonitor.system_monitor(conf,state,resInfo,runList,mm,boxInfo,num_cpus_initial)

    #startup es log collector
    logCollector = None
    if conf.use_elasticsearch == True:
        time.sleep(.2)
        try:
            restartLogCollector(conf,logger,logCollector,self.instance)
        except:
            logger.error("can not spawn log collector. terminating..")
            os._exit(1)

    #BU mode threads
    if conf.role == 'bu':
        #update_success,machine_blacklist=updateBlacklist()
        boxInfo.machine_blacklist=[]
        mm.ramdisk_submount_size=0
        if self.instance == 'main':
            #if there are other instance mountpoints in ramdisk, they will be subtracted from size estimate
            mm.submount_size(conf.watch_directory)

        #start boxinfo elasticsearch updater
        try:os.makedirs(os.path.join(conf.resource_base,'dn'))
        except:pass
        try:os.makedirs(os.path.join(conf.resource_base,'boxes'))
        except:pass
        if conf.use_elasticsearch == True:
            boxInfo.updater = BoxInfoUpdater(conf,nsslock,boxInfo.boxdoc_version)
            boxInfo.updater.start()

    rr = ResourceRanger(conf,state,resInfo,runList,mm,boxInfo,sm,resource_lock)
    #init resource ranger
    try:
        if conf.role == 'bu':
            imask = inotify.IN_CLOSE_WRITE | inotify.IN_DELETE | inotify.IN_CREATE | inotify.IN_MOVED_TO
            rr.register_inotify_path(conf.resource_base, imask)
            rr.register_inotify_path(os.path.join(conf.resource_base,'boxes'), imask)
        else:
            imask = inotify.IN_MOVED_TO
            rr.register_inotify_path(os.path.join(conf.resource_base,'idle'), imask)
            rr.register_inotify_path(os.path.join(conf.resource_base,'cloud'), imask)
            rr.register_inotify_path(os.path.join(conf.resource_base,'except'), imask)
            rr.register_inotify_path(os.path.join(conf.resource_base,'quarantined'), imask)
        rr.start_inotify()
        logger.info("started ResourceRanger - watch_directory "+conf.resource_base)
    except Exception as ex:
        logger.error("Exception caught in starting ResourceRanger notifier")
        logger.error(ex)
        os._exit(1)

    #start monitoring new runs
    runRanger = RunRanger(self.instance,conf,state,resInfo,runList,rr,mm,logCollector,nsslock,resource_lock)
    runRanger.register_inotify_path(conf.watch_directory,inotify.IN_CREATE)
    runRanger.start_inotify()
    logger.info("started RunRanger - watch_directory " + conf.watch_directory)

    #resource notification enabled with inotify set up
    sm.allowResourceNotification()

    try:
        cgitb.enable(display=0, logdir="/tmp")
        handler = CGIHTTPServer.CGIHTTPRequestHandler
        #handler = WebCtrl(self) #to be tested later

        # the following allows the base directory of the http
        # server to be conf.watch_directory, which is writeable
        # to everybody
        if os.path.exists(conf.watch_directory+'/cgi-bin'):
            os.remove(conf.watch_directory+'/cgi-bin')
        os.symlink('/opt/hltd/cgi',conf.watch_directory+'/cgi-bin')
        handler.cgi_directories = ['/cgi-bin']

        logger.info("starting http server on port "+str(conf.cgi_port))
        httpd = BaseHTTPServer.HTTPServer(("", conf.cgi_port), handler)
        logger.info("hltd serving at port "+str(conf.cgi_port)+" with role "+conf.role)
        os.chdir(conf.watch_directory)
        logger.info("[[[[ ---- hltd instance " + self.instance + ": init complete, starting httpd ---- ]]]]")
        logger.info("")
        httpd.serve_forever()

    except KeyboardInterrupt:
        logger.info("stop signal detected")

        runList.clearOngoingRunFlags()
        aRuns = runList.getActiveRuns()
        if len(aRuns)>0:
            logger.info("terminating all ongoing runs")
            for run in aRuns:
                if conf.role=='fu':
                    run.Shutdown(True,True)
                elif conf.role=='bu':
                    run.ShutdownBU()
            logger.info("terminated all ongoing runs")

        runRanger.stop_inotify()
        rr.stop_inotify()

        if boxInfo.updater is not None:
            logger.info("stopping boxinfo updater")
            boxInfo.updater.stop()

        if logCollector is not None:
            logger.info("terminating logCollector")
            logCollector.terminate()

        logger.info("stopping system monitor")
        rr.stop_managed_monitor()
        logger.info("closing httpd socket")
        httpd.socket.close()
        logger.info(threading.enumerate())

        logger.info("unmounting mount points")
        if not mm.cleanup_mountpoints(nsslock,remount=False):
            time.sleep(1)
            mm.cleanup_mountpoints(nsslock,remount=False)

        logger.info("shutdown of service (main thread) completed")

    except Exception as ex:
        logger.info("exception encountered in operating hltd")
        logger.info(ex)
        runRanger.stop_inotify()
        rr.stop_inotify()
        rr.stop_managed_monitor()
        raise
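
# Illustration only, not part of hltd: the run() docstring says that a role missing
# from the configuration is inferred from the machine name. A minimal sketch of that
# idea is shown below, assuming the convention that FU hosts are named 'fu-...' and
# BU hosts 'bu-...'; infer_role_from_hostname is a hypothetical helper, not the
# actual setFromConf/configuration lookup.
def infer_role_from_hostname(hostname=None):
    #fall back to the local host name when none is given
    host = hostname if hostname is not None else os.uname()[1]
    if host.startswith('fu-'):
        return 'fu'
    if host.startswith('bu-'):
        return 'bu'
    #unknown naming scheme: leave the role undecided and rely on the configuration
    return None

#example: infer_role_from_hostname('fu-c2e34-12-01') would return 'fu'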