示例#1
0
文件: hltd.py 项目: cmsdaq/hltd
    def run(self):
        """
        Start the hltd service for this instance and serve until it is stopped.

        Steps, in the order performed below:
          1. read the configuration (setFromConf); exit with status 1 if the
             service is disabled
          2. 'fu' role: if the cloud resource directory is not empty, move
             resources to the cloud and check/start the cloud, otherwise clean
             up resources; then calculate thread numbers and create the watch
             directory
          3. 'fu' role: re-check the cloud status, clear the watch directory
             (only when it is under /fff/) and clean up mount points
          4. create the system monitor and, with elasticsearch enabled, start
             the log collector
          5. 'bu' role: create the 'dn' and 'boxes' directories and start the
             box info updater
          6. start the ResourceRanger and RunRanger inotify watches
          7. serve CGI requests over HTTP from the watch directory

        A KeyboardInterrupt while serving triggers an orderly shutdown: ongoing
        runs are terminated, inotify watches and helper objects are stopped and
        mount points are unmounted. Any other exception stops the inotify
        watches and the managed monitor and is re-raised.

        NOTE(review): inferring the role from the machine name (when it is not
        configured) is not done in this method; presumably setFromConf takes
        care of it - confirm.
        """

        #read configuration file
        state = StateInfo()
        resInfo = ResInfo(state)
        setFromConf(self.instance,resInfo)

        logger.info(" ")
        logger.info(" ")
        logger.info("[[[[ ---- hltd start : instance " + self.instance + " ---- ]]]]")
        logger.info(" ")

        if conf.enabled==False:
            logger.warning("Service is currently disabled.")
            sys.exit(1)

        nsslock = threading.Lock()
        resource_lock = threading.Lock()
        mm = MountManager(conf)

        if conf.role == 'fu':
            #cleanup resources
            res_in_cloud = len(os.listdir(resInfo.cloud))>0
            while True:
                #switch to cloud mode if cloud files are found (e.g. machine rebooted while in cloud)
                if res_in_cloud:
                    logger.warning('found cores in cloud. this session will start in the cloud mode')
                    try:
                        resInfo.move_resources_to_cloud()
                    except Exception as ex:
                        logger.warning(str(ex))

                    state.cloud_mode=True
                    #TODO:what if cloud mode switch fails?
                    cloud_st = state.cloud_status()
                    if not cloud_st:#cloud off,switch on
                        state.ignite_cloud()
                        break
                    elif cloud_st==1:#cloud is already on
                        break
                    elif cloud_st>1:#error,check the status once more before giving up on cloud
                        time.sleep(1)
                        logger.warning("cloud status returned error. going to try to retry check")
                        cloud_st = state.cloud_status()

                        if not cloud_st:#cloud off,switch on
                            state.ignite_cloud()
                            break
                        elif cloud_st==1:#cloud is already on
                            break
                        elif cloud_st>1:#error,try to switch off cloud and switch HLT mode
                            logger.warning("cloud status returned error. going to try to stop cloud")
                            stop_st = state.extinguish_cloud(repeat=True)
                            #trusting the extinguish function return code
                            if not stop_st:
                                logger.error("failed deactivating cloud")
                                #script error, leaving cores in cloud mode
                                break
                #reached when no cores were in cloud, or the cloud was stopped above:
                #run in HLT mode and retry the cleanup until it succeeds
                state.cloud_mode=False
                if resInfo.cleanup_resources()==True:break
                time.sleep(0.1)
                logger.warning("retrying cleanup_resources")

            resInfo.calculate_threadnumber()

            #ensure that working directory is ready
            try:
                os.makedirs(conf.watch_directory)
            except OSError:
                #best effort: typically the directory already exists
                pass

        #run class init
        runList = RunList()

        #start monitor thread to get fu-box-status docs inserted early in case of mount problems
        boxInfo = BoxInfo()
        num_cpus_initial=-1

        if conf.role == 'fu':
            #recheck mount points

            #switch to cloud mode if active, but hltd did not have cores in cloud directory in the last session
            if not res_in_cloud and state.cloud_script_available():
                cl_status = state.cloud_status()
                #poll again (up to 5 more times, 1 s apart) until the status is one of 0, 1 or 66
                cnt = 5
                while not (cl_status == 1 or cl_status == 0 or cl_status==66) and cnt>0:
                    time.sleep(1)
                    cnt-=1
                    cl_status = state.cloud_status()
                if cl_status >0:
                    if cl_status==66:
                        logger.warning('cloud status code 66 (no NOVA stack). Will run in HLT mode')
                    elif cl_status > 1:
                        logger.error('cloud status script returns error exit code (status:'+str(cl_status)+') after 5 attempts. Trying to deactivate cloud')
                        stop_st = state.extinguish_cloud(repeat=True)
                        if stop_st==True:
                            #cloud was stopped, can continue in HLT mode
                            pass
                        else:
                            logger.error('cloud deactivate failed. HLT mode will be disabled')
                            resInfo.move_resources_to_cloud()
                            state.cloud_mode=True
                    else:
                        #status 1
                        logger.warning("cloud services are running on this host at hltd startup, switching to cloud mode")
                        resInfo.move_resources_to_cloud()
                        state.cloud_mode=True

            #wipe the content of the watch directory, but only if it is located under /fff/
            if conf.watch_directory.startswith('/fff/'):
                p = subprocess.Popen("rm -rf " + conf.watch_directory+'/*',shell=True)
                p.wait()

            if not mm.cleanup_mountpoints(nsslock):
                logger.fatal("error mounting - terminating service")
                os._exit(10)

            #recursively remove any stale run data and other commands in the FU watch directory
            #if conf.watch_directory.strip()!='/':
            #    p = subprocess.Popen("rm -rf " + conf.watch_directory.strip()+'/{run*,end*,quarantined*,exclude,include,suspend*,populationcontrol,herod,logrestart,emu*}',shell=True)
            #    p.wait()

            #count core files
            if conf.dynamic_resources: num_cpus_initial = resInfo.count_resources()

        #start monitor after all state checks/migration have finished
        sm = SystemMonitor.system_monitor(conf,state,resInfo,runList,mm,boxInfo,num_cpus_initial)

        #startup es log collector
        #NOTE(review): logCollector is never rebound in this method, so it is still None
        #when handed to RunRanger and when checked at shutdown - confirm whether the
        #return value of restartLogCollector should be stored here
        logCollector = None
        if conf.use_elasticsearch == True:
            time.sleep(.2)
            try:
                restartLogCollector(conf,logger,logCollector,self.instance)
            except Exception as ex:
                logger.error("can not spawn log collector. terminating..")
                logger.error(ex)
                os._exit(1)

        #BU mode threads
        if conf.role == 'bu':
            #update_success,machine_blacklist=updateBlacklist()
            boxInfo.machine_blacklist=[]
            mm.ramdisk_submount_size=0
            if self.instance == 'main':
                #if there are other instance mountpoints in ramdisk, they will be subtracted from size estimate
                mm.submount_size(conf.watch_directory)

            #start boxinfo elasticsearch updater
            for subdir in ('dn','boxes'):
                try:
                    os.makedirs(os.path.join(conf.resource_base,subdir))
                except OSError:
                    #best effort: typically the directory already exists
                    pass
            if conf.use_elasticsearch == True:
                boxInfo.updater = BoxInfoUpdater(conf,nsslock,boxInfo.boxdoc_version)
                boxInfo.updater.start()

        rr = ResourceRanger(conf,state,resInfo,runList,mm,boxInfo,sm,resource_lock)

        #init resource ranger
        try:
            if conf.role == 'bu':
                imask  = inotify.IN_CLOSE_WRITE | inotify.IN_DELETE | inotify.IN_CREATE | inotify.IN_MOVED_TO
                rr.register_inotify_path(conf.resource_base, imask)
                rr.register_inotify_path(os.path.join(conf.resource_base,'boxes'), imask)
            else:
                imask  = inotify.IN_MOVED_TO
                rr.register_inotify_path(os.path.join(conf.resource_base,'idle'), imask)
                rr.register_inotify_path(os.path.join(conf.resource_base,'cloud'), imask)
                rr.register_inotify_path(os.path.join(conf.resource_base,'except'), imask)
                rr.register_inotify_path(os.path.join(conf.resource_base,'quarantined'), imask)
            rr.start_inotify()
            logger.info("started ResourceRanger - watch_directory "+conf.resource_base)
        except Exception as ex:
            logger.error("Exception caught in starting ResourceRanger notifier")
            logger.error(ex)
            os._exit(1)


        #start monitoring new runs
        runRanger = RunRanger(self.instance,conf,state,resInfo,runList,rr,mm,logCollector,nsslock,resource_lock)
        runRanger.register_inotify_path(conf.watch_directory,inotify.IN_CREATE)
        runRanger.start_inotify()
        logger.info("started RunRanger  - watch_directory " + conf.watch_directory)

        #resource notification enabled with inotify set up
        sm.allowResourceNotification()

        #bound before the try block: a stop signal can arrive before the server object
        #exists and the shutdown path below must still run through to the unmount
        httpd = None
        try:
            cgitb.enable(display=0, logdir="/tmp")
            handler = CGIHTTPServer.CGIHTTPRequestHandler
            #handler = WebCtrl(self) #to be tested later
            # the following allows the base directory of the http
            # server to be 'conf.watch_directory, which is writeable
            # to everybody
            cgi_link = conf.watch_directory+'/cgi-bin'
            #lexists (not exists) so that a dangling link is also replaced;
            #otherwise os.symlink would fail because the name is already taken
            if os.path.lexists(cgi_link):
                os.remove(cgi_link)
            os.symlink('/opt/hltd/cgi',cgi_link)

            handler.cgi_directories = ['/cgi-bin']
            logger.info("starting http server on port "+str(conf.cgi_port))
            httpd = BaseHTTPServer.HTTPServer(("", conf.cgi_port), handler)

            logger.info("hltd serving at port "+str(conf.cgi_port)+" with role "+conf.role)
            os.chdir(conf.watch_directory)
            logger.info("[[[[ ---- hltd instance " + self.instance + ": init complete, starting httpd ---- ]]]]")
            logger.info("")
            httpd.serve_forever()
        except KeyboardInterrupt:
            logger.info("stop signal detected")
            runList.clearOngoingRunFlags()
            aRuns =  runList.getActiveRuns()
            if len(aRuns)>0:
                logger.info("terminating all ongoing runs")
                for run in aRuns:
                    if conf.role=='fu':
                        run.Shutdown(True,True)
                    elif conf.role=='bu':
                        run.ShutdownBU()
                logger.info("terminated all ongoing runs")
            runRanger.stop_inotify()
            rr.stop_inotify()
            if boxInfo.updater is not None:
                logger.info("stopping boxinfo updater")
                boxInfo.updater.stop()
            if logCollector is not None:
                logger.info("terminating logCollector")
                logCollector.terminate()
            logger.info("stopping system monitor")
            rr.stop_managed_monitor()
            if httpd is not None:
                logger.info("closing httpd socket")
                httpd.socket.close()
            logger.info(threading.enumerate())
            logger.info("unmounting mount points")
            #one retry after a short pause if the first unmount attempt fails
            if not mm.cleanup_mountpoints(nsslock,remount=False):
                time.sleep(1)
                mm.cleanup_mountpoints(nsslock,remount=False)

            logger.info("shutdown of service (main thread) completed")
        except Exception as ex:
            logger.info("exception encountered in operating hltd")
            logger.info(ex)
            runRanger.stop_inotify()
            rr.stop_inotify()
            rr.stop_managed_monitor()
            raise