class MonitorRanger:
    """Thin bridge between the inotify wrapper and an event queue.

    Inotify callbacks arrive via process_default() and are forwarded to
    the queue installed with setEventQueue().
    """

    def __init__(self, recursiveMode=False):
        self.logger = logging.getLogger(self.__class__.__name__)
        # queue is attached later via setEventQueue(); False means "not set"
        self.eventQueue = False
        self.inotifyWrapper = InotifyWrapper(self, recursiveMode)

    def register_inotify_path(self, path, mask):
        """Register a directory and event mask with the inotify wrapper."""
        self.inotifyWrapper.registerPath(path, mask)

    def start_inotify(self):
        """Start the inotify watcher thread."""
        self.inotifyWrapper.start()

    def stop_inotify(self):
        """Stop the inotify wrapper and block until its thread returns."""
        # fix: use the instance logger for consistency with the rest of the
        # class (was logging.info on the root logger)
        self.logger.info("MonitorRanger: Stop inotify wrapper")
        self.inotifyWrapper.stop()
        self.logger.info("MonitorRanger: Join inotify wrapper")
        self.inotifyWrapper.join()
        self.logger.info("MonitorRanger: Inotify wrapper returned")

    def process_default(self, event):
        """inotify callback: forward the event to the queue, if one is set."""
        self.logger.debug("event: %s on: %s" % (str(event.mask), event.fullpath))
        if self.eventQueue:
            self.eventQueue.put(event)

    def setEventQueue(self, queue):
        """Attach the queue that will receive inotify events."""
        self.eventQueue = queue
def __init__(self, recursiveMode=False):
    """Set up logging, the inotify plumbing and the lumisection
    bookkeeping used for the queue status file."""
    self.logger = logging.getLogger(self.__class__.__name__)
    self.eventQueue = False
    self.inotifyWrapper = InotifyWrapper(self, recursiveMode)
    # Queue-status file locations; filled in later via setQueueStatusPath().
    self.queueStatusPath = None
    self.queueStatusPathMon = None
    self.queueStatusPathDir = None
    # Lumisection bookkeeping: -1 marks "nothing seen yet".
    self.queuedLumiList = []
    self.maxQueuedLumi = -1  # max seen/closed by anelastic thread
    for counter in ('maxReceivedEoLS', 'maxClosedLumi', 'numOpenLumis'):
        setattr(self, counter, -1)
    self.lock = threading.Lock()
def __init__(self, recursiveMode=False):
    """Set up logging, inotify plumbing, lumisection bookkeeping and the
    bandwidth accounting fields used by the stats collector thread."""
    self.logger = logging.getLogger(self.__class__.__name__)
    self.eventQueue = False
    self.inotifyWrapper = InotifyWrapper(self, recursiveMode)
    # Queue-status file locations; filled in later via setQueueStatusPath().
    self.queueStatusPath = None
    self.queueStatusPathMon = None
    self.queueStatusPathDir = None
    # Lumisection bookkeeping: -1 marks "nothing seen yet".
    self.queuedLumiList = []
    self.maxQueuedLumi = -1  # max seen/closed by anelastic thread
    for counter in ('maxReceivedEoLS', 'maxClosedLumi', 'numOpenLumis',
                    'maxCMSSWLumi', 'maxLSWithOutput'):
        setattr(self, counter, -1)
    self.lock = threading.Lock()
    # Bandwidth accounting for the stats collector.
    self.output_bw = 0
    self.lumi_bw = 0
    self.data_size_ls_num = 0
    self.data_size_val = 0
    self.data_size_last_update = 0
    self.statsCollectorThread = None
def __init__(self, recursiveMode=False):
    """Set up logging, the inotify plumbing and the lumisection
    bookkeeping (including the max LS seen by CMSSW)."""
    self.logger = logging.getLogger(self.__class__.__name__)
    self.eventQueue = False
    self.inotifyWrapper = InotifyWrapper(self, recursiveMode)
    # Queue-status file locations; filled in later via setQueueStatusPath().
    self.queueStatusPath = None
    self.queueStatusPathMon = None
    self.queueStatusPathDir = None
    # Lumisection bookkeeping: -1 marks "nothing seen yet".
    self.queuedLumiList = []
    self.maxQueuedLumi = -1  # max seen/closed by anelastic thread
    for counter in ('maxReceivedEoLS', 'maxClosedLumi',
                    'numOpenLumis', 'maxCMSSWLumi'):
        setattr(self, counter, -1)
    self.lock = threading.Lock()
def __init__(self, confClass, stateInfo, resInfo, runList, mountMgr,
             boxInfo, monitor, resource_lock):
    """Wire the ranger to the shared daemon state and start the managed
    monitor; also publishes the configuration object module-wide."""
    self.inotifyWrapper = InotifyWrapper(self)
    self.logger = logging.getLogger(self.__class__.__name__)
    # Handles to shared state objects.
    self.state = stateInfo
    self.resInfo = resInfo
    self.runList = runList
    # Bring up the managed monitor right away.
    self.managed_monitor = monitor
    self.managed_monitor.preStart()
    self.managed_monitor.start()
    self.regpath = []  # inotify-registered directories
    self.mm = mountMgr
    self.boxInfo = boxInfo
    self.resource_lock = resource_lock
    self.hostname = os.uname()[1]
    # Make the configuration visible to the whole module.
    global conf
    conf = confClass
def __init__(self, recursiveMode=False):
    """Create the ranger: a class-named logger, an unset event queue and
    the inotify wrapper that will call back into this object."""
    cls_name = self.__class__.__name__
    self.logger = logging.getLogger(cls_name)
    self.eventQueue = False  # replaced by a real queue via setEventQueue()
    self.inotifyWrapper = InotifyWrapper(self, recursiveMode)
class MonitorRanger:
    """Forwards inotify events to a queue while tracking lumisection
    progress (EoLS/BoLS files) and publishing a JSON queue status file."""

    def __init__(self, recursiveMode=False):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.eventQueue = False
        self.inotifyWrapper = InotifyWrapper(self, recursiveMode)
        # queue status file locations (set via setQueueStatusPath)
        self.queueStatusPath = None
        self.queueStatusPathMon = None
        self.queueStatusPathDir = None
        # lumisection bookkeeping; -1 means "nothing seen yet"
        self.queuedLumiList = []
        self.maxQueuedLumi = -1  # max seen/closed by anelastic thread
        self.maxReceivedEoLS = -1
        self.maxClosedLumi = -1
        self.numOpenLumis = -1
        self.maxCMSSWLumi = -1
        self.lock = threading.Lock()

    def register_inotify_path(self, path, mask):
        """Register a directory and event mask with the inotify wrapper."""
        self.inotifyWrapper.registerPath(path, mask)

    def start_inotify(self):
        """Start the inotify watcher thread."""
        self.inotifyWrapper.start()

    def stop_inotifyTimeout(self, timeout):
        """Stop the wrapper and wait up to `timeout` seconds for its thread.
        Returns True on a clean join, False on timeout."""
        self.logger.info("MonitorRanger: Stop inotify wrapper")
        self.inotifyWrapper.stop()
        self.logger.info("MonitorRanger: Join inotify wrapper")
        self.inotifyWrapper.join(timeout)
        if self.inotifyWrapper.isAlive():
            self.logger.info("MonitorRanger: Inotify wrapper join timeout ("+str(timeout)+")")
            return False
        else:
            self.logger.info("MonitorRanger: Inotify wrapper returned")
            return True

    def stop_inotify(self):
        """Stop the inotify wrapper and block until its thread returns."""
        self.logger.info("MonitorRanger: Stop inotify wrapper")
        self.inotifyWrapper.stop()
        self.logger.info("MonitorRanger: Join inotify wrapper")
        self.inotifyWrapper.join()
        self.logger.info("MonitorRanger: Inotify wrapper returned")

    def process_default(self, event):
        """inotify callback: forward the event to the queue; when queue
        status tracking is enabled, filter it through checkNewLumi first."""
        self.logger.debug("event: %s on: %s" % (str(event.mask), event.fullpath))
        if self.eventQueue:
            if self.queueStatusPath != None:
                if self.checkNewLumi(event):
                    self.eventQueue.put(event)
            else:
                self.eventQueue.put(event)

    def setEventQueue(self, queue):
        """Attach the queue that will receive inotify events."""
        self.eventQueue = queue

    def checkNewLumi(self, event):
        """Inspect EoLS/BoLS file events and update lumisection bookkeeping.

        Returns True if the event should be forwarded to the queue, False
        for duplicates (EoLS seen twice) and for BoLS files.
        """
        if event.fullpath.endswith("_EoLS.jsn"):
            try:
                # filename shape: <run>_ls<NNNN>_EoLS.jsn -> lumi number
                queuedLumi = int(os.path.basename(event.fullpath).split('_')[1][2:])
                self.lock.acquire()
                if queuedLumi not in self.queuedLumiList:
                    if queuedLumi > self.maxQueuedLumi:
                        self.maxQueuedLumi = queuedLumi
                    self.queuedLumiList.append(queuedLumi)
                    self.lock.release()
                    self.updateQueueStatusFile()
                else:
                    self.lock.release()
                    # skip if EoL for LS in queue has already been written once
                    # (e.g. double file create race)
                    return False
            except Exception as ex:
                # fix: was a bare "except:" whose handler referenced the
                # undefined name 'ex' and would itself raise NameError
                self.logger.warning("Problem checking new EoLS filename: "+str(os.path.basename(event.fullpath)) + " error:"+str(ex))
                try: self.lock.release()
                except: pass
            # delete associated BoLS file (best effort)
            try:
                os.unlink(event.fullpath[:event.fullpath.rfind("_EoLS.jsn")]+"_BoLS.jsn")
            except:
                pass
        elif event.fullpath.endswith("_BoLS.jsn"):
            try:
                queuedLumi = int(os.path.basename(event.fullpath).split('_')[1][2:])
                if queuedLumi > self.maxCMSSWLumi:
                    self.maxCMSSWLumi = queuedLumi
                    self.updateQueueStatusFile()
            except:
                pass
            # BoLS files are not passed to the queue
            return False
        return True

    def notifyLumi(self, ls, maxReceivedEoLS, maxClosedLumi, numOpenLumis):
        """Called by the consumer thread: dequeue `ls` (if tracked) and
        refresh the counters, then rewrite the status file."""
        if self.queueStatusPath == None: return
        self.lock.acquire()
        if ls != None and ls in self.queuedLumiList:
            self.queuedLumiList.remove(ls)
        self.maxReceivedEoLS = maxReceivedEoLS
        self.maxClosedLumi = maxClosedLumi
        self.numOpenLumis = numOpenLumis
        self.lock.release()
        self.updateQueueStatusFile()

    def setQueueStatusPath(self, path, monpath):
        """Enable queue status tracking; `path` is the status file, `monpath`
        a copy for the monitoring directory."""
        self.queueStatusPath = path
        self.queueStatusPathMon = monpath
        self.queueStatusPathDir = path[:path.rfind('/')]

    def updateQueueStatusFile(self):
        """Write the queue status JSON (temp file + atomic rename), retrying
        up to 3 times, then best-effort copy it to the monitoring path."""
        if self.queueStatusPath == None: return
        num_queued_lumis = len(self.queuedLumiList)
        if not os.path.exists(self.queueStatusPathDir):
            self.logger.error("No directory to write queueStatusFile: "+str(self.queueStatusPathDir))
        else:
            self.logger.info("Update status file - queued lumis:"+str(num_queued_lumis)+ " EoLS:: max queued:"+str(self.maxQueuedLumi) \
                             +" un-queued:"+str(self.maxReceivedEoLS)+" Lumis:: last closed:"+str(self.maxClosedLumi) \
                             + " num open:"+str(self.numOpenLumis) + " max LS in cmssw:"+str(self.maxCMSSWLumi))
        # write json
        doc = {"numQueuedLS": num_queued_lumis,
               "maxQueuedLS": self.maxQueuedLumi,
               "numReadFromQueueLS:": self.maxReceivedEoLS,
               "maxClosedLS": self.maxClosedLumi,
               "numReadOpenLS": self.numOpenLumis,
               "CMSSWMaxLS": self.maxCMSSWLumi}
        try:
            if self.queueStatusPath != None:
                attempts = 3
                while attempts > 0:
                    try:
                        with open(self.queueStatusPath+TEMPEXT, "w") as fp:
                            #fcntl.flock(fp, fcntl.LOCK_EX)
                            json.dump(doc, fp)
                        # atomic replace of the published file
                        os.rename(self.queueStatusPath+TEMPEXT, self.queueStatusPath)
                        break
                    except Exception as ex:
                        attempts -= 1
                        if attempts == 0: raise ex
                        self.logger.warning("Unable to write status file, with error:" + str(ex)+".retrying...")
                        time.sleep(0.05)
                # best-effort copy for monitoring
                try:
                    shutil.copyfile(self.queueStatusPath, self.queueStatusPathMon)
                except:
                    pass
        except Exception as ex:
            self.logger.error("Unable to open/write " + self.queueStatusPath)
            self.logger.exception(ex)
class MonitorRanger:
    """Forwards inotify events to a queue while tracking lumisection
    progress (EoLS/BoLS files), periodically publishing a JSON queue
    status file together with output bandwidth figures."""

    def __init__(self, recursiveMode=False):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.eventQueue = False
        self.inotifyWrapper = InotifyWrapper(self, recursiveMode)
        # queue status file locations (set via setQueueStatusPath)
        self.queueStatusPath = None
        self.queueStatusPathMon = None
        self.queueStatusPathDir = None
        # lumisection bookkeeping; -1 means "nothing seen yet"
        self.queuedLumiList = []
        self.maxQueuedLumi = -1  # max seen/closed by anelastic thread
        self.maxReceivedEoLS = -1
        self.maxClosedLumi = -1
        self.numOpenLumis = -1
        self.maxCMSSWLumi = -1
        self.maxLSWithOutput = -1
        self.lock = threading.Lock()
        # bandwidth accounting maintained by the stats collector thread
        self.output_bw = 0
        self.lumi_bw = 0
        self.data_size_ls_num = 0
        self.data_size_val = 0
        self.data_size_last_update = 0
        self.statsCollectorThread = None

    def startStatsCollector(self):
        """Spawn the daemon thread that periodically refreshes BW stats."""
        self.statsCollectorThread = threading.Thread(target=self.statsCollector)
        # daemon thread: does not block process termination
        self.statsCollectorThread.daemon = True
        self.statsCollectorThread.start()

    def statsCollector(self):
        """Loop forever: derive output bandwidth from the module-level
        bw_cnt byte counter and refresh the queue status file."""
        global bw_cnt
        bw_cnt_time = None
        while True:
            new_time = time.time()
            if bw_cnt_time is not None:
                d_t = new_time - bw_cnt_time
                if d_t != 0:
                    self.output_bw = bw_cnt / d_t
            bw_cnt = 0
            bw_cnt_time = new_time
            # refresh last completed lumi BW; 23.31 appears to be the
            # lumisection length in seconds. Zero it out if no size update
            # came in during the last minute.
            if self.data_size_ls_num > 0:
                if new_time - self.data_size_last_update < 60:
                    self.lumi_bw = self.data_size_val / 23.31
                else:
                    self.lumi_bw = 0.
            if self.queueStatusPathDir and not os.path.exists(self.queueStatusPathDir):
                self.logger.info('no queue status dir yet.')
            else:
                self.updateQueueStatusFile(".statsCollector")
            time.sleep(23.4)

    def register_inotify_path(self, path, mask):
        """Register a directory and event mask with the inotify wrapper."""
        self.inotifyWrapper.registerPath(path, mask)

    def start_inotify(self):
        """Start the inotify watcher thread."""
        self.inotifyWrapper.start()

    def stop_inotifyTimeout(self, timeout):
        """Stop the wrapper and wait up to `timeout` seconds for its thread.
        Returns True on a clean join, False on timeout."""
        self.logger.info("MonitorRanger: Stop inotify wrapper")
        self.inotifyWrapper.stop()
        self.logger.info("MonitorRanger: Join inotify wrapper")
        self.inotifyWrapper.join(timeout)
        if self.inotifyWrapper.isAlive():
            self.logger.info("MonitorRanger: Inotify wrapper join timeout ("+str(timeout)+")")
            return False
        else:
            self.logger.info("MonitorRanger: Inotify wrapper returned")
            return True

    def stop_inotify(self):
        """Stop the inotify wrapper and block until its thread returns."""
        self.logger.info("MonitorRanger: Stop inotify wrapper")
        self.inotifyWrapper.stop()
        self.logger.info("MonitorRanger: Join inotify wrapper")
        self.inotifyWrapper.join()
        self.logger.info("MonitorRanger: Inotify wrapper returned")

    def process_default(self, event):
        """inotify callback: forward the event to the queue; when queue
        status tracking is enabled, filter it through checkNewLumi first."""
        self.logger.debug("event: %s on: %s" % (str(event.mask), event.fullpath))
        if self.eventQueue:
            if self.queueStatusPath != None:
                if self.checkNewLumi(event):
                    self.eventQueue.put(event)
            else:
                self.eventQueue.put(event)

    def setEventQueue(self, queue):
        """Attach the queue that will receive inotify events."""
        self.eventQueue = queue

    def checkNewLumi(self, event):
        """Inspect EoLS/BoLS file events and update lumisection bookkeeping.

        Returns True if the event should be forwarded to the queue, False
        for duplicates (EoLS seen twice) and for BoLS files.
        """
        if event.fullpath.endswith("_EoLS.jsn"):
            try:
                # filename shape: <run>_ls<NNNN>_EoLS.jsn -> lumi number
                queuedLumi = int(os.path.basename(event.fullpath).split('_')[1][2:])
                self.lock.acquire()
                if queuedLumi not in self.queuedLumiList:
                    if queuedLumi > self.maxQueuedLumi:
                        self.maxQueuedLumi = queuedLumi
                    self.queuedLumiList.append(queuedLumi)
                    self.lock.release()
                    self.updateQueueStatusFile(".checkNewLumi")
                else:
                    self.lock.release()
                    # skip if EoL for LS in queue has already been written once
                    # (e.g. double file create race)
                    return False
            except Exception as ex:
                self.logger.warning("Problem checking new EoLS filename: "+str(os.path.basename(event.fullpath)) + " error:"+str(ex))
                try: self.lock.release()
                except: pass
            # delete associated BoLS file (best effort)
            try:
                os.unlink(event.fullpath[:event.fullpath.rfind("_EoLS.jsn")]+"_BoLS.jsn")
            except:
                pass
        elif event.fullpath.endswith("_BoLS.jsn"):
            try:
                queuedLumi = int(os.path.basename(event.fullpath).split('_')[1][2:])
                if queuedLumi > self.maxCMSSWLumi:
                    self.maxCMSSWLumi = queuedLumi
                    self.updateQueueStatusFile(".checkNewLumi")
            except:
                pass
            # BoLS files are not passed to the queue
            return False
        return True

    def notifyLumi(self, ls, maxReceivedEoLS, maxClosedLumi, numOpenLumis):
        """Called by the consumer thread: dequeue `ls` (if tracked) and
        refresh the counters, then rewrite the status file."""
        if self.queueStatusPath == None: return
        self.lock.acquire()
        if ls != None and ls in self.queuedLumiList:
            self.queuedLumiList.remove(ls)
        self.maxReceivedEoLS = maxReceivedEoLS
        self.maxClosedLumi = maxClosedLumi
        self.numOpenLumis = numOpenLumis
        self.lock.release()
        self.updateQueueStatusFile(".notifyLumi")

    def notifyMaxLsWithOutput(self, ls):
        """Record the highest lumisection that produced output."""
        self.maxLSWithOutput = max(ls, self.maxLSWithOutput)

    def setQueueStatusPath(self, path, monpath):
        """Enable queue status tracking; `path` is the status file, `monpath`
        a copy for the monitoring directory."""
        self.queueStatusPath = path
        self.queueStatusPathMon = monpath
        self.queueStatusPathDir = path[:path.rfind('/')]

    def updateQueueStatusFile(self, tmpsuffix):
        """Publish the queue status JSON (temp file + atomic rename) and copy
        it to the monitoring path, retrying up to 3 times.

        tmpsuffix keeps temp files of concurrent callers apart.
        """
        if self.queueStatusPath == None: return
        num_queued_lumis = len(self.queuedLumiList)
        if not os.path.exists(self.queueStatusPathDir):
            self.logger.error("No directory to write queueStatusFile: "+str(self.queueStatusPathDir))
        else:
            self.logger.info("Update status file - queued lumis:"+str(num_queued_lumis)+ " EoLS:: max queued:"+str(self.maxQueuedLumi) \
                             +" un-queued:"+str(self.maxReceivedEoLS)+" Lumis:: last closed:"+str(self.maxClosedLumi) \
                             + " num open:"+str(self.numOpenLumis) + " max LS in cmssw:"+str(self.maxCMSSWLumi))
        # write json
        # NOTE(review): trailing ':' in "numReadFromQueueLS:" looks like a
        # typo, but the key is kept as-is since monitoring consumers may
        # already depend on it — confirm before renaming.
        doc = {"numQueuedLS": num_queued_lumis,
               "maxQueuedLS": self.maxQueuedLumi,
               "numReadFromQueueLS:": self.maxReceivedEoLS,
               "maxClosedLS": self.maxClosedLumi,
               "numReadOpenLS": self.numOpenLumis,
               "CMSSWMaxLS": self.maxCMSSWLumi,
               "maxLSWithOutput": self.maxLSWithOutput,
               "outputBW": self.output_bw,
               "lumiBW": self.lumi_bw}
        if self.queueStatusPath != None:
            attempts = 3
            while attempts > 0:
                try:
                    # copy to main hltd location: write temp file, then
                    # atomically rename over the target
                    # (fix: removed unused 'tmpjson = json.dumps(doc)' local)
                    with open(self.queueStatusPath+tmpsuffix+TEMPEXT, "w") as fp:
                        json.dump(doc, fp)
                    os.rename(self.queueStatusPath+tmpsuffix+TEMPEXT, self.queueStatusPath)
                    # best-effort copy to monitoring directory
                    try:
                        shutil.copyfile(self.queueStatusPath, self.queueStatusPathMon)
                    except:
                        pass
                    break
                except Exception as ex:
                    attempts -= 1
                    if attempts == 0:
                        self.logger.error("Unable to open/write " + self.queueStatusPath)
                        self.logger.exception(ex)
                    else:
                        self.logger.warning("Unable to write status file, with error:" + str(ex)+".retrying...")
                        time.sleep(0.05)
class ResourceRanger:
    """Watches resource/state directories and FU box-info files via inotify.

    Reacts to resource files moving between state directories by
    (re)acquiring CPU resources for ongoing runs, handles cloud-mode
    transitions, and keeps the appliance box-info map up to date.
    """

    def __init__(self, confClass, stateInfo, resInfo, runList, mountMgr,
                 boxInfo, monitor, resource_lock):
        self.inotifyWrapper = InotifyWrapper(self)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.state = stateInfo
        self.resInfo = resInfo
        self.runList = runList
        # bring up the managed monitor immediately
        self.managed_monitor = monitor
        self.managed_monitor.preStart()
        self.managed_monitor.start()
        self.regpath = []  # inotify-registered directories (last: boxes dir)
        self.mm = mountMgr
        self.boxInfo = boxInfo
        self.resource_lock = resource_lock
        self.hostname = os.uname()[1]
        # publish the configuration object module-wide
        global conf
        conf = confClass

    def register_inotify_path(self, path, mask):
        """Register a directory with the inotify wrapper and remember it."""
        self.inotifyWrapper.registerPath(path, mask)
        self.regpath.append(path)

    def start_inotify(self):
        """Start the inotify watcher thread."""
        self.inotifyWrapper.start()

    def stop_managed_monitor(self):
        """Stop and join the managed monitor."""
        self.managed_monitor.stop()
        self.managed_monitor.join()
        self.logger.info("ResourceRanger: managed monitor shutdown done")

    def stop_inotify(self):
        """Stop and join the inotify wrapper."""
        self.inotifyWrapper.stop()
        self.inotifyWrapper.join()
        self.logger.info("ResourceRanger: Inotify wrapper shutdown done")

    def process_IN_MOVED_TO(self, event):
        """Handle a resource file moved into one of the watched state
        directories (online/cloud/quarantined/except/...)."""
        self.logger.debug('ResourceRanger-MOVEDTO: event '+event.fullpath)
        basename = os.path.basename(event.fullpath)

        # closures for stopping resources
        def stopResourceMaybe(resourcename, current_run, quarantining):
            # detect if this resource belongs to the run and terminate the
            # process if needed; this will only happen if last run is an
            # ongoing run
            if not conf.role == 'fu' or not conf.dynamic_resources:
                return None
            activeRuns = self.runList.getActiveRuns()
            for checkRun in activeRuns:
                # skip if this is part of normal stopping procedure
                if current_run:
                    if not current_run.is_ongoing_run and current_run.runnumber == checkRun.runnumber:
                        continue
                for checkRes in checkRun.online_resource_list:
                    if resourcename in checkRes.cpu and checkRes.processstate == 100:
                        self.logger.info('found matching resource for '+resourcename)
                        time.sleep(.1)
                        # allow EoR procedure (no process restarts for that run)
                        # in resource is from a previous run
                        # TODO: check if this is last remaining resource of
                        # current run, then EoR action could be taken
                        eor_allow = current_run.runnumber != checkRun.runnumber and not quarantining
                        checkRes.Stop(end_run_allow=eor_allow, move_q=quarantining)  # stop and release all resources
                        return checkRes
            return None

        def waitResource(resource, is_locked):
            # wait for the resource watchdog to finish, escalating to
            # terminate and then kill; releases/reacquires the resource
            # lock around joins so other threads can make progress
            def resJoin(join_timeout):
                if is_locked:
                    try: self.resource_lock.release()
                    except: pass
                resource.watchdog.join(join_timeout)
                if is_locked: self.resource_lock.acquire()
            if resource:
                try:
                    resJoin(120)
                    if resource.isAlive():
                        # fix: pid is an int, must be str()-ed for concatenation
                        self.logger.info('terminating ' + str(resource.process.pid))
                        resource.process.terminate()
                        resJoin(30)
                    if resource.isAlive():
                        self.logger.info('killing ' + str(resource.process.pid))
                        resource.process.kill()
                        resJoin(10)
                except Exception as ex:
                    self.logger.info("exception in waitResource: "+str(ex))
                if is_locked:
                    # make sure to return it locked
                    try: self.resource_lock.release()
                    except: pass
                    self.resource_lock.acquire()
            return

        if basename.startswith('resource_summary'): return
        try:
            resourcepath = event.fullpath[1:event.fullpath.rfind("/")]
            resourcestate = resourcepath[resourcepath.rfind("/")+1:]
            resourcename = event.fullpath[event.fullpath.rfind("/")+1:]
            self.resource_lock.acquire()
            if not (resourcestate == 'online' or resourcestate == 'cloud' or resourcestate == 'quarantined'):
                self.logger.debug('ResourceNotifier: new resource '
                                  + resourcename + ' in ' + resourcepath
                                  + ' state ' + resourcestate)
                if self.state.cloud_mode and not \
                        self.state.entering_cloud_mode and not \
                        self.state.exiting_cloud_mode and not \
                        self.state.abort_cloud_mode and not \
                        self.state.disabled_resource_allocation:
                    time.sleep(1)
                    # fix: use the instance logger (was root-logger logging.info)
                    self.logger.info('detected resource moved to non-cloud resource dir while already switched to cloud mode. Deactivating cloud.')
                    with open(os.path.join(conf.watch_directory, 'include'), 'w+') as fobj:
                        pass
                    self.resource_lock.release()
                    time.sleep(1)
                    return
                run = self.runList.getLastOngoingRun()
                if run is not None:
                    self.logger.info("ResourceRanger: found active run "+str(run.runnumber)+ " when received inotify MOVED event for "+event.fullpath)
                # grab resources that become available
                # @@EM implement threaded acquisition of resources here
                # make sure owner of the process ends or is terminated (in case
                # file was moved by action other than process exit)
                if os.path.exists(event.fullpath):
                    waitResource(stopResourceMaybe(resourcename, run, False), is_locked=True)
                if run is not None:
                    # find all ready cores in same dir where inotify was triggered
                    try:
                        reslist = os.listdir('/'+resourcepath)
                    except Exception as ex:
                        self.logger.error("RUN:"+str(run.runnumber)+" - exception encountered in looking for resources")
                        self.logger.exception(ex)
                    # put inotify-ed resource as the first item
                    fileFound = False
                    for resindex, resname in enumerate(reslist):
                        if resname == resourcename:
                            fileFound = True
                            if resindex != 0:
                                firstitem = reslist[0]
                                reslist[0] = resourcename
                                reslist[resindex] = firstitem
                            break
                    if fileFound == False:
                        # inotified file was already moved earlier
                        self.resource_lock.release()
                        return
                    # acquire sufficient cores for a multithreaded process start;
                    # returns whether it can be matched to existing online resource
                    matchedList = run.MatchResource(reslist)
                    if matchedList:
                        # matched with previous resource (restarting process)
                        acquired_sufficient = True
                        res = run.AcquireResource(matchedList, resourcestate)
                    else:
                        resourcenames = []
                        for resname in reslist:
                            if len(resourcenames) < self.resInfo.nstreams:
                                resourcenames.append(resname)
                            else:
                                break
                        acquired_sufficient = False
                        if len(resourcenames) == self.resInfo.nstreams:
                            acquired_sufficient = True
                            res = run.AcquireResource(resourcenames, resourcestate)
                    if acquired_sufficient:
                        self.logger.info("ResourceRanger: acquired resource(s) "+str(res.cpu))
                        run.StartOnResource(res)
                        self.logger.info("ResourceRanger: started process on resource "+str(res.cpu))
                else:
                    # if no run is active, move (x N threads) files from except
                    # to idle to be picked up for the next run
                    # todo: debug, write test for this...
                    if resourcestate == 'except':
                        try:
                            reslist = os.listdir('/'+resourcepath)
                            # put inotify-ed resource as the first item
                            fileFound = False
                            for resindex, resname in enumerate(reslist):
                                if resname == resourcename:
                                    fileFound = True
                                    if resindex != 0:
                                        firstitem = reslist[0]
                                        reslist[0] = resourcename
                                        reslist[resindex] = firstitem
                                    break
                            if fileFound == False:
                                # inotified file was already moved earlier
                                self.resource_lock.release()
                                return
                            resourcenames = []
                            for resname in reslist:
                                if len(resourcenames) < self.resInfo.nstreams:
                                    resourcenames.append(resname)
                                else:
                                    break
                            if len(resourcenames) == self.resInfo.nstreams:
                                for resname in resourcenames:
                                    self.resInfo.resmove(self.resInfo.broken, self.resInfo.idles, resname)
                            # move this except after listdir?
                        except Exception as ex:
                            self.logger.info("exception encountered in looking for resources in except")
                            self.logger.info(ex)
            elif resourcestate == "cloud":
                # check if cloud mode was initiated, activate if necessary
                if conf.role == 'fu' and self.state.cloud_mode == False:
                    time.sleep(1)
                    # fix: use the instance logger (was root-logger logging.info)
                    self.logger.info('detected core moved to cloud resources. Triggering cloud activation sequence.')
                    with open(os.path.join(conf.watch_directory, 'exclude'), 'w+') as fobj:
                        pass
                    time.sleep(1)
            elif resourcestate == "quarantined":
                # quarantined check, terminate owner if needed
                if os.path.exists(event.fullpath):
                    waitResource(stopResourceMaybe(resourcename, self.runList.getLastOngoingRun(), True), is_locked=True)
        except Exception as ex:
            self.logger.error("exception in ResourceRanger")
            self.logger.exception(ex)
        try:
            self.resource_lock.release()
        except: pass

    def process_IN_CREATE(self, event):
        """On BU: react to a newly appearing FU box-info file and notify
        the last run about the potential new resource."""
        self.logger.debug('ResourceRanger-CREATE: event '+event.fullpath)
        if conf.dqm_machine: return
        basename = os.path.basename(event.fullpath)
        if basename.startswith('resource_summary'): return
        if basename == 'blacklist': return
        if basename.startswith('test'): return
        if conf.role != 'bu' or basename.endswith(self.hostname): return
        # catch stop and hltd restart on FU which might leave state inconsistent
        try:
            if basename in self.boxInfo.FUMap:
                if self.boxInfo.FUMap[basename][0]["cloudState"] == "resourcesMasked":
                    self.boxInfo.FUMap[basename][0]["cloudState"] = "off"
        except:
            pass
        self.findRunAndNotify(basename, event.fullpath, False)

    def process_default(self, event):
        """Catch-all inotify callback: only logs the event."""
        # (fix: removed a dead 'filename' local that was computed but unused)
        self.logger.debug('ResourceRanger: event '+event.fullpath +' type '+ str(event.mask))

    def process_IN_CLOSE_WRITE(self, event):
        """On BU: parse an updated FU box-info file, flag stale or
        clock-skewed entries, and detect cloud-state flips back to 'off'."""
        self.logger.debug('ResourceRanger-IN_CLOSE_WRITE: event '+event.fullpath)
        resourcepath = event.fullpath[0:event.fullpath.rfind("/")]
        basename = os.path.basename(event.fullpath)
        if basename.startswith('resource_summary'): return
        if conf.role == 'fu': return
        if basename == os.uname()[1]: return
        if basename.startswith('test'): return
        if basename == 'blacklist':
            with open(os.path.join(conf.watch_directory, 'appliance', 'blacklist'), 'r') as fi:
                try:
                    self.boxInfo.machine_blacklist = json.load(fi)
                except:
                    pass
        if resourcepath.endswith('boxes'):
            if basename in self.boxInfo.machine_blacklist:
                try: self.boxInfo.FUMap.pop(basename)
                except: pass
            else:
                current_time = time.time()
                current_datetime = datetime.datetime.utcfromtimestamp(current_time)
                emptyBox = False
                try:
                    currentBox = self.boxInfo.FUMap[basename]
                except:
                    currentBox = None
                try:
                    infile = fileHandler(event.fullpath)
                    if infile.data == {}: emptyBox = True
                    # check which time is later (in case of small clock skew
                    # and small difference)
                    if current_datetime > dateutil.parser.parse(infile.data['fm_date']):
                        dt = (current_datetime - dateutil.parser.parse(infile.data['fm_date'])).seconds
                    else:
                        dt = -(dateutil.parser.parse(infile.data['fm_date'])-current_datetime).seconds
                    if dt > 5:
                        self.logger.warning('setting stale flag for resource '+basename + ' which is '+str(dt)+' seconds behind')
                        # should be << 1s if NFS is responsive, set stale handle flag
                        infile.data['detectedStaleHandle'] = True
                    elif dt < -5:
                        self.logger.error('setting stale flag for resource '+basename + ' which is '+str(dt)+' seconds ahead (clock skew)')
                        infile.data['detectedStaleHandle'] = True
                    self.boxInfo.FUMap[basename] = [infile.data, current_time, True]
                    # detect flip from cloud to non-cloud (only notify FU if
                    # no active runs are found on the FU)
                    try:
                        if currentBox and currentBox[0]["cloudState"] != "off" and infile.data["cloudState"] == "off" and len(infile.data["activeRuns"]) == 0:
                            self.logger.info('cloud state flip detected for ' + str(basename) + ':' + str(currentBox[0]["cloudState"]) + ' to ' + str(infile.data["cloudState"]))
                            self.findRunAndNotify(basename, event.fullpath, True)
                    except (KeyError, IndexError, TypeError) as ex:
                        self.logger.warning("cloud flip detection problem: "+str(ex))
                except Exception as ex:
                    if not emptyBox:
                        # fix: log message typo ("read of parse")
                        self.logger.error("Unable to read or parse boxinfo file "+basename)
                        self.logger.exception(ex)
                    else:
                        self.logger.warning("got empty box file "+basename)
                    try:
                        self.boxInfo.FUMap[basename][2] = False
                    except:
                        # boxinfo entry doesn't exist yet
                        self.boxInfo.FUMap[basename] = [None, current_time, False]

    def findRunAndNotify(self, basename, fullpath, override):
        """Ask the last run to consider `basename` as a (re)appearing FU
        resource; notifies the resource if the run accepts it."""
        try:
            resourceage = os.path.getmtime(fullpath)
            self.resource_lock.acquire()
            lrun = self.runList.getLastRun()
            newRes = None
            if lrun != None:
                is_stale, f_ip = lrun.checkStaleResourceFileAndIP(fullpath)
                if is_stale:
                    self.logger.error("RUN:"+str(lrun.runnumber)+" - notification: skipping resource "+basename+" which is stale")
                    self.resource_lock.release()
                    return
                self.logger.info('Try attaching FU resource: last run is '+str(lrun.runnumber))
                newRes = lrun.maybeNotifyNewRun(basename, resourceage, f_ip, override)
            self.resource_lock.release()
            # notify (outside of lock)
            if newRes:
                newRes.NotifyNewRun(lrun.runnumber)
        except Exception as ex:
            self.logger.exception(ex)
            try: self.resource_lock.release()
            except: pass

    def checkNotifiedBoxes(self, runNumber):
        """Check cached box-info entries for `runNumber`.

        Returns (check_valid, run_found): check_valid is False when any
        fresh entry is unparsed/invalid or disappears mid-scan.
        """
        keys = self.boxInfo.FUMap.keys()
        c_time = time.time()
        for key in keys:
            #if key==self.hostname:continue #checked in inotify thread
            try:
                edata, etime, lastStatus = self.boxInfo.FUMap[key]
            except:
                # key deleted
                return False, False
            # ignore stale entries (FU with a problem)
            if c_time - etime > 20: continue
            # parsing or file access check failed for this box
            if lastStatus == False:
                return False, False
            try:
                # run is found in at least one box
                if runNumber in edata['activeRuns']: return True, True
            except:
                # invalid boxinfo data
                return False, False
        # all box data are valid, run not found
        return True, False

    def checkBoxes(self, runNumber):
        """Re-read box-info files from disk, looking for `runNumber`.

        Returns (check_successful, run_found).
        """
        checkSuccessful = True
        runFound = False
        # box-info files live in the last registered inotify directory
        boxdir = self.regpath[-1]
        files = os.listdir(boxdir)
        c_time = time.time()
        for file in files:
            if file == self.hostname: continue
            # fix: was os.path.join(dir, file) — 'dir' is the builtin function
            filename = os.path.join(boxdir, file)
            # ignore file if it is too old (FU with a problem)
            if c_time - os.path.getmtime(filename) > 20: continue
            try:
                with open(filename, 'r') as fp:
                    doc = json.load(fp)
            except IOError as ex:
                checkSuccessful = False
                break
            except ValueError as ex:
                checkSuccessful = False
                break
            except Exception as ex:
                self.logger.exception(ex)
                checkSuccessful = False
                break
            try:
                if runNumber in doc['activeRuns']:
                    runFound = True
                    break
            except Exception as ex:
                self.logger.exception(ex)
                checkSuccessful = False
                break
        return checkSuccessful, runFound