def updateCS(changeSet):
    """Report the pending CS changes and, unless in dry-run mode, apply them.

    Every entry of ``changeSet`` is a 4-tuple ``(section, option, value, new_value)``.
    The entries are printed in sorted order.  When the module-level ``dry`` flag is
    false, the changes are staged through a ``CSAPI`` object and committed after an
    interactive confirmation (an empty answer counts as "yes").

    :param changeSet: sized iterable of ``(section, option, value, new_value)`` tuples
    """
    global vo, dry, ceBdiiDict

    orderedChanges = sorted(changeSet)
    headline = (
        'The following needed changes are detected:\n'
        if dry
        else 'We are about to make the following changes to CS:\n'
    )
    gLogger.notice(headline)
    for change in orderedChanges:
        gLogger.notice("%s/%s %s -> %s" % change)

    # Dry run: reporting is all that is wanted
    if dry:
        return

    csAPI = CSAPI()
    csAPI.initialize()
    download = csAPI.downloadCSData()
    if not download['OK']:
        gLogger.error('Failed to initialize CSAPI object', download['Message'])
        DIRACExit(-1)

    for section, option, oldValue, newValue in changeSet:
        optionPath = cfgPath(section, option)
        # An empty or 'Unknown' previous value is written with setOption,
        # any other previous value is updated with modifyValue
        if oldValue and oldValue != 'Unknown':
            csAPI.modifyValue(optionPath, newValue)
        else:
            csAPI.setOption(optionPath, newValue)

    answer = six.moves.input(
        'Do you want to commit changes to CS ? [default yes] [yes|no]: ')
    if answer != '' and not answer.lower().startswith('y'):
        return

    commit = csAPI.commit()
    if commit['OK']:
        gLogger.notice("Successfully committed %d changes to CS" % len(changeSet))
    else:
        gLogger.error("Error while commit to CS", commit['Message'])
def updateCS(changeSet):
    """Report the pending CS changes and, unless in dry-run mode, apply them.

    Every entry of ``changeSet`` is a 4-tuple ``(section, option, value, new_value)``.
    The entries are printed in sorted order.  When the module-level ``dry`` flag is
    false, the changes are staged through a ``CSAPI`` object and committed after an
    interactive confirmation (an empty answer counts as "yes").

    :param changeSet: sized iterable of ``(section, option, value, new_value)`` tuples
    """
    global vo, dry, ceBdiiDict

    changeList = sorted(changeSet)
    if dry:
        gLogger.notice('The following needed changes are detected:\n')
    else:
        gLogger.notice('We are about to make the following changes to CS:\n')
    for entry in changeList:
        gLogger.notice("%s/%s %s -> %s" % entry)

    if dry:
        return

    csAPI = CSAPI()
    csAPI.initialize()
    result = csAPI.downloadCSData()
    if not result['OK']:
        gLogger.error('Failed to initialize CSAPI object', result['Message'])
        DIRACExit(-1)

    for section, option, value, new_value in changeSet:
        # An empty or 'Unknown' previous value is written with setOption,
        # any other previous value is updated with modifyValue
        if value == 'Unknown' or not value:
            csAPI.setOption(cfgPath(section, option), new_value)
        else:
            csAPI.modifyValue(cfgPath(section, option), new_value)

    # raw_input() only exists on Python 2; Python 3 provides the same
    # behaviour under the name input()
    try:
        readAnswer = raw_input
    except NameError:
        readAnswer = input

    yn = readAnswer('Do you want to commit changes to CS ? [default yes] [yes|no]: ')
    if yn == '' or yn.lower().startswith('y'):
        result = csAPI.commit()
        if not result['OK']:
            gLogger.error("Error while commit to CS", result['Message'])
        else:
            gLogger.notice("Successfully committed %d changes to CS" % len(changeSet))
def test_configurationAutoUpdate(value1, value2):
    """Check that the service refreshes its configuration automatically.

    Two values are written to the CS one after the other.  After each commit the
    test sleeps for the propagation time plus one second and then checks that the
    ``Framework/User`` service returns the value that was just written.
    """
    optionPath = "/DIRAC/Configuration/TestUpdateValue"
    serviceName = "Framework/User"
    csapi = CSAPI()

    # --- first value ---
    csapi.modifyValue(optionPath, value1)
    csapi.commitChanges()

    # Wait for the automatic refresh (+1 to be sure that the request is done)
    refreshDelay = gConfigurationData.getPropagationTime() + 1
    time.sleep(refreshDelay)
    # The reply of this first call is discarded; only the second one is checked
    RPCClient(serviceName).getTestValue()
    firstReply = RPCClient(serviceName).getTestValue()
    assert firstReply["Value"] == value1

    # --- second value ---
    csapi.modifyValue(optionPath, value2)
    csapi.commitChanges()
    refreshDelay = gConfigurationData.getPropagationTime() + 1
    time.sleep(refreshDelay)
    secondReply = RPCClient(serviceName).getTestValue()
    assert secondReply["Value"] == value2
class Bdii2CSAgent(AgentModule):
    """Agent comparing the CE/SE information published in the BDII with the CS.

    On every cycle the agent can (depending on the ``ProcessCEs`` / ``ProcessSEs``
    options) report CEs/SEs that are not yet in the CS and update the CS with the
    settings that changed in the BDII.  Reports are logged and, when both
    ``MailTo`` and ``MailFrom`` are configured, sent by mail.
    """

    def __init__(self, *args, **kwargs):
        """Define the default parameters."""
        super(Bdii2CSAgent, self).__init__(*args, **kwargs)
        self.addressTo = ''
        self.addressFrom = ''
        self.voName = []
        self.subject = "Bdii2CSAgent"
        self.alternativeBDIIs = []
        # Per-cycle caches of the BDII answers, keyed by VO
        self.voBdiiCEDict = {}
        self.voBdiiSEDict = {}

        self.csAPI = None

        # What to get
        self.processCEs = True
        self.processSEs = False

        # Update the CS or not?
        self.dryRun = False

    def initialize(self):
        """Get the run parameters from the configuration.

        :return: S_ERROR if no VO could be determined, otherwise the result of
                 ``CSAPI.initialize()``
        """
        self.addressTo = self.am_getOption('MailTo', self.addressTo)
        self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)

        # List of alternative BDII URLs; the default port 2170 is appended to
        # every URL that does not end with a port number
        alternativeBDIIs = self.am_getOption('AlternativeBDIIs', self.alternativeBDIIs)
        self.alternativeBDIIs = [
            url if url.split(':')[-1].isdigit() else url + ':2170'
            for url in alternativeBDIIs
        ]

        if self.addressTo and self.addressFrom:
            self.log.info("MailTo", self.addressTo)
            self.log.info("MailFrom", self.addressFrom)
        if self.alternativeBDIIs:
            self.log.info("AlternativeBDII URLs:", self.alternativeBDIIs)

        self.processCEs = self.am_getOption('ProcessCEs', self.processCEs)
        self.processSEs = self.am_getOption('ProcessSEs', self.processSEs)
        self.dryRun = self.am_getOption('DryRun', self.dryRun)

        self.voName = self.am_getOption('VirtualOrganization', self.voName)
        if not self.voName:
            self.voName = self.am_getOption('VO', [])
        if not self.voName or (len(self.voName) == 1 and self.voName[0].lower() == 'all'):
            # Get all VOs defined in the configuration
            self.voName = []
            result = getVOs()
            if result['OK']:
                for vo in result['Value']:
                    vomsVO = getVOOption(vo, "VOMSName")
                    if vomsVO:
                        self.voName.append(vomsVO)

        if not self.voName:
            self.log.fatal("VirtualOrganization option not defined for agent")
            return S_ERROR("VirtualOrganization option not defined for agent")
        self.log.info("Agent will manage VO(s) %s" % self.voName)

        self.csAPI = CSAPI()
        return self.csAPI.initialize()

    def execute(self):
        """General agent execution method.

        :return: always S_OK(); failures of the individual steps are only logged
        """
        # Forget the BDII answers of the previous cycle
        self.voBdiiCEDict = {}
        self.voBdiiSEDict = {}

        # Get a "fresh" copy of the CS data
        result = self.csAPI.downloadCSData()
        if not result['OK']:
            self.log.warn("Could not download a fresh copy of the CS data", result['Message'])

        # Refresh the configuration from the master server
        gConfig.forceRefresh(fromMaster=True)

        if self.processCEs:
            self.__lookForNewCEs()
            self.__updateCEs()
        if self.processSEs:
            self.__lookForNewSEs()
            self.__updateSEs()
        return S_OK()

    def __sendMail(self, body, errorMessage):
        """Mail ``body`` if both mail addresses are set; log ``errorMessage`` on failure."""
        if not (self.addressTo and self.addressFrom):
            return
        notification = NotificationClient()
        result = notification.sendMail(self.addressTo, self.subject, body,
                                       self.addressFrom, localAttempt=False)
        if not result['OK']:
            self.log.error(errorMessage, result['Message'])

    def __lookForNewCEs(self):
        """Look up the BDII for CEs not yet present in the DIRAC CS.

        CEs listed in the ``BannedCEs`` option are treated as already known.
        Only CEs with at least one queue whose status contains "production"
        are reported.

        :return: the failed result of ``getCEsFromCS()``, otherwise S_OK()
        """
        bannedCEs = self.am_getOption('BannedCEs', [])
        result = getCEsFromCS()
        if not result['OK']:
            return result
        knownCEs = set(result['Value']).union(bannedCEs)

        for vo in self.voName:
            result = self.__getBdiiCEInfo(vo)
            if not result['OK']:
                continue
            bdiiInfo = result['Value']
            result = getGridCEs(vo, bdiiInfo=bdiiInfo, ceBlackList=knownCEs)
            if not result['OK']:
                self.log.error('Failed to get unused CEs', result['Message'])
                # A failed result carries no 'Value': skip this VO
                continue
            siteDict = result['Value']

            body = ''
            for site in siteDict:
                # Sorted so that the report is deterministic
                for ce in sorted(siteDict[site]):
                    ceInfo = bdiiInfo[site]['CEs'][ce]
                    ceHeader = "CE: %s, GOCDB Site Name: %s" % (ce, site)
                    osString = "%s_%s_%s" % (siteDict[site][ce]['System'])
                    queueString = ''
                    for queue in ceInfo['Queues']:
                        queueInfo = ceInfo['Queues'][queue]
                        queueStatus = queueInfo.get('GlueCEStateStatus', 'UnknownStatus')
                        if 'production' in queueStatus.lower():
                            ceType = queueInfo.get('GlueCEImplementationName', '')
                            queueString += " %s %s %s\n" % (queue, queueStatus, ceType)
                    if queueString:
                        body += "\n%s\n%s\n" % (ceHeader, osString)
                        body += "Queues:\n"
                        body += queueString

            if body:
                body = "\nWe are glad to inform You about new CE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about CE add its name to BannedCEs list.\n"
                body += "Add new Sites/CEs for vo %s with the command:\n" % vo
                body += "dirac-admin-add-resources --vo %s --ce\n" % vo
                self.log.info(body)
                self.__sendMail(body, 'Can not send new site notification mail')

        return S_OK()

    def __getBdiiCEInfo(self, vo):
        """Get the CE information of ``vo`` from the default and alternative BDIIs.

        The answers of all BDIIs are merged into one dictionary.  Alternative
        BDIIs are applied in reverse order and the default BDII last, so on
        conflicting keys the default BDII wins, followed by the alternatives
        in their configured order.  A successful answer is cached for the
        rest of the cycle.

        :param vo: VO name
        :return: S_OK(dict), or S_ERROR if nothing was obtained and an error occurred
        """
        if vo in self.voBdiiCEDict:
            return S_OK(self.voBdiiCEDict[vo])
        self.log.info("Check for available CEs for VO", vo)
        totalResult = S_OK({})
        message = ''

        mainResult = getBdiiCEInfo(vo)
        if not mainResult['OK']:
            self.log.error("Failed getting information from default bdii", mainResult['Message'])
            message = mainResult['Message']

        for bdii in reversed(self.alternativeBDIIs):
            resultAlt = getBdiiCEInfo(vo, host=bdii)
            if resultAlt['OK']:
                totalResult['Value'].update(resultAlt['Value'])
            else:
                self.log.error("Failed getting information from %s " % bdii, resultAlt['Message'])
                message = (message + "\n" + resultAlt['Message']).strip()

        if mainResult['OK']:
            totalResult['Value'].update(mainResult['Value'])

        if not totalResult['Value'] and message:
            # Dict is empty and we have an error message
            self.log.error("Error during BDII request", message)
            totalResult = S_ERROR(message)
        else:
            self.voBdiiCEDict[vo] = totalResult['Value']
        return totalResult

    def __getBdiiSEInfo(self, vo):
        """Get the SE information of ``vo`` from the BDII.

        The alternative BDIIs are only tried, in order, when the default one
        fails; the first successful answer is used and cached for the rest of
        the cycle.

        :param vo: VO name
        :return: result of the last ``getBdiiSEInfo`` call that was made
        """
        if vo in self.voBdiiSEDict:
            return S_OK(self.voBdiiSEDict[vo])
        self.log.info("Check for available SEs for VO", vo)
        result = getBdiiSEInfo(vo)
        message = ''
        if not result['OK']:
            message = result['Message']
            for bdii in self.alternativeBDIIs:
                result = getBdiiSEInfo(vo, host=bdii)
                if result['OK']:
                    break
        if not result['OK']:
            # Prefer the message of the default BDII when there is one
            self.log.error("Error during BDII request", message or result['Message'])
        else:
            self.voBdiiSEDict[vo] = result['Value']
        return result

    def __updateCEs(self):
        """Update the Site/CE/queue settings in the CS if they were changed in the BDII.

        :return: result of ``__updateCS``
        """
        bdiiChangeSet = set()
        for vo in self.voName:
            result = self.__getBdiiCEInfo(vo)
            if not result['OK']:
                continue
            ceBdiiDict = result['Value']
            result = getSiteUpdates(vo, bdiiInfo=ceBdiiDict, log=self.log)
            if not result['OK']:
                continue
            bdiiChangeSet = bdiiChangeSet.union(result['Value'])

        # We have collected all the changes, consolidate VO settings
        return self.__updateCS(bdiiChangeSet)

    def __updateCS(self, bdiiChangeSet):
        """Apply a set of changes to the CS.

        Changes of the "VO" option of the same section are merged into a single
        change holding the union of all the new values.  The changes are logged,
        mailed if mail addresses are configured, staged in the CSAPI object and,
        unless ``dryRun`` is set, committed.

        :param bdiiChangeSet: iterable of ``(section, option, value, new_value)`` tuples
        :return: S_OK() when there is nothing to do or in dry-run mode, otherwise
                 the result of ``CSAPI.commit()``
        """
        queueVODict = {}
        changeSet = set()
        for entry in bdiiChangeSet:
            section, option, _value, new_value = entry
            if option == "VO":
                queueVODict.setdefault(section, set()).update(new_value.split(','))
            else:
                changeSet.add(entry)
        for section, VOs in queueVODict.items():
            # Sorted so that the stored value does not depend on set ordering
            changeSet.add((section, 'VO', '', ','.join(sorted(VOs))))

        if not changeSet:
            self.log.info("No changes found")
            return S_OK()

        changeList = sorted(changeSet)
        body = '\n'.join(["%s/%s %s -> %s" % entry for entry in changeList])
        self.__sendMail(body, 'Can not send configuration change notification mail')
        self.log.info('The following configuration changes were detected:')
        self.log.info(body)

        for section, option, value, new_value in changeList:
            # An empty or 'Unknown' previous value is written with setOption,
            # any other previous value is updated with modifyValue
            if value == 'Unknown' or not value:
                self.csAPI.setOption(cfgPath(section, option), new_value)
            else:
                self.csAPI.modifyValue(cfgPath(section, option), new_value)

        if self.dryRun:
            self.log.info("Dry Run: CS won't be updated")
            self.csAPI.showDiff()
            return S_OK()

        result = self.csAPI.commit()
        if not result['OK']:
            self.log.error("Error while committing to CS", result['Message'])
        else:
            self.log.info("Successfully committed %d changes to CS" % len(changeList))
        return result

    def __lookForNewSEs(self):
        """Look up the BDII for SEs not yet present in the DIRAC CS.

        SEs listed in the ``BannedSEs`` option are treated as already known.

        :return: the failed result of ``getSEsFromCS()``, otherwise S_OK()
        """
        bannedSEs = self.am_getOption('BannedSEs', [])
        result = getSEsFromCS()
        if not result['OK']:
            return result
        knownSEs = set(result['Value']).union(bannedSEs)

        for vo in self.voName:
            result = self.__getBdiiSEInfo(vo)
            if not result['OK']:
                continue
            bdiiInfo = result['Value']
            result = getGridSRMs(vo, bdiiInfo=bdiiInfo, srmBlackList=knownSEs)
            if not result['OK']:
                continue
            siteDict = result['Value']

            body = ''
            for site in siteDict:
                # Sorted so that the report is deterministic
                for se in sorted(siteDict[site]):
                    seInfo = siteDict[site][se]['SE']
                    backend = seInfo.get('GlueSEImplementationName', 'Unknown')
                    size = seInfo.get('GlueSESizeTotal', 'Unknown')
                    body += '\n New SE %s available at site %s:\n' % (se, site)
                    body += ' Backend %s, Size %s' % (backend, size)

            if body:
                body = "\nWe are glad to inform You about new SE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about an SE add its name to BannedSEs list.\n"
                body += "Add new SEs for vo %s with the command:\n" % vo
                body += "dirac-admin-add-resources --vo %s --se\n" % vo
                self.log.info(body)
                self.__sendMail(body, 'Can not send new site notification mail')

        return S_OK()

    def __updateSEs(self):
        """Update the Storage Element settings in the CS if they were changed in the BDII.

        :return: result of ``__updateCS``
        """
        bdiiChangeSet = set()
        for vo in self.voName:
            result = self.__getBdiiSEInfo(vo)
            if not result['OK']:
                continue
            seBdiiDict = result['Value']
            result = getSRMUpdates(vo, bdiiInfo=seBdiiDict)
            if not result['OK']:
                continue
            bdiiChangeSet = bdiiChangeSet.union(result['Value'])

        # We have collected all the changes, consolidate VO settings
        return self.__updateCS(bdiiChangeSet)
# Register a new site with its CEs in the CS, or add the CEs that are
# missing from an already known site, and commit if anything was changed.
change = False
if newSite:
    gLogger.notice("Adding new site to CS: %s" % diracSiteName)
    csAPI.setOption("%s/Name" % cfgBase, gridSiteName)
    gLogger.notice("Adding CEs: %s" % ",".join(ces))
    csAPI.setOption("%s/CE" % cfgBase, ",".join(ces))
    change = True
else:
    cesCS = set(gConfig.getValue("%s/CE" % cfgBase, []))
    ces = set(ces)
    newCEs = ces - cesCS
    if newCEs:
        # Sets are joined in sorted order so that the value written to the
        # CS and the log output do not depend on set iteration order
        gLogger.notice("Adding CEs %s" % ",".join(sorted(newCEs)))
        cesCS = cesCS.union(ces)
        csAPI.modifyValue("%s/CE" % cfgBase, ",".join(sorted(cesCS)))
        change = True

if change:
    res = csAPI.commitChanges()
    if not res["OK"]:
        gLogger.error("Failed to commit changes to CS", res["Message"])
        DIRACExit(-1)
    elif newSite:
        gLogger.notice(
            "Successfully added site %s to the CS with name %s and CEs: %s"
            % (diracSiteName, gridSiteName, ",".join(ces))
        )
    else:
        gLogger.notice(
            "Successfully added new CEs to site %s: %s" % (diracSiteName, ",".join(sorted(newCEs)))
        )
class Bdii2CSAgent(AgentModule):
    """Agent comparing the CE/SE information published in the BDII with the CS.

    On every cycle the agent can (depending on the ``ProcessCEs`` / ``ProcessSEs``
    options) report CEs/SEs that are not yet in the CS and update the CS with the
    settings that changed in the BDII.  Reports are logged and, when both
    ``MailTo`` and ``MailFrom`` are configured, sent by mail.
    """

    addressTo = ''
    addressFrom = ''
    voName = ''
    subject = "CE2CSAgent"
    alternativeBDIIs = []

    def initialize(self):
        """Get the run parameters from the configuration.

        :return: S_ERROR if no VO could be determined, otherwise the result of
                 ``CSAPI.initialize()``
        """
        self.addressTo = self.am_getOption('MailTo', self.addressTo)
        self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)

        # List of alternative BDII URLs; the default port 2170 is appended to
        # every URL that does not end with a port number
        self.alternativeBDIIs = [
            url if url.split(':')[-1].isdigit() else url + ':2170'
            for url in self.am_getOption('AlternativeBDIIs', [])
        ]

        if self.addressTo and self.addressFrom:
            self.log.info("MailTo", self.addressTo)
            self.log.info("MailFrom", self.addressFrom)
        if self.alternativeBDIIs:
            self.log.info("AlternativeBDII URLs:", self.alternativeBDIIs)
        self.subject = "CE2CSAgent"

        self.processCEs = self.am_getOption('ProcessCEs', True)
        self.processSEs = self.am_getOption('ProcessSEs', False)

        self.voName = self.am_getOption('VirtualOrganization', [])
        if not self.voName:
            self.voName = self.am_getOption('VO', [])
        if not self.voName or (len(self.voName) == 1 and self.voName[0].lower() == 'all'):
            # Get all VOs defined in the configuration
            self.voName = []
            result = getVOs()
            if result['OK']:
                for vo in result['Value']:
                    vomsVO = getVOOption(vo, "VOMSName")
                    if vomsVO:
                        self.voName.append(vomsVO)

        if not self.voName:
            self.log.fatal("VirtualOrganization option not defined for agent")
            return S_ERROR("VirtualOrganization option not defined for agent")
        self.log.info("Agent will manage VO(s) %s" % self.voName)

        # Caches of the BDII answers, keyed by VO
        self.voBdiiCEDict = {}
        self.voBdiiSEDict = {}

        self.csAPI = CSAPI()
        return self.csAPI.initialize()

    def execute(self):
        """General agent execution method.

        :return: always S_OK(); failures of the individual steps are only logged
        """
        # Forget the BDII answers of the previous cycle, otherwise the first
        # answer obtained for a VO would be reused for ever
        self.voBdiiCEDict = {}
        self.voBdiiSEDict = {}

        # Get a "fresh" copy of the CS data
        result = self.csAPI.downloadCSData()
        if not result['OK']:
            self.log.warn("Could not download a fresh copy of the CS data", result['Message'])

        if self.processCEs:
            self.__lookForNewCEs()
            self.__updateCEs()
        if self.processSEs:
            self.__lookForNewSEs()
            self.__updateSEs()
        return S_OK()

    def __sendMail(self, body, errorMessage):
        """Mail ``body`` if both mail addresses are set; log ``errorMessage`` on failure."""
        if not (self.addressTo and self.addressFrom):
            return
        notification = NotificationClient()
        result = notification.sendMail(self.addressTo, self.subject, body,
                                       self.addressFrom, localAttempt=False)
        if not result['OK']:
            self.log.error(errorMessage, result['Message'])

    def __lookForNewCEs(self):
        """Look up the BDII for CEs not yet present in the DIRAC CS.

        CEs listed in the ``BannedCEs`` option are treated as already known.
        Only CEs with at least one queue whose status contains "production"
        are reported.

        :return: the failed result of ``getCEsFromCS()``, otherwise S_OK()
        """
        bannedCEs = self.am_getOption('BannedCEs', [])
        result = getCEsFromCS()
        if not result['OK']:
            return result
        knownCEs = set(result['Value']).union(bannedCEs)

        for vo in self.voName:
            result = self.__getBdiiCEInfo(vo)
            if not result['OK']:
                continue
            bdiiInfo = result['Value']
            result = getGridCEs(vo, bdiiInfo=bdiiInfo, ceBlackList=knownCEs)
            if not result['OK']:
                self.log.error('Failed to get unused CEs', result['Message'])
                # A failed result carries no 'Value': skip this VO
                continue
            siteDict = result['Value']

            body = ''
            for site in siteDict:
                # Sorted so that the report is deterministic
                for ce in sorted(siteDict[site]):
                    ceInfo = bdiiInfo[site]['CEs'][ce]
                    ceHeader = "CE: %s, GOCDB Site Name: %s" % (ce, site)
                    osString = "%s_%s_%s" % (siteDict[site][ce]['System'])
                    queueString = ''
                    for queue in ceInfo['Queues']:
                        queueInfo = ceInfo['Queues'][queue]
                        queueStatus = queueInfo.get('GlueCEStateStatus', 'UnknownStatus')
                        if 'production' in queueStatus.lower():
                            ceType = queueInfo.get('GlueCEImplementationName', '')
                            queueString += " %s %s %s\n" % (queue, queueStatus, ceType)
                    # Accumulate one paragraph per CE that has production queues;
                    # CEs without any are left out of the report
                    if queueString:
                        body += "\n%s\n%s\n" % (ceHeader, osString)
                        body += "Queues:\n"
                        body += queueString

            if body:
                body = "\nWe are glad to inform You about new CE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about CE add its name to BannedCEs list.\n"
                body += "Add new Sites/CEs for vo %s with the command:\n" % vo
                body += "dirac-admin-add-resources --vo %s --ce\n" % vo
                self.log.info(body)
                self.__sendMail(body, 'Can not send new site notification mail')

        return S_OK()

    def __getBdiiInfo(self, vo, query, cache):
        """Run ``query`` for ``vo`` on the default BDII, then on the alternatives.

        The alternative BDIIs are only tried, in order, when the default one
        fails.  The first successful answer is stored in ``cache``.

        :return: result of the last query that was made
        """
        result = query(vo)
        message = ''
        if not result['OK']:
            message = result['Message']
            for bdii in self.alternativeBDIIs:
                result = query(vo, host=bdii)
                if result['OK']:
                    break
        if not result['OK']:
            # Prefer the message of the default BDII when there is one
            self.log.error("Error during BDII request", message or result['Message'])
        else:
            cache[vo] = result['Value']
        return result

    def __getBdiiCEInfo(self, vo):
        """Get the (cached) CE information of ``vo`` from the BDII.

        :param vo: VO name
        :return: S_OK(value) from the cache, or the result of the BDII query
        """
        if vo in self.voBdiiCEDict:
            return S_OK(self.voBdiiCEDict[vo])
        self.log.info("Check for available CEs for VO", vo)
        return self.__getBdiiInfo(vo, getBdiiCEInfo, self.voBdiiCEDict)

    def __getBdiiSEInfo(self, vo):
        """Get the (cached) SE information of ``vo`` from the BDII.

        :param vo: VO name
        :return: S_OK(value) from the cache, or the result of the BDII query
        """
        if vo in self.voBdiiSEDict:
            return S_OK(self.voBdiiSEDict[vo])
        self.log.info("Check for available SEs for VO", vo)
        return self.__getBdiiInfo(vo, getBdiiSEInfo, self.voBdiiSEDict)

    def __updateCEs(self):
        """Update the Site/CE/queue settings in the CS if they were changed in the BDII.

        :return: result of ``__updateCS``
        """
        bdiiChangeSet = set()
        for vo in self.voName:
            result = self.__getBdiiCEInfo(vo)
            if not result['OK']:
                continue
            ceBdiiDict = result['Value']
            result = getSiteUpdates(vo, bdiiInfo=ceBdiiDict, log=self.log)
            if not result['OK']:
                continue
            bdiiChangeSet = bdiiChangeSet.union(result['Value'])

        # We have collected all the changes, consolidate VO settings
        return self.__updateCS(bdiiChangeSet)

    def __updateCS(self, bdiiChangeSet):
        """Apply a set of changes to the CS and commit them.

        Changes of the "VO" option of the same section are merged into a single
        change holding the union of all the new values.  The changes are logged
        and mailed if mail addresses are configured.

        :param bdiiChangeSet: iterable of ``(section, option, value, new_value)`` tuples
        :return: S_OK() when there is nothing to do, otherwise the result of
                 ``CSAPI.commit()``
        """
        queueVODict = {}
        changeSet = set()
        for entry in bdiiChangeSet:
            section, option, _value, new_value = entry
            if option == "VO":
                queueVODict.setdefault(section, set()).update(new_value.split(','))
            else:
                changeSet.add(entry)
        for section, VOs in queueVODict.items():
            # Sorted so that the stored value does not depend on set ordering
            changeSet.add((section, 'VO', '', ','.join(sorted(VOs))))

        if not changeSet:
            self.log.info("No changes found")
            return S_OK()

        changeList = sorted(changeSet)
        body = '\n'.join(["%s/%s %s -> %s" % entry for entry in changeList])
        self.__sendMail(body, 'Can not send configuration change notification mail')
        self.log.info('The following configuration changes were detected:')
        self.log.info(body)

        for section, option, value, new_value in changeList:
            # An empty or 'Unknown' previous value is written with setOption,
            # any other previous value is updated with modifyValue
            if value == 'Unknown' or not value:
                self.csAPI.setOption(cfgPath(section, option), new_value)
            else:
                self.csAPI.modifyValue(cfgPath(section, option), new_value)

        result = self.csAPI.commit()
        if not result['OK']:
            self.log.error("Error while committing to CS", result['Message'])
        else:
            self.log.info("Successfully committed %d changes to CS" % len(changeList))
        return result

    def __lookForNewSEs(self):
        """Look up the BDII for SEs not yet present in the DIRAC CS.

        SEs listed in the ``BannedSEs`` option are treated as already known.

        :return: the failed result of ``getSEsFromCS()``, otherwise S_OK()
        """
        bannedSEs = self.am_getOption('BannedSEs', [])
        result = getSEsFromCS()
        if not result['OK']:
            return result
        knownSEs = set(result['Value']).union(bannedSEs)

        for vo in self.voName:
            result = self.__getBdiiSEInfo(vo)
            if not result['OK']:
                continue
            bdiiInfo = result['Value']
            result = getGridSRMs(vo, bdiiInfo=bdiiInfo, srmBlackList=knownSEs)
            if not result['OK']:
                continue
            siteDict = result['Value']

            body = ''
            for site in siteDict:
                # Sorted so that the report is deterministic
                for se in sorted(siteDict[site]):
                    seInfo = siteDict[site][se]['SE']
                    backend = seInfo.get('GlueSEImplementationName', 'Unknown')
                    size = seInfo.get('GlueSESizeTotal', 'Unknown')
                    body += '\n New SE %s available at site %s:\n' % (se, site)
                    body += ' Backend %s, Size %s' % (backend, size)

            if body:
                body = "\nWe are glad to inform You about new SE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about an SE add its name to BannedSEs list.\n"
                body += "Add new SEs for vo %s with the command:\n" % vo
                body += "dirac-admin-add-resources --vo %s --se\n" % vo
                self.log.info(body)
                self.__sendMail(body, 'Can not send new site notification mail')

        return S_OK()

    def __updateSEs(self):
        """Update the Storage Element settings in the CS if they were changed in the BDII.

        :return: result of ``__updateCS``
        """
        bdiiChangeSet = set()
        for vo in self.voName:
            result = self.__getBdiiSEInfo(vo)
            if not result['OK']:
                continue
            seBdiiDict = result['Value']
            result = getSRMUpdates(vo, bdiiInfo=seBdiiDict)
            if not result['OK']:
                continue
            bdiiChangeSet = bdiiChangeSet.union(result['Value'])

        # We have collected all the changes, consolidate VO settings
        return self.__updateCS(bdiiChangeSet)
class CE2CSAgent(AgentModule):
    """Agent keeping the CE related information of the CS in line with the BDII.

    Every cycle the agent
      * reports CEs published in the BDII for the VO that are neither in the CS
        nor in the ``BannedCEs`` option, and
      * updates site (coordinates, mail), CE (wnTmpDir, architecture, OS, SI00,
        Pilot, CEType) and queue (maxCPUTime, SI00) options in the CS from the
        values published in the BDII.
    """

    addressTo = ''
    addressFrom = ''
    voName = ''
    subject = "CE2CSAgent"
    alternativeBDIIs = []

    def initialize(self):
        """Get the run parameters from the configuration.

        :return: S_ERROR if no VO is defined, otherwise the result of
                 ``CSAPI.initialize()``
        """
        # TODO: Have no default and if no mail is found then use the diracAdmin group
        # and resolve all associated mail addresses.
        self.addressTo = self.am_getOption('MailTo', self.addressTo)
        self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)

        # List of alternative BDII URLs; the default port 2170 is appended to
        # every URL that does not end with a port number
        self.alternativeBDIIs = [
            url if url.split(':')[-1].isdigit() else url + ':2170'
            for url in self.am_getOption('AlternativeBDIIs', [])
        ]

        if self.addressTo and self.addressFrom:
            self.log.info("MailTo", self.addressTo)
            self.log.info("MailFrom", self.addressFrom)
        if self.alternativeBDIIs:
            self.log.info("AlternativeBDII URLs:", self.alternativeBDIIs)
        self.subject = "CE2CSAgent"

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/TestManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'TestManager')

        self.voName = self.am_getOption('VirtualOrganization', self.voName)
        if not self.voName:
            self.voName = getVO()
        if not self.voName:
            self.log.fatal("VO option not defined for agent")
            return S_ERROR("VO option not defined for agent")

        self.csAPI = CSAPI()
        return self.csAPI.initialize()

    def execute(self):
        """Run one agent cycle.

        :return: the failed result of ``getProxyInfo()``, otherwise S_OK()
        """
        self.log.info("Start Execution")
        result = getProxyInfo()
        if not result['OK']:
            return result
        infoDict = result['Value']
        self.log.info(formatProxyInfoAsString(infoDict))

        # Get a "fresh" copy of the CS data
        result = self.csAPI.downloadCSData()
        if not result['OK']:
            self.log.warn("Could not download a fresh copy of the CS data", result['Message'])

        self.__lookForCE()
        self.__infoFromCE()
        self.log.info("End Execution")
        return S_OK()

    def __checkAlternativeBDIISite(self, fun, *args):
        """Repeat a BDII query on the alternative BDIIs until one succeeds.

        :param fun: query function accepting a ``host`` keyword argument
        :param args: one positional argument, optionally followed by the VO
                     (passed as ``vo=``)
        :return: the first successful result, otherwise an error result; a
                 result dictionary is always returned
        """
        if len(args) not in (1, 2):
            return S_ERROR("Unsupported number of arguments for a BDII query: %d" % len(args))
        result = S_ERROR("No alternative BDII sites are defined")
        if self.alternativeBDIIs:
            self.log.warn("Trying to use alternative bdii sites")
            for site in self.alternativeBDIIs:
                self.log.info("Trying to contact alternative bdii ", site)
                if len(args) == 1:
                    result = fun(args[0], host=site)
                else:
                    result = fun(args[0], vo=args[1], host=site)
                if result['OK']:
                    return result
                self.log.error("Problem contacting alternative bddii", result['Message'])
            self.log.warn("Also checking alternative BDII sites failed")
        return result

    def __sendMail(self, body):
        """Mail ``body`` if both mail addresses are set; failures are logged."""
        if not (self.addressTo and self.addressFrom):
            return
        notification = NotificationClient()
        result = notification.sendMail(self.addressTo, self.subject, body,
                                       self.addressFrom, localAttempt=False)
        if not result['OK']:
            self.log.error("Can not send notification mail", result['Message'])

    def __storeOption(self, optionPath, oldValue, newValue):
        """Stage ``newValue`` in the CSAPI object.

        An option whose current value is 'Unknown' is written with setOption,
        any other one is updated with modifyValue.
        """
        if oldValue == 'Unknown':
            self.csAPI.setOption(optionPath, newValue)
        else:
            self.csAPI.modifyValue(optionPath, newValue)

    def __lookForCE(self):
        """Report the CEs published in the BDII for the VO that are not in the CS.

        CEs listed in the ``BannedCEs`` option are treated as already known.
        A CE is reported when at least one of its queues has a status
        containing "Production".

        :return: a failed result when the CS sections or the BDII cannot be
                 read, otherwise S_OK()
        """
        knownces = set(self.am_getOption('BannedCEs', []))

        result = gConfig.getSections('/Resources/Sites')
        if not result['OK']:
            return result
        for grid in result['Value']:
            result = gConfig.getSections('/Resources/Sites/%s' % grid)
            if not result['OK']:
                return result
            for site in result['Value']:
                result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (grid, site))
                if not result['OK']:
                    self.log.warn("Could not get the options of site %s" % site, result['Message'])
                    continue
                knownces.update(List.fromChar(result['Value'].get('CE', '')))

        response = ldapCEState('', vo=self.voName)
        if not response['OK']:
            self.log.error("Error during BDII request", response['Message'])
            response = self.__checkAlternativeBDIISite(ldapCEState, '', self.voName)
            # Give up only if the alternative BDIIs failed as well
            if not response['OK']:
                return response

        newces = set()
        for queue in response['Value']:
            try:
                queuename = queue['GlueCEUniqueID']
            except Exception:
                continue
            cename = queuename.split(":")[0]
            if cename not in knownces:
                newces.add(cename)
                self.log.debug("newce", cename)

        body = ""
        possibleNewSites = []
        # Sorted so that the report is deterministic
        for ce in sorted(newces):
            response = ldapCluster(ce)
            if not response['OK']:
                self.log.warn("Error during BDII request", response['Message'])
                response = self.__checkAlternativeBDIISite(ldapCluster, ce)
                if not response['OK']:
                    continue
            clusters = response['Value']
            if len(clusters) != 1:
                self.log.warn("Error in cluster length", " CE %s Length %d" % (ce, len(clusters)))
            if len(clusters) == 0:
                continue
            cluster = clusters[0]
            fkey = cluster.get('GlueForeignKey', [])
            if isinstance(fkey, str):
                fkey = [fkey]
            nameBDII = None
            for entry in fkey:
                if entry.count('GlueSiteUniqueID'):
                    nameBDII = entry.split('=')[1]
                    break
            if not nameBDII:
                continue

            cestring = "CE: %s, GOCDB Name: %s" % (ce, nameBDII)
            self.log.info(cestring)

            response = ldapCE(ce)
            if not response['OK']:
                self.log.warn("Error during BDII request", response['Message'])
                response = self.__checkAlternativeBDIISite(ldapCE, ce)
                if not response['OK']:
                    continue
            ceinfos = response['Value']
            ceinfo = ceinfos[0] if len(ceinfos) else {}
            systemName = ceinfo.get('GlueHostOperatingSystemName', 'Unknown')
            systemVersion = ceinfo.get('GlueHostOperatingSystemVersion', 'Unknown')
            systemRelease = ceinfo.get('GlueHostOperatingSystemRelease', 'Unknown')

            osstring = "SystemName: %s, SystemVersion: %s, SystemRelease: %s" % (
                systemName, systemVersion, systemRelease)
            self.log.info(osstring)

            response = ldapCEState(ce, vo=self.voName)
            if not response['OK']:
                self.log.warn("Error during BDII request", response['Message'])
                response = self.__checkAlternativeBDIISite(ldapCEState, ce, self.voName)
                if not response['OK']:
                    continue

            newcestring = "\n\n%s\n%s" % (cestring, osstring)
            usefull = False
            for cestate in response['Value']:
                queuename = cestate.get('GlueCEUniqueID', 'UnknownName')
                queuestatus = cestate.get('GlueCEStateStatus', 'UnknownStatus')
                queuestring = "%s %s" % (queuename, queuestatus)
                self.log.info(queuestring)
                newcestring += "\n%s" % queuestring
                if queuestatus.count('Production'):
                    usefull = True
            if usefull:
                body += newcestring
                possibleNewSites.append('dirac-admin-add-site DIRACSiteName %s %s' % (nameBDII, ce))

        if body:
            body = "We are glad to inform You about new CE(s) possibly suitable for %s:\n" % self.voName + body
            body += "\n\nTo suppress information about CE add its name to BannedCEs list."
            for possibleNewSite in possibleNewSites:
                body = "%s\n%s" % (body, possibleNewSite)
            self.log.info(body)
            self.__sendMail(body)

        return S_OK()

    def __infoFromCE(self):
        """Update site, CE and queue options in the CS from the BDII.

        All changes are staged in the CSAPI object and committed once at the
        end.  OS changes are also collected in a message that is logged and
        mailed if mail addresses are configured.

        :return: a failed result when the CS sections cannot be read, S_OK()
                 when nothing changed, otherwise the result of ``CSAPI.commit()``
        """
        sitesSection = cfgPath('Resources', 'Sites')
        result = gConfig.getSections(sitesSection)
        if not result['OK']:
            return result
        grids = result['Value']

        changed = False
        body = ""
        for grid in grids:
            gridSection = cfgPath(sitesSection, grid)
            result = gConfig.getSections(gridSection)
            if not result['OK']:
                return result
            for site in result['Value']:
                siteSection = cfgPath(gridSection, site)
                result = gConfig.getOptionsDict(siteSection)
                if not result['OK']:
                    self.log.warn("Could not get the options of site %s" % site, result['Message'])
                    continue
                opt = result['Value']

                if self.__updateSiteInfo(siteSection, opt):
                    changed = True

                celist = List.fromChar(opt.get('CE', ''))
                if not celist:
                    self.log.warn(site, 'Empty site list')
                    continue
                for ce in celist:
                    ceChanged, ceBody = self.__updateCEInfo(site, cfgPath(siteSection, 'CEs', ce), ce)
                    changed = changed or ceChanged
                    body += ceBody

        if not changed:
            self.log.info("No changes found")
            return S_OK()

        self.log.info(body)
        if body:
            self.__sendMail(body)
        return self.csAPI.commit()

    def __updateSiteInfo(self, siteSection, opt):
        """Stage the coordinates and mail of one site as published in the BDII.

        :param siteSection: CS path of the site section
        :param opt: options dictionary of the site section
        :return: True if a change was staged
        """
        name = opt.get('Name', '')
        if not name:
            return False
        coor = opt.get('Coordinates', 'Unknown')
        mail = opt.get('Mail', 'Unknown')

        result = ldapSite(name)
        if not result['OK']:
            self.log.warn("BDII site %s: %s" % (name, result['Message']))
            result = self.__checkAlternativeBDIISite(ldapSite, name)
        if not result['OK']:
            return False

        bdiisites = result['Value']
        if len(bdiisites) == 0:
            self.log.warn(name, "Error in bdii: leng = 0")
            return False
        if len(bdiisites) != 1:
            self.log.warn(name, "Warning in bdii: leng = %d" % len(bdiisites))
        bdiisite = bdiisites[0]

        try:
            newcoor = "%s:%s" % (bdiisite['GlueSiteLongitude'], bdiisite['GlueSiteLatitude'])
        except Exception:
            self.log.warn("Error in bdii coor")
            newcoor = "Unknown"
        try:
            newmail = bdiisite['GlueSiteSysAdminContact'].split(":")[-1].strip()
        except Exception:
            self.log.warn("Error in bdii mail")
            newmail = "Unknown"
        self.log.debug("%s %s %s" % (name, newcoor, newmail))

        changed = False
        for option, oldValue, newValue in (('Coordinates', coor, newcoor),
                                           ('Mail', mail, newmail)):
            if newValue != oldValue:
                self.log.info("%s" % (name), "%s -> %s" % (oldValue, newValue))
                self.__storeOption(cfgPath(siteSection, option), oldValue, newValue)
                changed = True
        return changed

    def __updateCEInfo(self, site, ceSection, ce):
        """Stage the options of one CE, and of its queues, as published in the BDII.

        :param site: DIRAC site name, only used in the report
        :param ceSection: CS path of the CE section
        :param ce: CE name
        :return: tuple (True if a change was staged, report text)
        """
        changed = False
        body = ""

        result = gConfig.getOptionsDict(ceSection)
        if result['OK']:
            ceopt = result['Value']
        else:
            self.log.debug("Section CE", result['Message'])
            ceopt = {}
        wnTmpDir = ceopt.get('wnTmpDir', 'Unknown')
        arch = ceopt.get('architecture', 'Unknown')
        osName = ceopt.get('OS', 'Unknown')
        si00 = ceopt.get('SI00', 'Unknown')
        pilot = ceopt.get('Pilot', 'Unknown')
        cetype = ceopt.get('CEType', 'Unknown')

        result = ldapCE(ce)
        if not result['OK']:
            self.log.warn('Error in bdii for %s' % ce, result['Message'])
            result = self.__checkAlternativeBDIISite(ldapCE, ce)
            if not result['OK']:
                return changed, body
        try:
            bdiice = result['Value'][0]
        except Exception:
            self.log.warn('Error in bdii for %s' % ce, result)
            bdiice = None

        if bdiice:
            try:
                newos = '_'.join((bdiice['GlueHostOperatingSystemName'],
                                  bdiice['GlueHostOperatingSystemVersion'],
                                  bdiice['GlueHostOperatingSystemRelease']))
            except Exception:
                newos = 'Unknown'
            try:
                rte = bdiice['GlueHostApplicationSoftwareRunTimeEnvironment']
                if self.voName.lower() == 'lhcb':
                    newpilot = 'True' if 'VO-lhcb-pilot' in rte else 'False'
                else:
                    newpilot = 'Unknown'
            except Exception:
                newpilot = 'Unknown'

            ceUpdates = (
                ('wnTmpDir', wnTmpDir, bdiice.get('GlueSubClusterWNTmpDir', 'Unknown')),
                ('architecture', arch, bdiice.get('GlueHostArchitecturePlatformType', 'Unknown')),
                ('OS', osName, newos),
                ('SI00', si00, bdiice.get('GlueHostBenchmarkSI00', 'Unknown')),
                ('Pilot', pilot, newpilot),
            )
            for option, oldValue, newValue in ceUpdates:
                # Values missing in the BDII never overwrite the CS
                if oldValue == newValue or newValue == 'Unknown':
                    continue
                section = cfgPath(ceSection, option)
                self.log.info(section, " -> ".join((oldValue, newValue)))
                self.__storeOption(section, oldValue, newValue)
                changed = True
                if option == 'OS':
                    body += "OS was changed %s -> %s for %s at %s\n" % (oldValue, newValue, ce, site)

        result = ldapCEState(ce, vo=self.voName)  # getBDIICEVOView
        if not result['OK']:
            self.log.warn('Error in bdii for queue %s' % ce, result['Message'])
            result = self.__checkAlternativeBDIISite(ldapCEState, ce, self.voName)
            if not result['OK']:
                return changed, body
        try:
            queues = result['Value']
        except Exception:
            self.log.warn('Error in bdii for queue %s' % ce, result.get('Message', 'Unknown error'))
            return changed, body

        # The CE type is the implementation name published by the queues
        newcetype = 'Unknown'
        for queue in queues:
            queuetype = queue.get('GlueCEImplementationName', 'Unknown')
            if newcetype == 'Unknown':
                newcetype = queuetype
            elif queuetype != newcetype:
                self.log.warn('Error in bdii for ce %s ' % ce,
                              'different cetypes %s %s' % (newcetype, queuetype))
        if newcetype == 'ARC-CE':
            newcetype = 'ARC'
        if cetype != newcetype and newcetype != 'Unknown':
            section = cfgPath(ceSection, 'CEType')
            self.log.info(section, " -> ".join((cetype, newcetype)))
            self.__storeOption(section, cetype, newcetype)
            changed = True

        for queue in queues:
            if self.__updateQueueInfo(ceSection, queue):
                changed = True
        return changed, body

    def __updateQueueInfo(self, ceSection, queue):
        """Stage maxCPUTime and SI00 of one queue as published in the BDII.

        :param ceSection: CS path of the CE section the queue belongs to
        :param queue: BDII description of the queue
        :return: True if a change was staged
        """
        try:
            queueName = queue['GlueCEUniqueID'].split('/')[-1]
        except Exception:
            self.log.warn('error in queuename ', queue)
            return False

        newmaxCPUTime = queue.get('GlueCEPolicyMaxCPUTime')
        newsi00 = None
        try:
            caps = queue['GlueCECapability']
            if isinstance(caps, str):
                caps = [caps]
            for cap in caps:
                if cap.count('CPUScalingReferenceSI00'):
                    newsi00 = cap.split('=')[-1]
        except Exception:
            newsi00 = None

        queueSection = cfgPath(ceSection, 'Queues', queueName)
        result = gConfig.getOptionsDict(queueSection)
        if result['OK']:
            queueopt = result['Value']
        else:
            self.log.warn("Section Queues", result['Message'])
            queueopt = {}
        maxCPUTime = queueopt.get('maxCPUTime', 'Unknown')
        si00 = queueopt.get('SI00', 'Unknown')

        changed = False
        for option, oldValue, newValue in (('maxCPUTime', maxCPUTime, newmaxCPUTime),
                                           ('SI00', si00, newsi00)):
            if newValue and oldValue != newValue:
                section = cfgPath(queueSection, option)
                self.log.info(section, " -> ".join((oldValue, newValue)))
                self.__storeOption(section, oldValue, newValue)
                changed = True
        return changed
class CE2CSAgent(AgentModule):
    """Agent that compares site, CE and queue settings in the CS with the BDII.

    !!!Out-dated!!! Moved to Bdii2CSAgent

    On each cycle the agent reports CEs published in the BDII that are not
    listed in the CS, and stages/commits CS updates for the sites, CEs and
    queues that are already configured.
    """

    addressTo = ''
    addressFrom = ''
    voName = ''
    subject = "CE2CSAgent"
    alternativeBDIIs = []
    csAPI = None

    def initialize(self):
        """Read the agent options and create the CSAPI client.

        :return: S_ERROR when no VO can be determined, otherwise the result
                 of ``CSAPI.initialize()``
        """
        # TODO: Have no default and if no mail is found then use the diracAdmin group
        # and resolve all associated mail addresses.
        self.addressTo = self.am_getOption('MailTo', self.addressTo)
        self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)
        # Create a list of alternative bdii urls
        self.alternativeBDIIs = self.am_getOption('AlternativeBDIIs', [])
        # Check if the bdii url is appended by a port number, if not append the default 2170
        for index, url in enumerate(self.alternativeBDIIs):
            if not url.split(':')[-1].isdigit():
                self.alternativeBDIIs[index] += ':2170'
        if self.addressTo and self.addressFrom:
            self.log.info("MailTo", self.addressTo)
            self.log.info("MailFrom", self.addressFrom)
        if self.alternativeBDIIs:
            self.log.info("AlternativeBDII URLs:", self.alternativeBDIIs)
        self.subject = "CE2CSAgent"

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/TestManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'TestManager')

        self.voName = self.am_getOption('VirtualOrganization', [])
        if not self.voName:
            vo = getVO()
            if vo:
                self.voName = [vo]

        if self.voName:
            self.log.info("Agent will manage VO(s) %s" % self.voName)
        else:
            self.log.fatal("VirtualOrganization option not defined for agent")
            return S_ERROR()

        self.csAPI = CSAPI()
        return self.csAPI.initialize()

    def execute(self):
        """Run one agent cycle: look for new CEs, then refresh known ones.

        :return: the failed ``getProxyInfo()`` result, otherwise S_OK
        """
        self.log.info("Start Execution")
        result = getProxyInfo()
        if not result['OK']:
            return result
        infoDict = result['Value']
        self.log.info(formatProxyInfoAsString(infoDict))

        # Get a "fresh" copy of the CS data
        result = self.csAPI.downloadCSData()
        if not result['OK']:
            self.log.warn("Could not download a fresh copy of the CS data",
                          result['Message'])

        self.__lookForCE()
        self.__infoFromCE()
        self.log.info("End Execution")
        return S_OK()

    def __checkAlternativeBDIISite(self, fun, *args):
        """Retry a BDII query against each configured alternative BDII.

        :param fun: query function accepting a ``host`` keyword
        :param args: one positional argument, or two (the second is passed
                     to ``fun`` as the ``vo`` keyword)
        :return: the first successful result, else the last failure; always
                 an S_OK/S_ERROR structure so callers can test ``['OK']``
        """
        if not self.alternativeBDIIs:
            return S_ERROR("No alternative BDII sites configured")
        if len(args) not in (1, 2):
            return S_ERROR("Unsupported number of BDII query arguments: %d" % len(args))

        self.log.warn("Trying to use alternative BDII sites")
        result = S_ERROR("No alternative BDII site could be contacted")
        for site in self.alternativeBDIIs:
            self.log.info("Trying to contact alternative BDII", site)
            if len(args) == 1:
                result = fun(args[0], host=site)
            else:
                result = fun(args[0], vo=args[1], host=site)
            if result['OK']:
                return result
            self.log.error("Problem contacting alternative BDII", result['Message'])
        self.log.warn("Also checking alternative BDII sites failed")
        return result

    def __notify(self, body):
        """Mail ``body`` to the configured recipient, if addresses are set."""
        if not (self.addressTo and self.addressFrom):
            return
        notification = NotificationClient()
        result = notification.sendMail(self.addressTo, self.subject, body,
                                       self.addressFrom, localAttempt=False)
        if not result['OK']:
            self.log.error('Can not send notification mail', result['Message'])

    def __stageValue(self, section, oldValue, newValue):
        """Stage ``newValue`` in the CS: create it if unknown, else modify it."""
        if oldValue == 'Unknown':
            self.csAPI.setOption(section, newValue)
        else:
            self.csAPI.modifyValue(section, newValue)

    def __writeOption(self, section, oldValue, newValue):
        """Log the ``old -> new`` transition for ``section`` and stage it."""
        self.log.info(section, " -> ".join((oldValue, newValue)))
        self.__stageValue(section, oldValue, newValue)

    def __syncCEOption(self, ceSection, option, oldValue, newValue):
        """Stage a CE option when the BDII value is known and differs.

        :return: True when a change was staged
        """
        if newValue == 'Unknown' or oldValue == newValue:
            return False
        self.__writeOption(cfgPath(ceSection, option), oldValue, newValue)
        return True

    @staticmethod
    def __bdiiAttribute(info, key):
        """Return ``info[key]``, or 'Unknown' when it cannot be read."""
        try:
            return info[key]
        except Exception:
            return 'Unknown'

    def __lookForCE(self):
        """Report CEs published in the BDII that are not listed in the CS.

        :return: S_OK, or the failed CS/BDII result that stopped the scan
        """
        # A set gives cheap membership tests for the (possibly long) CE list
        knownCEs = set(self.am_getOption('BannedCEs', []))

        result = gConfig.getSections('/Resources/Sites')
        if not result['OK']:
            return result
        grids = result['Value']

        for grid in grids:
            result = gConfig.getSections('/Resources/Sites/%s' % grid)
            if not result['OK']:
                return result
            for site in result['Value']:
                result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (grid, site))
                if not result['OK']:
                    self.log.warn(site, result['Message'])
                    continue
                knownCEs.update(List.fromChar(result['Value'].get('CE', '')))

        for vo in self.voName:
            self.log.info("Check for available CEs for VO", vo)
            response = ldapCEState('', vo)
            if not response['OK']:
                self.log.error("Error during BDII request", response['Message'])
                response = self.__checkAlternativeBDIISite(ldapCEState, '', vo)
                if not response['OK']:
                    return response

            newCEs = {}
            for queue in response['Value']:
                try:
                    queueName = queue['GlueCEUniqueID']
                except Exception:
                    continue
                ceName = queueName.split(":")[0]
                if ceName not in knownCEs:
                    newCEs[ceName] = None
                    self.log.debug("New CE", ceName)

            body = ""
            possibleNewSites = []
            for ce in newCEs:
                description = self.__describeNewCE(ce, vo)
                if description is None:
                    continue
                newCEString, command = description
                body += newCEString
                possibleNewSites.append(command)

            if body:
                body = "We are glad to inform You about new CE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about CE add its name to BannedCEs list."
                for possibleNewSite in possibleNewSites:
                    body = "%s\n%s" % (body, possibleNewSite)
                self.log.info(body)
                self.__notify(body)

        return S_OK()

    def __describeNewCE(self, ce, vo):
        """Collect the BDII description of a CE that is not yet in the CS.

        :return: ``(text, add-site command)`` when at least one queue of the
                 CE reports a status containing 'Production', else None
        """
        response = ldapCluster(ce)
        if not response['OK']:
            self.log.warn("Error during BDII request", response['Message'])
            response = self.__checkAlternativeBDIISite(ldapCluster, ce)
            if not response['OK']:
                return None
        clusters = response['Value']
        if len(clusters) != 1:
            self.log.warn("Error in cluster length",
                          " CE %s Length %d" % (ce, len(clusters)))
        if len(clusters) == 0:
            return None
        cluster = clusters[0]
        fkey = cluster.get('GlueForeignKey', [])
        if isinstance(fkey, str):
            fkey = [fkey]
        nameBDII = None
        for entry in fkey:
            if entry.count('GlueSiteUniqueID'):
                nameBDII = entry.split('=')[1]
                break
        if not nameBDII:
            return None

        ceString = "CE: %s, GOCDB Name: %s" % (ce, nameBDII)
        self.log.info(ceString)

        response = ldapCE(ce)
        if not response['OK']:
            self.log.warn("Error during BDII request", response['Message'])
            response = self.__checkAlternativeBDIISite(ldapCE, ce)
            if not response['OK']:
                return None

        ceInfos = response['Value']
        ceInfo = ceInfos[0] if len(ceInfos) else {}
        systemName = ceInfo.get('GlueHostOperatingSystemName', 'Unknown')
        systemVersion = ceInfo.get('GlueHostOperatingSystemVersion', 'Unknown')
        systemRelease = ceInfo.get('GlueHostOperatingSystemRelease', 'Unknown')

        osString = "SystemName: %s, SystemVersion: %s, SystemRelease: %s" % (
            systemName, systemVersion, systemRelease)
        self.log.info(osString)

        response = ldapCEState(ce, vo)
        if not response['OK']:
            self.log.warn("Error during BDII request", response['Message'])
            response = self.__checkAlternativeBDIISite(ldapCEState, ce, vo)
            if not response['OK']:
                return None

        newCEString = "\n\n%s\n%s" % (ceString, osString)
        usefull = False
        for ceState in response['Value']:
            queueName = ceState.get('GlueCEUniqueID', 'UnknownName')
            queueStatus = ceState.get('GlueCEStateStatus', 'UnknownStatus')

            queueString = "%s %s" % (queueName, queueStatus)
            self.log.info(queueString)
            newCEString += "\n%s" % queueString
            if queueStatus.count('Production'):
                usefull = True
        if not usefull:
            return None
        return newCEString, 'dirac-admin-add-site DIRACSiteName %s %s' % (nameBDII, ce)

    def __infoFromCE(self):
        """Stage and commit CS updates for the configured sites, CEs and queues.

        :return: the ``CSAPI.commit()`` result when something changed, S_OK
                 when nothing changed, or the failed CS result that stopped
                 the scan
        """
        sitesSection = cfgPath('Resources', 'Sites')
        result = gConfig.getSections(sitesSection)
        if not result['OK']:
            return result
        grids = result['Value']

        changed = False
        body = ""

        for grid in grids:
            gridSection = cfgPath(sitesSection, grid)
            result = gConfig.getSections(gridSection)
            if not result['OK']:
                return result

            for site in result['Value']:
                siteSection = cfgPath(gridSection, site)
                result = gConfig.getOptionsDict(siteSection)
                if not result['OK']:
                    self.log.warn(site, result['Message'])
                    continue
                opt = result['Value']

                name = opt.get('Name', '')
                if name and self.__updateSiteInfo(name, siteSection, opt):
                    changed = True

                ceList = List.fromChar(opt.get('CE', ''))
                if not ceList:
                    self.log.warn(site, 'Empty site list')
                    continue

                for ce in ceList:
                    ceChanged, ceBody = self.__updateCEInfo(site, ce, siteSection)
                    changed = changed or ceChanged
                    body += ceBody

        if changed:
            self.log.info(body)
            if body:
                self.__notify(body)
            return self.csAPI.commit()

        self.log.info("No changes found")
        return S_OK()

    def __updateSiteInfo(self, name, siteSection, opt):
        """Stage coordinate and mail updates for one site.

        :param name: value of the site's 'Name' option, used for the BDII query
        :param siteSection: CS path of the site
        :param opt: options dictionary of the site section
        :return: True when at least one change was staged
        """
        coor = opt.get('Coordinates', 'Unknown')
        mail = opt.get('Mail', 'Unknown')

        result = ldapSite(name)
        if not result['OK']:
            self.log.warn("BDII site %s: %s" % (name, result['Message']))
            result = self.__checkAlternativeBDIISite(ldapSite, name)
        if not result['OK']:
            return False

        bdiiSites = result['Value']
        if len(bdiiSites) == 0:
            self.log.warn(name, "Error in BDII: leng = 0")
            return False
        if len(bdiiSites) != 1:
            self.log.warn(name, "Warning in BDII: leng = %d" % len(bdiiSites))
        bdiiSite = bdiiSites[0]

        try:
            longitude = bdiiSite['GlueSiteLongitude']
            latitude = bdiiSite['GlueSiteLatitude']
            newcoor = "%s:%s" % (longitude, latitude)
        except Exception:
            self.log.warn("Error in BDII coordinates")
            newcoor = "Unknown"

        try:
            newmail = bdiiSite['GlueSiteSysAdminContact'].split(":")[-1].strip()
        except Exception:
            self.log.warn("Error in BDII mail")
            newmail = "Unknown"

        self.log.debug("%s %s %s" % (name, newcoor, newmail))

        changed = False
        # Never replace a configured value by the 'Unknown' placeholder that
        # only signals missing BDII data (same rule as for the CE options).
        if newcoor != coor and newcoor != 'Unknown':
            self.log.info("%s" % (name), "%s -> %s" % (coor, newcoor))
            self.__stageValue(cfgPath(siteSection, 'Coordinates'), coor, newcoor)
            changed = True

        if newmail != mail and newmail != 'Unknown':
            self.log.info("%s" % (name), "%s -> %s" % (mail, newmail))
            self.__stageValue(cfgPath(siteSection, 'Mail'), mail, newmail)
            changed = True

        return changed

    def __updateCEInfo(self, site, ce, siteSection):
        """Stage updates for one CE and for its queues.

        :return: ``(changed, body)`` where ``body`` is the text to append to
                 the notification mail
        """
        changed = False
        body = ""

        ceSection = cfgPath(siteSection, 'CEs', ce)
        result = gConfig.getOptionsDict(ceSection)
        if not result['OK']:
            self.log.debug("Section CE", result['Message'])
            ceopt = {}
        else:
            ceopt = result['Value']
        wnTmpDir = ceopt.get('wnTmpDir', 'Unknown')
        arch = ceopt.get('architecture', 'Unknown')
        osName = ceopt.get('OS', 'Unknown')
        si00 = ceopt.get('SI00', 'Unknown')
        pilot = ceopt.get('Pilot', 'Unknown')
        ceType = ceopt.get('CEType', 'Unknown')

        result = ldapCE(ce)
        if not result['OK']:
            self.log.warn('Error in BDII for %s' % ce, result['Message'])
            result = self.__checkAlternativeBDIISite(ldapCE, ce)
            if not result['OK']:
                return changed, body
        try:
            bdiiCE = result['Value'][0]
        except Exception:
            self.log.warn('Error in BDII for %s' % ce, result)
            bdiiCE = None

        if bdiiCE:
            newWNTmpDir = self.__bdiiAttribute(bdiiCE, 'GlueSubClusterWNTmpDir')
            if self.__syncCEOption(ceSection, 'wnTmpDir', wnTmpDir, newWNTmpDir):
                changed = True

            newArch = self.__bdiiAttribute(bdiiCE, 'GlueHostArchitecturePlatformType')
            if self.__syncCEOption(ceSection, 'architecture', arch, newArch):
                changed = True

            try:
                newOS = '_'.join((bdiiCE['GlueHostOperatingSystemName'],
                                  bdiiCE['GlueHostOperatingSystemVersion'],
                                  bdiiCE['GlueHostOperatingSystemRelease']))
            except Exception:
                newOS = 'Unknown'
            if self.__syncCEOption(ceSection, 'OS', osName, newOS):
                changed = True
                body += "OS was changed %s -> %s for %s at %s\n" % (osName, newOS, ce, site)

            newSI00 = self.__bdiiAttribute(bdiiCE, 'GlueHostBenchmarkSI00')
            if self.__syncCEOption(ceSection, 'SI00', si00, newSI00):
                changed = True

            newPilot = 'Unknown'
            try:
                rte = bdiiCE['GlueHostApplicationSoftwareRunTimeEnvironment']
                # NOTE(review): the last VO in the list decides the outcome
                for vo in self.voName:
                    if vo.lower() == 'lhcb':
                        if 'VO-lhcb-pilot' in rte:
                            newPilot = 'True'
                        else:
                            newPilot = 'False'
                    else:
                        newPilot = 'Unknown'
            except Exception:
                newPilot = 'Unknown'
            if self.__syncCEOption(ceSection, 'Pilot', pilot, newPilot):
                changed = True

        if self.__updateQueueInfo(ce, ceSection, ceType):
            changed = True
        return changed, body

    def __updateQueueInfo(self, ce, ceSection, ceType):
        """Stage CE type, queue and per-queue VO updates for one CE.

        :param ceType: the CEType currently stored in the CS for this CE
        :return: True when at least one change was staged
        """
        changed = False
        # newVO deliberately survives across queues and VOs: it carries VO
        # list changes that are staged but not yet visible in the CS.
        newVO = ''
        for vo in self.voName:
            result = ldapCEState(ce, vo)  # getBDIICEVOView
            if not result['OK']:
                self.log.warn('Error in BDII for queue %s' % ce, result['Message'])
                result = self.__checkAlternativeBDIISite(ldapCEState, ce, vo)
                if not result['OK']:
                    continue
            queues = result.get('Value')
            if queues is None:
                self.log.warn('Error in BDII for queue %s' % ce,
                              result.get('Message', 'No value returned'))
                continue

            newCEType = 'Unknown'
            for queue in queues:
                queueType = self.__bdiiAttribute(queue, 'GlueCEImplementationName')
                if newCEType == 'Unknown':
                    newCEType = queueType
                elif queueType != newCEType:
                    self.log.warn('Error in BDII for CE %s ' % ce,
                                  'different CE types %s %s' % (newCEType, queueType))

            if newCEType == 'ARC-CE':
                newCEType = 'ARC'

            if self.__syncCEOption(ceSection, 'CEType', ceType, newCEType):
                changed = True

            for queue in queues:
                try:
                    queueName = queue['GlueCEUniqueID'].split('/')[-1]
                except Exception:
                    self.log.warn('Error in queueName ', queue)
                    continue

                try:
                    newMaxCPUTime = queue['GlueCEPolicyMaxCPUTime']
                except Exception:
                    newMaxCPUTime = None

                newSI00 = None
                try:
                    caps = queue['GlueCECapability']
                    if isinstance(caps, str):
                        caps = [caps]
                    for cap in caps:
                        if cap.count('CPUScalingReferenceSI00'):
                            newSI00 = cap.split('=')[-1]
                except Exception:
                    newSI00 = None

                queueSection = cfgPath(ceSection, 'Queues', queueName)
                result = gConfig.getOptionsDict(queueSection)
                if not result['OK']:
                    self.log.warn("Section Queues", result['Message'])
                    maxCPUTime = 'Unknown'
                    si00 = 'Unknown'
                    allowedVOs = ['']
                else:
                    queueOpt = result['Value']
                    maxCPUTime = queueOpt.get('maxCPUTime', 'Unknown')
                    si00 = queueOpt.get('SI00', 'Unknown')
                    if newVO == '':
                        # Remember previous iteration, if none - read from conf
                        allowedVOs = queueOpt.get('VO', '').split(",")
                    else:
                        # Else use newVO, as it can contain changes, which aren't in conf yet
                        allowedVOs = newVO.split(",")

                if newMaxCPUTime and (maxCPUTime != newMaxCPUTime):
                    self.__writeOption(cfgPath(queueSection, 'maxCPUTime'),
                                       maxCPUTime, newMaxCPUTime)
                    changed = True

                if newSI00 and (si00 != newSI00):
                    self.__writeOption(cfgPath(queueSection, 'SI00'), si00, newSI00)
                    changed = True

                modifyVO = True  # Flag saying if we need VO option to change
                newVO = ''
                if allowedVOs != ['']:
                    for allowedVO in allowedVOs:
                        allowedVO = allowedVO.strip()  # Get rid of spaces
                        newVO += allowedVO
                        if allowedVO == vo:  # Current VO has been already in list
                            newVO = ''
                            modifyVO = False  # Don't change anything
                            break  # Skip next 'if', proceed to next VO
                        newVO += ', '

                if modifyVO:
                    section = cfgPath(queueSection, 'VO')
                    newVO += vo
                    self.log.info(section, " -> ".join(('%s' % allowedVOs, newVO)))
                    if allowedVOs == ['']:
                        self.csAPI.setOption(section, newVO)
                    else:
                        self.csAPI.modifyValue(section, newVO)
                    changed = True

        return changed
class Bdii2CSAgent(AgentModule):
    """Agent that compares CE and SE settings in the CS with the BDII.

    Depending on the ProcessCEs / ProcessSEs options it reports resources
    found in the BDII but not in the CS, and stages (and, unless DryRun is
    set, commits) CS updates for the resources that are already configured.
    """

    def __init__(self, *args, **kwargs):
        """ Defines default parameters
        """
        super(Bdii2CSAgent, self).__init__(*args, **kwargs)

        self.addressTo = ''
        self.addressFrom = ''
        self.voName = []
        self.subject = "Bdii2CSAgent"
        self.alternativeBDIIs = []
        # Per-cycle caches of the BDII information, keyed by VO
        self.voBdiiCEDict = {}
        self.voBdiiSEDict = {}
        self.host = 'lcg-bdii.cern.ch:2170'
        self.glue2URLs = []
        self.glue2Only = False

        self.csAPI = None

        # What to get
        self.processCEs = True
        self.processSEs = False

        self.selectedSites = []

        # Update the CS or not?
        self.dryRun = False

    def initialize(self):
        """ Gets run paramaters from the configuration

        :return: S_ERROR when no VO can be determined, otherwise the result
                 of ``CSAPI.initialize()``
        """
        self.addressTo = self.am_getOption('MailTo', self.addressTo)
        self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)
        # Create a list of alternative bdii urls
        self.alternativeBDIIs = self.am_getOption('AlternativeBDIIs', self.alternativeBDIIs)
        self.host = self.am_getOption('Host', self.host)
        self.glue2URLs = self.am_getOption('GLUE2URLs', self.glue2URLs)
        self.glue2Only = self.am_getOption('GLUE2Only', self.glue2Only)

        # Check if the bdii url is appended by a port number, if not append the default 2170
        for index, url in enumerate(self.alternativeBDIIs):
            if not url.split(':')[-1].isdigit():
                self.alternativeBDIIs[index] += ':2170'
        if self.addressTo and self.addressFrom:
            self.log.info("MailTo", self.addressTo)
            self.log.info("MailFrom", self.addressFrom)
        if self.alternativeBDIIs:
            self.log.info("AlternativeBDII URLs:", self.alternativeBDIIs)

        self.processCEs = self.am_getOption('ProcessCEs', self.processCEs)
        self.processSEs = self.am_getOption('ProcessSEs', self.processSEs)
        self.selectedSites = self.am_getOption('SelectedSites', [])
        self.dryRun = self.am_getOption('DryRun', self.dryRun)

        self.voName = self.am_getOption('VirtualOrganization', self.voName)
        if not self.voName:
            self.voName = self.am_getOption('VO', [])
        if not self.voName or (len(self.voName) == 1 and self.voName[0].lower() == 'all'):
            # Get all VOs defined in the configuration
            self.voName = []
            result = getVOs()
            if result['OK']:
                for vo in result['Value']:
                    vomsVO = getVOOption(vo, "VOMSName")
                    if vomsVO:
                        self.voName.append(vomsVO)

        if self.voName:
            self.log.info("Agent will manage VO(s) %s" % self.voName)
        else:
            self.log.fatal("VirtualOrganization option not defined for agent")
            return S_ERROR()

        self.csAPI = CSAPI()
        return self.csAPI.initialize()

    def execute(self):
        """ General agent execution method

        :return: S_OK; failures of the individual steps are only logged
        """
        # Drop the BDII information cached during the previous cycle
        self.voBdiiCEDict = {}
        self.voBdiiSEDict = {}

        # Get a "fresh" copy of the CS data
        result = self.csAPI.downloadCSData()
        if not result['OK']:
            self.log.warn("Could not download a fresh copy of the CS data", result['Message'])

        # Refresh the configuration from the master server
        gConfig.forceRefresh(fromMaster=True)

        if self.processCEs:
            self.__lookForNewCEs()
            self.__updateCEs()
        if self.processSEs:
            self.__lookForNewSEs()
            self.__updateSEs()
        return S_OK()

    def __lookForNewCEs(self):
        """ Look up BDII for CEs not yet present in the DIRAC CS

        :return: S_OK, or the failed ``getCEsFromCS()`` result
        """
        bannedCEs = self.am_getOption('BannedCEs', [])
        result = getCEsFromCS()
        if not result['OK']:
            return result
        knownCEs = set(result['Value']).union(bannedCEs)

        for vo in self.voName:
            result = self.__getBdiiCEInfo(vo)
            if not result['OK']:
                continue
            bdiiInfo = result['Value']
            result = getGridCEs(vo, bdiiInfo=bdiiInfo, ceBlackList=knownCEs)
            if not result['OK']:
                self.log.error('Failed to get unused CEs', result['Message'])
                # A failed result carries no 'Value': move on to the next VO
                continue
            siteDict = result['Value']

            body = ''
            for site in siteDict:
                # Sorted so that the report is stable from one cycle to the next
                for ce in sorted(siteDict[site]):
                    ceInfo = bdiiInfo[site]['CEs'][ce]
                    newCEString = "CE: %s, GOCDB Site Name: %s" % (ce, site)
                    systemTuple = siteDict[site][ce]['System']
                    osString = "%s_%s_%s" % (systemTuple)
                    newCEString = "\n%s\n%s\n" % (newCEString, osString)

                    queueString = ''
                    for queue in ceInfo['Queues']:
                        queueInfo = ceInfo['Queues'][queue]
                        queueStatus = queueInfo.get('GlueCEStateStatus', 'UnknownStatus')
                        if 'production' in queueStatus.lower():
                            ceType = queueInfo.get('GlueCEImplementationName', '')
                            queueString += " %s %s %s\n" % (queue, queueStatus, ceType)
                    # Only CEs with at least one production queue are reported
                    if queueString:
                        body += newCEString
                        body += "Queues:\n"
                        body += queueString

            if body:
                body = "\nWe are glad to inform You about new CE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about CE add its name to BannedCEs list.\n"
                body += "Add new Sites/CEs for vo %s with the command:\n" % vo
                body += "dirac-admin-add-resources --vo %s --ce\n" % vo
                self.log.info(body)
                if self.addressTo and self.addressFrom:
                    notification = NotificationClient()
                    result = notification.sendMail(self.addressTo, self.subject, body,
                                                   self.addressFrom, localAttempt=False,
                                                   avoidSpam=True)
                    if not result['OK']:
                        self.log.error('Can not send new site notification mail',
                                       result['Message'])

        return S_OK()

    def __getBdiiCEInfo(self, vo):
        """Return the CE information published for ``vo``, cached per VO.

        Results of the alternative BDIIs, the GLUE2 URLs and the default host
        are merged into one dictionary; sources merged later overwrite the
        earlier ones, so the default host has the last word.

        :return: S_OK(dict), or S_ERROR when nothing was retrieved and at
                 least one query failed
        """
        if vo in self.voBdiiCEDict:
            return S_OK(self.voBdiiCEDict[vo])
        self.log.info("Check for available CEs for VO", vo)
        totalResult = S_OK({})
        message = ''

        mainResult = getBdiiCEInfo(vo, host=self.host, glue2=self.glue2Only)
        if not mainResult['OK']:
            self.log.error("Failed getting information from default bdii", mainResult['Message'])
            message = mainResult['Message']

        # Reversed so that the first configured alternative is merged last
        for bdii in reversed(self.alternativeBDIIs):
            resultAlt = getBdiiCEInfo(vo, host=bdii, glue2=self.glue2Only)
            if resultAlt['OK']:
                totalResult['Value'].update(resultAlt['Value'])
            else:
                self.log.error("Failed getting information from %s " % bdii, resultAlt['Message'])
                message = (message + "\n" + resultAlt['Message']).strip()

        # In GLUE2-only mode all hosts above were already queried with glue2 set
        if not self.glue2Only:
            for glue2URL in self.glue2URLs:
                resultGlue2 = getBdiiCEInfo(vo, host=glue2URL, glue2=True)
                if resultGlue2['OK']:
                    totalResult['Value'].update(resultGlue2['Value'])
                else:
                    self.log.error("Failed getting GLUE2 information for",
                                   "%s, %s: %s" % (glue2URL, vo, resultGlue2['Message']))
                    message = (message + "\n" + resultGlue2['Message']).strip()

        if mainResult['OK']:
            totalResult['Value'].update(mainResult['Value'])

        if not totalResult['Value'] and message:  # Dict is empty and we have an error message
            self.log.error("Error during BDII request", message)
            totalResult = S_ERROR(message)
        else:
            self.voBdiiCEDict[vo] = totalResult['Value']
        return totalResult

    def __getBdiiSEInfo(self, vo):
        """Return the SE information published for ``vo``, cached per VO.

        The alternative BDIIs are tried in order only when the first query
        fails; the first successful answer is used.

        :return: the S_OK/S_ERROR result of the last query performed
        """
        if vo in self.voBdiiSEDict:
            return S_OK(self.voBdiiSEDict[vo])
        self.log.info("Check for available SEs for VO", vo)
        result = getBdiiSEInfo(vo)
        message = ''
        if not result['OK']:
            message = result['Message']
            for bdii in self.alternativeBDIIs:
                result = getBdiiSEInfo(vo, host=bdii)
                if result['OK']:
                    break
        if not result['OK']:
            # Prefer the message of the first query over the last alternative's
            self.log.error("Error during BDII request", message or result['Message'])
        else:
            self.voBdiiSEDict[vo] = result['Value']
        return result

    def __updateCEs(self):
        """ Update the Site/CE/queue settings in the CS if they were changed in the BDII

        :return: the result of ``__updateCS``
        """
        bdiiChangeSet = set()

        for vo in self.voName:
            result = self.__getBdiiCEInfo(vo)
            if not result['OK']:
                continue
            ceBdiiDict = result['Value']
            self.__purgeSites(ceBdiiDict)
            result = getSiteUpdates(vo, bdiiInfo=ceBdiiDict, log=self.log)
            if not result['OK']:
                continue
            bdiiChangeSet = bdiiChangeSet.union(result['Value'])

        # We have collected all the changes, consolidate VO settings
        return self.__updateCS(bdiiChangeSet)

    def __purgeSites(self, ceBdiiDict):
        """Remove all sites that are not in self.selectedSites.

        Modifies the ceBdiiDict! Sites without CE information, or whose DIRAC
        site name cannot be resolved, are kept.
        """
        if not self.selectedSites:
            return
        # Iterate over a copy of the keys: entries are removed inside the loop
        for site in list(ceBdiiDict):
            ces = list(ceBdiiDict[site]['CEs'])
            if not ces:
                self.log.error("No CE information for site:", site)
                continue
            diracSiteName = getSiteForCE(ces[0])
            if not diracSiteName['OK']:
                self.log.error("Failed to get DIRAC site name for ce",
                               "%s: %s" % (ces[0], diracSiteName['Message']))
                continue
            self.log.debug("Checking site %s (%s), aka %s" % (site, ces, diracSiteName['Value']))
            if diracSiteName['Value'] in self.selectedSites:
                continue
            self.log.info("Dropping site %s, aka %s" % (site, diracSiteName['Value']))
            ceBdiiDict.pop(site)
        return

    def __updateCS(self, bdiiChangeSet):
        """Consolidate the collected changes, stage them and commit them.

        :param bdiiChangeSet: iterable of
                              ``(section, option, value, new_value)`` tuples
        :return: S_OK when there is nothing to do or in dry-run mode,
                 otherwise the ``CSAPI.commit()`` result
        """
        queueVODict = {}
        changeSet = set()
        for entry in bdiiChangeSet:
            section, option, _value, new_value = entry
            if option == "VO":
                # Merge the VO lists collected for the same queue
                queueVODict.setdefault(section, set()).update(new_value.split(','))
            else:
                changeSet.add(entry)
        for section, VOs in queueVODict.items():
            # Sorted so that the stored value does not depend on set ordering
            changeSet.add((section, 'VO', '', ','.join(sorted(VOs))))

        if not changeSet:
            self.log.info("No changes found")
            return S_OK()

        changeList = sorted(changeSet)
        body = '\n'.join(["%s/%s %s -> %s" % entry for entry in changeList])
        if body and self.addressTo and self.addressFrom:
            notification = NotificationClient()
            result = notification.sendMail(self.addressTo, self.subject, body,
                                           self.addressFrom, localAttempt=False)
            if not result['OK']:
                self.log.error('Can not send new site notification mail', result['Message'])

        if body:
            self.log.info('The following configuration changes were detected:')
            self.log.info(body)

        for section, option, value, new_value in changeList:
            if value == 'Unknown' or not value:
                self.csAPI.setOption(cfgPath(section, option), new_value)
            else:
                self.csAPI.modifyValue(cfgPath(section, option), new_value)

        if self.dryRun:
            self.log.info("Dry Run: CS won't be updated")
            self.csAPI.showDiff()
            return S_OK()

        result = self.csAPI.commit()
        if not result['OK']:
            self.log.error("Error while committing to CS", result['Message'])
        else:
            self.log.info("Successfully committed %d changes to CS" % len(changeList))
        return result

    def __lookForNewSEs(self):
        """ Look up BDII for SEs not yet present in the DIRAC CS

        :return: S_OK, or the failed ``getSEsFromCS()`` result
        """
        bannedSEs = self.am_getOption('BannedSEs', [])
        result = getSEsFromCS()
        if not result['OK']:
            return result
        knownSEs = set(result['Value']).union(bannedSEs)

        for vo in self.voName:
            result = self.__getBdiiSEInfo(vo)
            if not result['OK']:
                continue
            bdiiInfo = result['Value']
            result = getGridSRMs(vo, bdiiInfo=bdiiInfo, srmBlackList=knownSEs)
            if not result['OK']:
                continue
            siteDict = result['Value']

            body = ''
            for site in siteDict:
                # Sorted so that the report is stable from one cycle to the next
                for se in sorted(siteDict[site]):
                    seInfo = siteDict[site][se]['SE']
                    body += '\n New SE %s available at site %s:\n' % (se, site)
                    backend = seInfo.get('GlueSEImplementationName', 'Unknown')
                    size = seInfo.get('GlueSESizeTotal', 'Unknown')
                    body += ' Backend %s, Size %s' % (backend, size)

            if body:
                body = "\nWe are glad to inform You about new SE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about an SE add its name to BannedSEs list.\n"
                body += "Add new SEs for vo %s with the command:\n" % vo
                body += "dirac-admin-add-resources --vo %s --se\n" % vo
                self.log.info(body)
                if self.addressTo and self.addressFrom:
                    notification = NotificationClient()
                    result = notification.sendMail(self.addressTo, self.subject, body,
                                                   self.addressFrom, localAttempt=False)
                    if not result['OK']:
                        self.log.error('Can not send new site notification mail',
                                       result['Message'])

        return S_OK()

    def __updateSEs(self):
        """ Update the Storage Element settings in the CS if they were changed in the BDII

        :return: the result of ``__updateCS``
        """
        bdiiChangeSet = set()

        for vo in self.voName:
            result = self.__getBdiiSEInfo(vo)
            if not result['OK']:
                continue
            seBdiiDict = result['Value']
            result = getSRMUpdates(vo, bdiiInfo=seBdiiDict)
            if not result['OK']:
                continue
            bdiiChangeSet = bdiiChangeSet.union(result['Value'])

        # We have collected all the changes, consolidate VO settings
        return self.__updateCS(bdiiChangeSet)
class DiracAdmin(API):
    """Administrative functionalities.

    Thin administrative facade over the proxy manager, the site status
    machinery, the WMS/pilot clients, the Configuration Service API
    (``CSAPI``) and the BDII LDAP helpers.  Unless stated otherwise every
    method returns the usual DIRAC ``S_OK``/``S_ERROR`` dictionary.
    """

    #############################################################################
    def __init__(self):
        """Internal initialization of the DIRAC Admin API."""
        super(DiracAdmin, self).__init__()

        self.csAPI = CSAPI()

        # Debug flag is driven by the LogLevel option of this API's section.
        self.dbg = False
        if gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG':
            self.dbg = True

        self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp')
        self.currentDir = os.getcwd()
        self.rssFlag = ResourceStatus().rssFlag
        self.sitestatus = SiteStatus()
        # Cached set of defined site names, used by __checkSiteIsValid.
        self._siteSet = set(getSites().get('Value', []))

    #############################################################################
    def uploadProxy(self):
        """Upload the current proxy through the proxy manager.

        Example usage:

          >>> gLogger.notice(diracAdmin.uploadProxy())

        :return: S_OK,S_ERROR
        """
        return gProxyManager.uploadProxy()

    #############################################################################
    def setProxyPersistency(self, userDN, userGroup, persistent=True):
        """Set the persistence of a proxy in the Proxy Manager.

        Example usage:

          >>> gLogger.notice(diracAdmin.setProxyPersistency('some DN', 'dirac group', True))

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param persistent: Persistent flag
        :type persistent: boolean
        :return: S_OK,S_ERROR
        """
        return gProxyManager.setPersistency(userDN, userGroup, persistent)

    #############################################################################
    def checkProxyUploaded(self, userDN, userGroup, requiredTime):
        """Ask the Proxy Manager whether the user has a proxy uploaded.

        Example usage:

          >>> gLogger.notice(diracAdmin.checkProxyUploaded('some DN', 'dirac group', 0))

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param requiredTime: Required life time of the uploaded proxy
        :return: S_OK,S_ERROR
        """
        return gProxyManager.userHasProxy(userDN, userGroup, requiredTime)

    #############################################################################
    def getSiteMask(self, printOutput=False, status='Active'):
        """Retrieve the sites currently in the given state.

        Example usage:

          >>> gLogger.notice(diracAdmin.getSiteMask())

        :param printOutput: if True, print the sorted site names
        :type printOutput: boolean
        :param status: site state to select
        :type status: string
        :return: S_OK,S_ERROR
        """
        result = self.sitestatus.getSites(siteState=status)
        if result['OK']:
            sites = result['Value']
            if printOutput:
                # In-place sort: the returned list is sorted as well.
                sites.sort()
                for site in sites:
                    gLogger.notice(site)

        return result

    #############################################################################
    def getBannedSites(self, printOutput=False):
        """Retrieve the sorted list of banned and probing sites.

        Example usage:

          >>> gLogger.notice(diracAdmin.getBannedSites())

        :param printOutput: if True, print one site per line
        :type printOutput: boolean
        :return: S_OK(list of site names),S_ERROR
        """
        bannedSites = self.sitestatus.getSites(siteState='Banned')
        if not bannedSites['OK']:
            return bannedSites

        probingSites = self.sitestatus.getSites(siteState='Probing')
        if not probingSites['OK']:
            return probingSites

        mergedList = sorted(bannedSites['Value'] + probingSites['Value'])

        if printOutput:
            gLogger.notice('\n'.join(mergedList))

        return S_OK(mergedList)

    #############################################################################
    def getSiteSection(self, site, printOutput=False):
        """Get the options of the CS section of a DIRAC site.

        The grid type is the part of the site name before the first dot.

        Example usage:

          >>> gLogger.notice(diracAdmin.getSiteSection('LCG.CERN.ch'))

        :param site: DIRAC site name
        :type site: string
        :param printOutput: if True, pretty-print the options on success
        :type printOutput: boolean
        :return: S_OK(options dict),S_ERROR
        """
        gridType = site.split('.')[0]
        if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']:
            return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType))

        result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site))
        if printOutput and result['OK']:
            gLogger.notice(self.pPrint.pformat(result['Value']))
        return result

    #############################################################################
    def allowSite(self, site, comment, printOutput=False):
        """Add the site to the site mask (set it Active).

        Example usage:

          >>> gLogger.notice(diracAdmin.allowSite('LCG.CERN.ch', 'back in production'))

        :param site: DIRAC site name
        :param comment: comment recorded with the status change
        :param printOutput: if True, print the outcome
        :type printOutput: boolean
        :return: S_OK,S_ERROR
        """
        result = self.__checkSiteIsValid(site)
        if not result['OK']:
            return result

        result = self.getSiteMask(status='Active')
        if not result['OK']:
            return result
        siteMask = result['Value']
        if site in siteMask:
            if printOutput:
                gLogger.notice('Site %s is already Active' % site)
            return S_OK('Site %s is already Active' % site)

        # Use the site status object when the RSS flag is set, the WMS otherwise.
        if self.rssFlag:
            result = self.sitestatus.setSiteStatus(site, 'Active', comment)
        else:
            result = WMSAdministratorClient().allowSite(site, comment)
        if not result['OK']:
            return result

        if printOutput:
            gLogger.notice('Site %s status is set to Active' % site)

        return result

    #############################################################################
    def getSiteMaskLogging(self, site=None, printOutput=False):
        """Retrieve site mask logging information.

        Example usage:

          >>> gLogger.notice(diracAdmin.getSiteMaskLogging('LCG.AUVER.fr'))

        :param site: DIRAC site name
        :param printOutput: if True, print the logging records
        :type printOutput: boolean
        :return: S_OK() (without a value) on success, S_ERROR otherwise
        """
        result = self.__checkSiteIsValid(site)
        if not result['OK']:
            return result

        if self.rssFlag:
            result = ResourceStatusClient().selectStatusElement('Site', 'History', name=site)
        else:
            result = WMSAdministratorClient().getSiteMaskLogging(site)

        if not result['OK']:
            return result

        if printOutput:
            if site:
                gLogger.notice('\nSite Mask Logging Info for %s\n' % site)
            else:
                gLogger.notice('\nAll Site Mask Logging Info\n')

            sitesLogging = result['Value']
            if isinstance(sitesLogging, dict):
                for siteName, tupleList in sitesLogging.items():  # can be an iterator
                    # NOTE(review): this header is only printed for a falsy
                    # site name, which looks inverted -- confirm the intent.
                    if not siteName:
                        gLogger.notice('\n===> %s\n' % siteName)
                    for tup in tupleList:
                        stup = str(tup[0]).ljust(8) + str(tup[1]).ljust(20)
                        stup += '( ' + str(tup[2]) + ' ) "' + str(tup[3]) + '"'
                        gLogger.notice(stup)
                    gLogger.notice(' ')
            elif isinstance(sitesLogging, list):
                for sl in sitesLogging:
                    gLogger.notice((sl[1], sl[3], sl[4]))

        return S_OK()

    #############################################################################
    def banSite(self, site, comment, printOutput=False):
        """Remove the site from the site mask (set it Banned).

        Example usage:

          >>> gLogger.notice(diracAdmin.banSite('LCG.CERN.ch', 'downtime'))

        :param site: DIRAC site name
        :param comment: comment recorded with the status change
        :param printOutput: if True, print the outcome
        :type printOutput: boolean
        :return: S_OK,S_ERROR
        """
        result = self.__checkSiteIsValid(site)
        if not result['OK']:
            return result

        mask = self.getSiteMask(status='Banned')
        if not mask['OK']:
            return mask
        siteMask = mask['Value']
        if site in siteMask:
            if printOutput:
                gLogger.notice('Site %s is already Banned' % site)
            return S_OK('Site %s is already Banned' % site)

        # Use the site status object when the RSS flag is set, the WMS otherwise.
        if self.rssFlag:
            result = self.sitestatus.setSiteStatus(site, 'Banned', comment)
        else:
            result = WMSAdministratorClient().banSite(site, comment)
        if not result['OK']:
            return result

        if printOutput:
            gLogger.notice('Site %s status is set to Banned' % site)

        return result

    #############################################################################
    def __checkSiteIsValid(self, site):
        """Internal function to check that a site name is valid.

        :param site: a single site name, or a list/set/dict of site names
        :return: S_OK() when every given site is defined, S_ERROR naming the
                 unknown site(s) otherwise
        """
        if isinstance(site, (list, set, dict)):
            # Keep only the unknown names; nothing left means all are valid.
            site = set(site) - self._siteSet
            if not site:
                return S_OK()
        elif site in self._siteSet:
            return S_OK()
        return S_ERROR('Specified site %s is not in list of defined sites' % str(site))

    #############################################################################
    def getServicePorts(self, setup='', printOutput=False):
        """Collect the configured service ports for the specified setup.

        If no setup is given it is taken from /DIRAC/Setup.

        Example usage:

          >>> gLogger.notice(diracAdmin.getServicePorts())

        :param setup: DIRAC setup name
        :type setup: string
        :param printOutput: if True, pretty-print the result
        :type printOutput: boolean
        :return: S_OK({'System/Service': port}),S_ERROR
        """
        if not setup:
            setup = gConfig.getValue('/DIRAC/Setup', '')

        setupList = gConfig.getSections('/DIRAC/Setups', [])
        if not setupList['OK']:
            return S_ERROR('Could not get /DIRAC/Setups sections')
        setupList = setupList['Value']
        if setup not in setupList:
            return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList)))

        serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
        if not serviceSetups['OK']:
            return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)
        serviceSetups = serviceSetups['Value']  # dict

        systemList = gConfig.getSections('/Systems')
        if not systemList['OK']:
            return S_ERROR('Could not get Systems sections')
        systemList = systemList['Value']

        result = {}
        for system in systemList:
            if system not in serviceSetups:
                self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup))
                continue

            path = '/Systems/%s/%s/Services' % (system, serviceSetups[system])
            servicesList = gConfig.getSections(path)
            if not servicesList['OK']:
                self.log.warn('Could not get sections in %s' % path)
                continue

            servicesList = servicesList['Value']
            if not servicesList:
                servicesList = []
            self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList)))
            for service in servicesList:
                spath = '%s/%s/Port' % (path, service)
                servicePort = gConfig.getValue(spath, 0)
                if servicePort:
                    self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort))
                    result['%s/%s' % (system, service)] = servicePort
                else:
                    self.log.warn('No port found for %s' % spath)

        if printOutput:
            gLogger.notice(self.pPrint.pformat(result))

        return S_OK(result)

    #############################################################################
    def getProxy(self, userDN, userGroup, validity=43200, limited=False):
        """Download a proxy (default required lifetime 43200 s, i.e. 12 hours).

        Example usage:

          >>> gLogger.notice(diracAdmin.getProxy('some DN', 'dirac group'))

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param validity: required time left, in seconds
        :param limited: passed through as the ``limited`` flag
        :return: S_OK,S_ERROR
        """
        return gProxyManager.downloadProxy(userDN, userGroup, limited=limited,
                                           requiredTimeLeft=validity)

    #############################################################################
    def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False):
        """Download a proxy with VOMS extensions (default lifetime 12 hours).

        Example usage:

          >>> gLogger.notice(diracAdmin.getVOMSProxy('some DN', 'dirac group'))

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param vomsAttr: required VOMS attribute
        :param validity: required time left, in seconds
        :param limited: passed through as the ``limited`` flag
        :return: S_OK,S_ERROR
        """
        return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited,
                                               requiredVOMSAttribute=vomsAttr,
                                               requiredTimeLeft=validity)

    #############################################################################
    def getPilotProxy(self, userDN, userGroup, validity=43200):
        """Retrieve a pilot proxy (default required lifetime 12 hours).

        Example usage:

          >>> gLogger.notice(diracAdmin.getPilotProxy('some DN', 'dirac group'))

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param validity: required time left, in seconds
        :return: S_OK,S_ERROR
        """
        return gProxyManager.getPilotProxyFromDIRACGroup(userDN, userGroup,
                                                         requiredTimeLeft=validity)

    #############################################################################
    def resetJob(self, jobID):
        """Reset a job or list of jobs in the WMS.

        String IDs (or lists of them) are converted to integers before the
        request is sent to the job manager.

        Example::

          >>> gLogger.notice(dirac.resetJob(12345))

        :param jobID: JobID
        :type jobID: integer, string or list of them
        :return: S_OK,S_ERROR
        """
        if isinstance(jobID, six.string_types):
            try:
                jobID = int(jobID)
            except Exception as x:
                return self._errorReport(
                    str(x), 'Expected integer or convertible integer for existing jobID')
        elif isinstance(jobID, list):
            try:
                jobID = [int(job) for job in jobID]
            except Exception as x:
                return self._errorReport(
                    str(x), 'Expected integer or convertible integer for existing jobIDs')

        return JobManagerClient(useCertificates=False).resetJob(jobID)

    #############################################################################
    def getJobPilotOutput(self, jobID, directory=''):
        """Retrieve the pilot output for an existing job in the WMS.

        The output is written to ``<directory>/pilot_<jobID>/std.out`` and
        ``std.err``; ``directory`` defaults to the directory that was current
        when this object was created.  An already existing output directory
        is never overwritten: an error is returned instead.

          >>> gLogger.notice(dirac.getJobPilotOutput(12345))

        :param jobID: JobID
        :type jobID: integer or string
        :param directory: existing directory in which to create the output
        :type directory: string
        :return: S_OK,S_ERROR
        """
        if not directory:
            directory = self.currentDir

        if not os.path.exists(directory):
            return self._errorReport('Directory %s does not exist' % directory)

        result = WMSAdministratorClient().getJobPilotOutput(jobID)
        if not result['OK']:
            return result

        outputPath = '%s/pilot_%s' % (directory, jobID)
        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

        outputs = result['Value']
        if 'StdOut' in outputs:
            stdout = '%s/std.out' % (outputPath)
            with open(stdout, 'w') as fopen:
                fopen.write(outputs['StdOut'])
            self.log.verbose('Standard output written to %s' % (stdout))
        else:
            self.log.warn('No standard output returned')

        if 'StdError' in outputs:
            stderr = '%s/std.err' % (outputPath)
            with open(stderr, 'w') as fopen:
                fopen.write(outputs['StdError'])
            self.log.verbose('Standard error written to %s' % (stderr))
        else:
            self.log.warn('No standard error returned')

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result

    #############################################################################
    def getPilotOutput(self, gridReference, directory=''):
        """Retrieve the pilot output (std.out and std.err) for a pilot reference.

        The output is written to ``<directory>/pilot_<last part of reference>``
        (``pilot_reference`` when the reference ends with a slash).  An already
        existing output directory is never overwritten: an error is returned
        instead.

          >>> gLogger.notice(dirac.getPilotOutput('https://some.host/pilotRef'))

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :param directory: existing directory in which to create the output
        :type directory: string
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, six.string_types):
            return self._errorReport('Expected string for pilot reference')

        if not directory:
            directory = self.currentDir

        if not os.path.exists(directory):
            return self._errorReport('Directory %s does not exist' % directory)

        result = PilotManagerClient().getPilotOutput(gridReference)
        if not result['OK']:
            return result

        gridReferenceSmall = gridReference.split('/')[-1]
        if not gridReferenceSmall:
            gridReferenceSmall = 'reference'
        outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall)

        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

        outputs = result['Value']
        if 'StdOut' in outputs:
            stdout = '%s/std.out' % (outputPath)
            with open(stdout, 'w') as fopen:
                fopen.write(outputs['StdOut'])
            self.log.info('Standard output written to %s' % (stdout))
        else:
            self.log.warn('No standard output returned')

        if 'StdErr' in outputs:
            stderr = '%s/std.err' % (outputPath)
            with open(stderr, 'w') as fopen:
                fopen.write(outputs['StdErr'])
            self.log.info('Standard error written to %s' % (stderr))
        else:
            self.log.warn('No standard error returned')

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result

    #############################################################################
    def getPilotInfo(self, gridReference):
        """Retrieve info relative to a pilot reference.

          >>> gLogger.notice(dirac.getPilotInfo('https://some.host/pilotRef'))

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, six.string_types):
            return self._errorReport('Expected string for pilot reference')

        return PilotManagerClient().getPilotInfo(gridReference)

    #############################################################################
    def killPilot(self, gridReference):
        """Kill the pilot specified.

          >>> gLogger.notice(dirac.killPilot('https://some.host/pilotRef'))

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, six.string_types):
            return self._errorReport('Expected string for pilot reference')

        return PilotManagerClient().killPilot(gridReference)

    #############################################################################
    def getPilotLoggingInfo(self, gridReference):
        """Retrieve the pilot logging info for a pilot reference.

          >>> gLogger.notice(dirac.getPilotLoggingInfo('https://some.host/pilotRef'))

        :param gridReference: Grid pilot job reference Id
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, six.string_types):
            return self._errorReport('Expected string for pilot reference')

        return PilotManagerClient().getPilotLoggingInfo(gridReference)

    #############################################################################
    def getJobPilots(self, jobID):
        """Get the pilots associated with a jobID and print them on success.

          >>> gLogger.notice(dirac.getJobPilots(12345))

        :param jobID: JobID
        :type jobID: integer or string
        :return: S_OK,S_ERROR
        """
        if isinstance(jobID, six.string_types):
            try:
                jobID = int(jobID)
            except Exception as x:
                return self._errorReport(
                    str(x), 'Expected integer or string for existing jobID')

        result = PilotManagerClient().getPilots(jobID)
        if result['OK']:
            gLogger.notice(self.pPrint.pformat(result['Value']))
        return result

    #############################################################################
    def getPilotSummary(self, startDate='', endDate=''):
        """Retrieve and print the per-CE pilot status summary.

        One header line is printed followed by one line per CE with its
        (status, count) pairs sorted by status; the full dictionary is
        returned as well.

          >>> gLogger.notice(dirac.getPilotSummary())

        :param startDate: passed to the pilot manager as start of the period
        :param endDate: passed to the pilot manager as end of the period
        :return: S_OK({CE: {Status: Count}}),S_ERROR
        """
        result = PilotManagerClient().getPilotSummary(startDate, endDate)
        if not result['OK']:
            return result

        ceDict = result['Value']

        # The header needs one Status/Count column pair per state of the CE
        # with the most states.  ``or [0]`` keeps max() valid for an empty dict.
        maxStates = max([len(summary) for summary in ceDict.values()] or [0])
        headers = 'CE'.ljust(28) + ('Status'.ljust(12) + 'Count'.ljust(12)) * maxStates
        gLogger.notice(headers)

        # .items() (not the Python-2-only .iteritems()) works on py2 and py3.
        for ce, summary in ceDict.items():
            line = ce.ljust(28)
            for state in sorted(summary):
                line += state.ljust(12) + str(summary[state]).ljust(12)
            gLogger.notice(line)

        return result

    #############################################################################
    def setSiteProtocols(self, site, protocolsList, printOutput=False):
        """Set the protocols list of the SRM2 sections of each SE of a site.

        Every requested protocol must be in the default protocols list of the
        CS.  The user is prompted for confirmation before anything is changed.

        :param site: DIRAC site name
        :type site: string
        :param protocolsList: protocols to set
        :type protocolsList: list of strings
        :param printOutput: if True, print whether the CS was modified
        :type printOutput: boolean
        :return: S_OK,S_ERROR
        """
        result = self.__checkSiteIsValid(site)
        if not result['OK']:
            return result

        siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
        siteSEs = gConfig.getValue(siteSection, [])
        if not siteSEs:
            return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

        defaultProtocols = gConfig.getValue(
            '/Resources/StorageElements/DefaultProtocols', [])
        self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

        for protocol in protocolsList:
            if protocol not in defaultProtocols:
                return S_ERROR(
                    'Requested to set protocol %s in list but %s is not '
                    'in default list of protocols:\n%s' %
                    (protocol, protocol, ', '.join(defaultProtocols)))

        result = promptUser(
            'Do you want to add the following default protocols:'
            ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs)))
        if not result['OK']:
            return result
        if result['Value'].lower() != 'y':
            self.log.always('No protocols will be added')
            return S_OK()

        modifiedCS = False
        for se in siteSEs:
            sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
            if not sections['OK']:
                return sections
            for section in sections['Value']:
                protocolName = gConfig.getValue(
                    '/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '')
                if protocolName != 'SRM2':
                    continue
                path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
                self.log.verbose('Setting %s to %s' % (path, ', '.join(protocolsList)))
                result = self.csSetOption(path, ', '.join(protocolsList))
                if not result['OK']:
                    return result
                modifiedCS = True

        if modifiedCS:
            result = self.csCommitChanges(False)
            if not result['OK']:
                return S_ERROR('CS Commit failed with message = %s' % (result['Message']))
            if printOutput:
                gLogger.notice('Successfully committed changes to CS')
        elif printOutput:
            gLogger.notice('No modifications to CS required')

        return S_OK()

    #############################################################################
    def csSetOption(self, optionPath, optionValue):
        """Set an option in the CS (delegates to ``CSAPI.setOption``)."""
        return self.csAPI.setOption(optionPath, optionValue)

    #############################################################################
    def csSetOptionComment(self, optionPath, comment):
        """Set the comment of an option in the CS."""
        return self.csAPI.setOptionComment(optionPath, comment)

    #############################################################################
    def csModifyValue(self, optionPath, newValue):
        """Modify an existing value in the CS."""
        return self.csAPI.modifyValue(optionPath, newValue)

    #############################################################################
    def csRegisterUser(self, username, properties):
        """Register a user in the CS.

        :param username: Username of the user
        :param properties: Dict containing:

           - DN
           - groups : list/tuple of groups the user belongs to
           - <others> : More properties of the user, like mail
        """
        return self.csAPI.addUser(username, properties)

    #############################################################################
    def csDeleteUser(self, user):
        """Delete a user from the CS. Can take a list of users."""
        return self.csAPI.deleteUsers(user)

    #############################################################################
    def csModifyUser(self, username, properties, createIfNonExistant=False):
        """Modify a user in the CS.

        Takes the same params as in addUser and applies the changes.
        """
        return self.csAPI.modifyUser(username, properties, createIfNonExistant)

    #############################################################################
    def csListUsers(self, group=False):
        """List the users in the CS. If no group is specified return all users."""
        return self.csAPI.listUsers(group)

    #############################################################################
    def csDescribeUsers(self, mask=False):
        """List users and their properties in the CS.

        If a mask is given, only users in the mask will be returned.
        """
        return self.csAPI.describeUsers(mask)

    #############################################################################
    def csModifyGroup(self, groupname, properties, createIfNonExistant=False):
        """Modify a group in the CS.

        Takes the same params as in addGroup and applies the changes.
        """
        return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant)

    #############################################################################
    def csListHosts(self):
        """List the hosts in the CS."""
        return self.csAPI.listHosts()

    #############################################################################
    def csDescribeHosts(self, mask=False):
        """Get extended info for the hosts in the CS."""
        return self.csAPI.describeHosts(mask)

    #############################################################################
    def csModifyHost(self, hostname, properties, createIfNonExistant=False):
        """Modify a host in the CS.

        Takes the same params as in addHost and applies the changes.
        """
        return self.csAPI.modifyHost(hostname, properties, createIfNonExistant)

    #############################################################################
    def csListGroups(self):
        """List groups in the CS."""
        return self.csAPI.listGroups()

    #############################################################################
    def csDescribeGroups(self, mask=False):
        """List groups and their properties in the CS.

        If a mask is given, only groups in the mask will be returned.
        """
        return self.csAPI.describeGroups(mask)

    #############################################################################
    def csSyncUsersWithCFG(self, usersCFG):
        """Synchronize users in cfg with its contents."""
        return self.csAPI.syncUsersWithCFG(usersCFG)

    #############################################################################
    def csCommitChanges(self, sortUsers=True):
        """Commit the changes in the CS.

        NOTE(review): ``sortUsers`` is accepted but not forwarded; the commit
        is always issued with ``sortUsers=False``.  Behaviour is kept as-is
        for backward compatibility -- confirm whether forwarding is intended.
        """
        return self.csAPI.commitChanges(sortUsers=False)

    #############################################################################
    def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False):
        """Send mail to specified address with body."""
        notification = NotificationClient()
        return notification.sendMail(address, subject, body, fromAddress, localAttempt, html)

    #############################################################################
    def sendSMS(self, userName, body, fromAddress=None):
        """Send an SMS with the given body to the specified user.

        Bodies longer than 160 characters are rejected with S_ERROR.
        """
        if len(body) > 160:
            return S_ERROR('Exceeded maximum SMS length of 160 characters')
        notification = NotificationClient()
        return notification.sendSMS(userName, body, fromAddress)

    #############################################################################
    def getBDIISite(self, site, host=None):
        """Get information about site from BDII at host."""
        return ldapSite(site, host=host)

    #############################################################################
    def getBDIICluster(self, ce, host=None):
        """Get information about the cluster of ce from BDII at host."""
        return ldapCluster(ce, host=host)

    #############################################################################
    def getBDIICE(self, ce, host=None):
        """Get information about ce from BDII at host."""
        return ldapCE(ce, host=host)

    #############################################################################
    def getBDIIService(self, ce, host=None):
        """Get information about the service of ce from BDII at host."""
        return ldapService(ce, host=host)

    #############################################################################
    def getBDIICEState(self, ce, useVO=voName, host=None):
        """Get information about ce state from BDII at host."""
        return ldapCEState(ce, useVO, host=host)

    #############################################################################
    def getBDIICEVOView(self, ce, useVO=voName, host=None):
        """Get information about ce voview from BDII at host."""
        return ldapCEVOView(ce, useVO, host=host)
class MonitorAgents(AgentModule): """MonitorAgents class.""" def __init__(self, *args, **kwargs): """Initialize the agent, clients, default values.""" AgentModule.__init__(self, *args, **kwargs) self.name = 'MonitorAgents' self.setup = "Production" self.enabled = False self.restartAgents = False self.restartExecutors = False self.restartServices = False self.controlComponents = False self.commitURLs = False self.diracLocation = "/opt/dirac/pro" self.sysAdminClient = SystemAdministratorClient(socket.gethostname()) self.jobMonClient = JobMonitoringClient() self.nClient = NotificationClient() self.csAPI = None self.agents = dict() self.executors = dict() self.services = dict() self.errors = list() self.accounting = defaultdict(dict) self.addressTo = ["*****@*****.**"] self.addressFrom = "*****@*****.**" self.emailSubject = "MonitorAgents on %s" % socket.gethostname() def logError(self, errStr, varMsg=''): """Append errors to a list, which is sent in email notification.""" self.log.error(errStr, varMsg) self.errors.append(errStr + " " + varMsg) def beginExecution(self): """Reload the configurations before every cycle.""" self.setup = self.am_getOption("Setup", self.setup) self.enabled = self.am_getOption("EnableFlag", self.enabled) self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents) self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors) self.restartServices = self.am_getOption("RestartServices", self.restartServices) self.diracLocation = os.environ.get("DIRAC", self.diracLocation) self.addressTo = self.am_getOption('MailTo', self.addressTo) self.addressFrom = self.am_getOption('MailFrom', self.addressFrom) self.controlComponents = self.am_getOption('ControlComponents', self.controlComponents) self.commitURLs = self.am_getOption('CommitURLs', self.commitURLs) self.csAPI = CSAPI() res = self.getRunningInstances(instanceType='Agents') if not res["OK"]: return S_ERROR("Failure to get running agents") self.agents = 
res["Value"] res = self.getRunningInstances(instanceType='Executors') if not res["OK"]: return S_ERROR("Failure to get running executors") self.executors = res["Value"] res = self.getRunningInstances(instanceType='Services') if not res["OK"]: return S_ERROR("Failure to get running services") self.services = res["Value"] self.accounting.clear() return S_OK() def sendNotification(self): """Send email notification about changes done in the last cycle.""" if not(self.errors or self.accounting): return S_OK() emailBody = "" rows = [] for instanceName, val in self.accounting.iteritems(): rows.append([[instanceName], [val.get('Treatment', 'No Treatment')], [str(val.get('LogAge', 'Not Relevant'))]]) if rows: columns = ["Instance", "Treatment", "Log File Age (Minutes)"] emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ') if self.errors: emailBody += "\n\nErrors:" emailBody += "\n".join(self.errors) self.log.notice("Sending Email:\n" + emailBody) for address in self.addressTo: res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False) if not res['OK']: self.log.error("Failure to send Email notification to ", address) continue self.errors = [] self.accounting.clear() return S_OK() def getRunningInstances(self, instanceType='Agents', runitStatus='Run'): """Return a dict of running agents, executors or services. 
Key is agent's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation :param str instanceType: 'Agents', 'Executors', 'Services' :param str runitStatus: Return only those instances with given RunitStatus or 'All' :returns: Dictionary of running instances """ res = self.sysAdminClient.getOverallStatus() if not res["OK"]: self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"]) return res val = res['Value'][instanceType] runningAgents = defaultdict(dict) for system, agents in val.iteritems(): for agentName, agentInfo in agents.iteritems(): if agentInfo['Setup'] and agentInfo['Installed']: if runitStatus != 'All' and agentInfo['RunitStatus'] != runitStatus: continue confPath = cfgPath('/Systems/' + system + '/' + self.setup + '/%s/' % instanceType + agentName) for option, default in (('PollingTime', HOUR), ('Port', None)): optPath = os.path.join(confPath, option) runningAgents[agentName][option] = gConfig.getValue(optPath, default) runningAgents[agentName]["LogFileLocation"] = \ os.path.join(self.diracLocation, 'runit', system, agentName, 'log', 'current') runningAgents[agentName]["PID"] = agentInfo["PID"] runningAgents[agentName]['Module'] = agentInfo['Module'] runningAgents[agentName]['RunitStatus'] = agentInfo['RunitStatus'] runningAgents[agentName]['System'] = system return S_OK(runningAgents) def on_terminate(self, agentName, process): """Execute callback when a process terminates gracefully.""" self.log.info("%s's process with ID: %s has been terminated successfully" % (agentName, process.pid)) def execute(self): """Execute checks for agents, executors, services.""" for instanceType in ('executor', 'agent', 'service'): for name, options in getattr(self, instanceType + 's').iteritems(): # call checkAgent, checkExecutor, checkService res = getattr(self, 'check' + instanceType.capitalize())(name, options) if not res['OK']: self.logError("Failure when checking %s" % instanceType, "%s, %s" 
% (name, res['Message'])) res = self.componentControl() if not res['OK']: if "Stopped does not exist" not in res['Message'] and \ "Running does not exist" not in res['Message']: self.logError("Failure to control components", res['Message']) if not self.errors: res = self.checkURLs() if not res['OK']: self.logError("Failure to check URLs", res['Message']) else: self.logError('Something was wrong before, not checking URLs this time') self.sendNotification() if self.errors: return S_ERROR("Error during this cycle, check log") return S_OK() @staticmethod def getLastAccessTime(logFileLocation): """Return the age of log file.""" lastAccessTime = 0 try: lastAccessTime = os.path.getmtime(logFileLocation) lastAccessTime = datetime.fromtimestamp(lastAccessTime) except OSError as e: return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, e)) now = datetime.now() age = now - lastAccessTime return S_OK(age) def restartInstance(self, pid, instanceName, enabled): """Kill a process which is then restarted automatically.""" if not (self.enabled and enabled): self.log.info("Restarting is disabled, please restart %s manually" % instanceName) self.accounting[instanceName]["Treatment"] = "Please restart it manually" return S_OK(NO_RESTART) try: agentProc = psutil.Process(int(pid)) processesToTerminate = agentProc.children(recursive=True) processesToTerminate.append(agentProc) for proc in processesToTerminate: proc.terminate() _gone, alive = psutil.wait_procs(processesToTerminate, timeout=5, callback=partial(self.on_terminate, instanceName)) for proc in alive: self.log.info("Forcefully killing process %s" % proc.pid) proc.kill() return S_OK() except psutil.Error as err: self.logError("Exception occurred in terminating processes", "%s" % err) return S_ERROR() def checkService(self, serviceName, options): """Ping the service, restart if the ping does not respond.""" url = self._getURL(serviceName, options) self.log.info("Pinging service", url) pingRes = Client().ping(url=url) 
if not pingRes['OK']: self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message'])) res = self.restartInstance(int(options['PID']), serviceName, self.restartServices) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[serviceName]["Treatment"] = "Successfully Restarted" self.log.info("Agent %s has been successfully restarted" % serviceName) self.log.info("Service responded OK") return S_OK() def checkAgent(self, agentName, options): """Check the age of agent's log file, if it is too old then restart the agent.""" pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID'] self.log.info("Checking Agent: %s" % agentName) self.log.info("Polling Time: %s" % pollingTime) self.log.info("Current Log File location: %s" % currentLogLocation) res = self.getLastAccessTime(currentLogLocation) if not res["OK"]: return res age = res["Value"] self.log.info("Current log file for %s is %d minutes old" % (agentName, (age.seconds / MINUTES))) maxLogAge = max(pollingTime + HOUR, 2 * HOUR) if age.seconds < maxLogAge: return S_OK() self.log.info("Current log file is too old for Agent %s" % agentName) self.accounting[agentName]["LogAge"] = age.seconds / MINUTES res = self.restartInstance(int(pid), agentName, self.restartAgents) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[agentName]["Treatment"] = "Successfully Restarted" self.log.info("Agent %s has been successfully restarted" % agentName) return S_OK() def checkExecutor(self, executor, options): """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors.""" currentLogLocation = options['LogFileLocation'] pid = options['PID'] self.log.info("Checking executor: %s" % executor) self.log.info("Current Log File location: %s" % currentLogLocation) res = self.getLastAccessTime(currentLogLocation) if not res["OK"]: return res age = res["Value"] 
self.log.info("Current log file for %s is %d minutes old" % (executor, (age.seconds / MINUTES))) if age.seconds < 2 * HOUR: return S_OK() self.log.info("Current log file is too old for Executor %s" % executor) self.accounting[executor]["LogAge"] = age.seconds / MINUTES res = self.checkForCheckingJobs(executor) if not res['OK']: return res if res['OK'] and res['Value'] == NO_CHECKING_JOBS: self.accounting.pop(executor, None) return S_OK(NO_RESTART) res = self.restartInstance(int(pid), executor, self.restartExecutors) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[executor]["Treatment"] = "Successfully Restarted" self.log.info("Executor %s has been successfully restarted" % executor) return S_OK() def checkForCheckingJobs(self, executorName): """Check if there are checking jobs with the **executorName** as current MinorStatus.""" attrDict = {'Status': 'Checking', 'MinorStatus': executorName} # returns list of jobs IDs resJobs = self.jobMonClient.getJobs(attrDict) if not resJobs['OK']: self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message'])) return resJobs if resJobs['Value']: self.log.info("Found %d jobs in 'Checking' status for %s" % (len(resJobs['Value']), executorName)) return S_OK(CHECKING_JOBS) self.log.info("Found no jobs in 'Checking' status for %s" % executorName) return S_OK(NO_CHECKING_JOBS) def componentControl(self): """Monitor and control component status as defined in the CS. 
Check for running and stopped components and ensure they have the proper status as defined in the CS Registry/Hosts/_HOST_/[Running|Stopped] sections :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`, :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR` """ # get the current status of the components resCurrent = self._getCurrentComponentStatus() if not resCurrent['OK']: return resCurrent currentStatus = resCurrent['Value'] resDefault = self._getDefaultComponentStatus() if not resDefault['OK']: return resDefault defaultStatus = resDefault['Value'] # ensure instances are in the right state shouldBe = {} shouldBe['Run'] = defaultStatus['Run'].intersection(currentStatus['Down']) shouldBe['Down'] = defaultStatus['Down'].intersection(currentStatus['Run']) shouldBe['Unknown'] = defaultStatus['All'].symmetric_difference(currentStatus['All']) self._ensureComponentRunning(shouldBe['Run']) self._ensureComponentDown(shouldBe['Down']) for instance in shouldBe['Unknown']: self.logError("Unknown instance", "%r, either uninstall or add to config" % instance) return S_OK() def _getCurrentComponentStatus(self): """Get current status for components.""" resOverall = self.sysAdminClient.getOverallStatus() if not resOverall['OK']: return resOverall currentStatus = {'Down': set(), 'Run': set(), 'All': set()} informationDict = resOverall['Value'] for systemsDict in informationDict.values(): for system, instancesDict in systemsDict.items(): for instanceName, instanceInfoDict in instancesDict.items(): identifier = '%s__%s' % (system, instanceName) runitStatus = instanceInfoDict.get('RunitStatus') if runitStatus in ('Run', 'Down'): currentStatus[runitStatus].add(identifier) currentStatus['All'] = currentStatus['Run'] | currentStatus['Down'] return S_OK(currentStatus) def _getDefaultComponentStatus(self): """Get the configured status of the components.""" host = socket.gethostname() defaultStatus = {'Down': set(), 'Run': set(), 'All': set()} resRunning = 
gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Running')) resStopped = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Stopped')) if not resRunning['OK']: return resRunning if not resStopped['OK']: return resStopped defaultStatus['Run'] = set(resRunning['Value'].keys()) defaultStatus['Down'] = set(resStopped['Value'].keys()) defaultStatus['All'] = defaultStatus['Run'] | defaultStatus['Down'] if defaultStatus['Run'].intersection(defaultStatus['Down']): self.logError("Overlap in configuration", str(defaultStatus['Run'].intersection(defaultStatus['Down']))) return S_ERROR("Bad host configuration") return S_OK(defaultStatus) def _ensureComponentRunning(self, shouldBeRunning): """Ensure the correct components are running.""" for instance in shouldBeRunning: self.log.info("Starting instance %s" % instance) system, name = instance.split('__') if self.controlComponents: res = self.sysAdminClient.startComponent(system, name) if not res['OK']: self.logError("Failed to start component:", "%s: %s" % (instance, res['Message'])) else: self.accounting[instance]["Treatment"] = "Instance was down, started instance" else: self.accounting[instance]["Treatment"] = "Instance is down, should be started" def _ensureComponentDown(self, shouldBeDown): """Ensure the correct components are not running.""" for instance in shouldBeDown: self.log.info("Stopping instance %s" % instance) system, name = instance.split('__') if self.controlComponents: res = self.sysAdminClient.stopComponent(system, name) if not res['OK']: self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message'])) else: self.accounting[instance]["Treatment"] = "Instance was running, stopped instance" else: self.accounting[instance]["Treatment"] = "Instance is running, should be stopped" def checkURLs(self): """Ensure that the running services have their URL in the Config.""" self.log.info("Checking URLs") # get services again, in case they were started/stop in controlComponents 
gConfig.forceRefresh(fromMaster=True) res = self.getRunningInstances(instanceType='Services', runitStatus='All') if not res["OK"]: return S_ERROR("Failure to get running services") self.services = res["Value"] for service, options in self.services.iteritems(): self.log.debug("Checking URL for %s with options %s" % (service, options)) # ignore SystemAdministrator, does not have URLs if 'SystemAdministrator' in service: continue self._checkServiceURL(service, options) if self.csAPI.csModified and self.commitURLs: self.log.info("Commiting changes to the CS") result = self.csAPI.commit() if not result['OK']: self.logError('Commit to CS failed', result['Message']) return S_ERROR("Failed to commit to CS") return S_OK() def _checkServiceURL(self, serviceName, options): """Ensure service URL is properly configured in the CS.""" url = self._getURL(serviceName, options) system = options['System'] module = options['Module'] self.log.info("Checking URLs for %s/%s" % (system, module)) urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module) urls = gConfig.getValue(urlsConfigPath, []) self.log.debug("Found configured URLs for %s: %s" % (module, urls)) self.log.debug("This URL is %s" % url) runitStatus = options['RunitStatus'] wouldHave = 'Would have ' if not self.commitURLs else '' if runitStatus == 'Run' and url not in urls: urls.append(url) message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module) self.log.info(message) self.accounting[serviceName + "/URL"]["Treatment"] = message self.csAPI.modifyValue(urlsConfigPath, ",".join(urls)) if runitStatus == 'Down' and url in urls: urls.remove(url) message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module) self.log.info(message) self.accounting[serviceName + "/URL"]["Treatment"] = message self.csAPI.modifyValue(urlsConfigPath, ",".join(urls)) @staticmethod def _getURL(serviceName, options): """Return URL for the service.""" system = options['System'] port = 
options['Port'] host = socket.gethostname() url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName) return url
class ComponentSupervisionAgent(AgentModule):
    """Supervise the agents, executors and services installed on this host.

    Each cycle the agent

    * restarts agents and executors whose log file has gone stale, and services that
      do not answer a ping (only if the corresponding ``Restart*`` option and the
      ``EnableFlag`` are set),
    * starts/stops components so that they match the ``Running``/``Stopped`` sections
      configured for this host (only if ``ControlComponents`` is set),
    * adds/removes service URLs in the CS (committed only if ``CommitURLs`` is set),
    * sends an email summarising the treatments and errors of the cycle.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the agent, clients, default values."""
        AgentModule.__init__(self, *args, **kwargs)
        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"
        # master switch and per-component-type switches; everything is off by default
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        self.controlComponents = False
        self.commitURLs = False
        # instances whose name contains one of these substrings are never restarted
        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
        self.diracLocation = rootPath

        self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None
        self.agents = dict()
        self.executors = dict()
        self.services = dict()
        # port used in URLs of services that do not configure their own "Port"
        self._tornadoPort = "8443"
        # errors collected during the cycle, reported by email
        self.errors = list()
        # instance name -> {"Treatment": ..., "LogAge": ...}, reported by email
        self.accounting = defaultdict(dict)

        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn()

    def logError(self, errStr, varMsg=""):
        """Append errors to a list, which is sent in email notification.

        :param str errStr: fixed part of the error message
        :param str varMsg: variable part of the error message
        """
        self.log.error(errStr, varMsg)
        self.errors.append(errStr + " " + varMsg)

    def beginExecution(self):
        """Reload the configurations before every cycle.

        :returns: S_OK, or S_ERROR if the list of running agents, executors or
                  services cannot be obtained
        """
        self.setup = self.am_getOption("Setup", self.setup)
        self.enabled = self.am_getOption("EnableFlag", self.enabled)
        self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents)
        self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors)
        self.restartServices = self.am_getOption("RestartServices", self.restartServices)
        self.addressTo = self.am_getOption("MailTo", self.addressTo)
        self.addressFrom = self.am_getOption("MailFrom", self.addressFrom)
        self.controlComponents = self.am_getOption("ControlComponents", self.controlComponents)
        self.commitURLs = self.am_getOption("CommitURLs", self.commitURLs)
        self.doNotRestartInstancePattern = self.am_getOption(
            "DoNotRestartInstancePattern", self.doNotRestartInstancePattern
        )

        # a fresh CSAPI per cycle, so pending modifications of a previous cycle are dropped
        self.csAPI = CSAPI()

        res = self.getRunningInstances(instanceType="Agents")
        if not res["OK"]:
            return S_ERROR("Failure to get running agents")
        self.agents = res["Value"]

        res = self.getRunningInstances(instanceType="Executors")
        if not res["OK"]:
            return S_ERROR("Failure to get running executors")
        self.executors = res["Value"]

        res = self.getRunningInstances(instanceType="Services")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]

        self.accounting.clear()
        return S_OK()

    def sendNotification(self):
        """Send email notification about changes done in the last cycle.

        Nothing is sent if there were neither errors nor treatments. After the mails
        are sent, ``self.errors`` and ``self.accounting`` are reset.

        :returns: S_OK
        """
        if not (self.errors or self.accounting):
            return S_OK()

        emailBody = ""
        rows = []
        for instanceName, val in self.accounting.items():
            rows.append(
                [[instanceName], [val.get("Treatment", "No Treatment")], [str(val.get("LogAge", "Not Relevant"))]]
            )

        if rows:
            columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
            emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=" | ")

        if self.errors:
            # header on its own line, so the first error is not glued to "Errors:"
            emailBody += "\n\nErrors:\n"
            emailBody += "\n".join(self.errors)

        self.log.notice("Sending Email:\n" + emailBody)
        for address in self.addressTo:
            res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
            if not res["OK"]:
                self.log.error("Failure to send Email notification to ", address)
                continue

        self.errors = []
        self.accounting.clear()

        return S_OK()

    def getRunningInstances(self, instanceType="Agents", runitStatus="Run"):
        """Return a dict of running agents, executors or services.

        Key is component's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus,
        LogFileLocation

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str runitStatus: Return only those instances with given RunitStatus or 'All'
        :returns: Dictionary of running instances
        """
        res = self.sysAdminClient.getOverallStatus()
        if not res["OK"]:
            self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"])
            return res

        val = res["Value"][instanceType]
        runningComponents = defaultdict(dict)
        for system, components in val.items():
            for componentName, componentInfo in components.items():
                if componentInfo["Setup"] and componentInfo["Installed"]:
                    if runitStatus != "All" and componentInfo["RunitStatus"] != runitStatus:
                        continue
                    for option, default in (("PollingTime", HOUR), ("Port", None), ("Protocol", None)):
                        runningComponents[componentName][option] = self._getComponentOption(
                            instanceType, system, componentName, option, default
                        )
                        # remove empty values so we can use defaults in _getURL
                        if not runningComponents[componentName][option]:
                            runningComponents[componentName].pop(option)
                    runningComponents[componentName]["LogFileLocation"] = os.path.join(
                        self.diracLocation, "runit", system, componentName, "log", "current"
                    )
                    runningComponents[componentName]["PID"] = componentInfo["PID"]
                    runningComponents[componentName]["Module"] = componentInfo["Module"]
                    runningComponents[componentName]["RunitStatus"] = componentInfo["RunitStatus"]
                    runningComponents[componentName]["System"] = system

        return S_OK(runningComponents)

    def _getComponentOption(self, instanceType, system, componentName, option, default):
        """Get component option from DIRAC CS, using components' base classes methods.

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str system: name of the system the component belongs to
        :param str componentName: name of the component
        :param str option: name of the option to look up
        :param default: value returned if the option is not set
        :returns: the value of the option, or *default*
        """
        componentPath = PathFinder.getComponentSection(
            system=system,
            component=componentName,
            setup=self.setup,
            componentCategory=instanceType,
        )
        if instanceType != "Agents":
            return gConfig.getValue(Path.cfgPath(componentPath, option), default)
        # deal with agent configuration: the option is looked up through am_getOption of an
        # AgentModule built from the component name and its configured "Module"
        componentLoadModule = gConfig.getValue(Path.cfgPath(componentPath, "Module"), componentName)
        fullComponentName = Path.cfgPath(system, componentName)
        fullComponentLoadName = Path.cfgPath(system, componentLoadModule)
        return AgentModule(fullComponentName, fullComponentLoadName).am_getOption(option, default)

    def on_terminate(self, componentName, process):
        """Execute callback when a process terminates gracefully.

        :param str componentName: name of the component the process belongs to
        :param process: the terminated process, only its ``pid`` attribute is used
        """
        self.log.info("%s's process with ID: %s has been terminated successfully" % (componentName, process.pid))

    def execute(self):
        """Execute checks for agents, executors, services.

        :returns: S_OK, or S_ERROR if any error was logged during this cycle
        """
        for instanceType in ("executor", "agent", "service"):
            for name, options in getattr(self, instanceType + "s").items():
                # call checkAgent, checkExecutor, checkService
                res = getattr(self, "check" + instanceType.capitalize())(name, options)
                if not res["OK"]:
                    self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res["Message"]))

        res = self.componentControl()
        if not res["OK"]:
            # hosts without Running/Stopped sections are simply not controlled, this is not an error
            if "Stopped does not exist" not in res["Message"] and "Running does not exist" not in res["Message"]:
                self.logError("Failure to control components", res["Message"])

        if not self.errors:
            res = self.checkURLs()
            if not res["OK"]:
                self.logError("Failure to check URLs", res["Message"])
        else:
            self.logError("Something was wrong before, not checking URLs this time")

        # sendNotification resets self.errors, so remember the outcome of the cycle first;
        # otherwise the error return below could never be reached
        cycleHadErrors = bool(self.errors)
        self.sendNotification()

        if cycleHadErrors:
            return S_ERROR("Error during this cycle, check log")

        return S_OK()

    @staticmethod
    def getLastAccessTime(logFileLocation):
        """Return the age of log file.

        :param str logFileLocation: path to the log file
        :returns: S_OK with the time since the last modification of the file as
                  :class:`datetime.timedelta`, S_ERROR if the file cannot be accessed
        """
        lastAccessTime = 0
        try:
            lastAccessTime = os.path.getmtime(logFileLocation)
            lastAccessTime = datetime.fromtimestamp(lastAccessTime)
        except OSError as e:
            return S_ERROR("Failed to access logfile %s: %r" % (logFileLocation, e))

        now = datetime.now()
        age = now - lastAccessTime
        return S_OK(age)

    def restartInstance(self, pid, instanceName, enabled):
        """Kill a process which is then restarted automatically.

        The process and all of its children are terminated; those still alive after
        five seconds are killed.

        :param int pid: process ID of the instance
        :param str instanceName: name of the instance, used for logging and accounting
        :param bool enabled: whether restarting is enabled for this type of component
        :returns: S_OK(NO_RESTART) if restarting is disabled for this instance, S_OK() if
                  the processes were terminated, S_ERROR if terminating them failed
        """
        if not (self.enabled and enabled):
            self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
            self.accounting[instanceName]["Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        if any(pattern in instanceName for pattern in self.doNotRestartInstancePattern):
            self.log.info("Restarting for %s is disabled, please restart it manually" % instanceName)
            self.accounting[instanceName]["Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        try:
            componentProc = psutil.Process(int(pid))
            processesToTerminate = componentProc.children(recursive=True)
            processesToTerminate.append(componentProc)

            for proc in processesToTerminate:
                proc.terminate()

            _gone, alive = psutil.wait_procs(
                processesToTerminate, timeout=5, callback=partial(self.on_terminate, instanceName)
            )
            for proc in alive:
                self.log.info("Forcefully killing process %s" % proc.pid)
                proc.kill()

            return S_OK()

        except psutil.Error as err:
            self.logError("Exception occurred in terminating processes", "%s" % err)
            return S_ERROR()

    def checkService(self, serviceName, options):
        """Ping the service, restart if the ping does not respond.

        :param str serviceName: name of the service
        :param dict options: information about the service, must contain 'System' and 'PID'
        :returns: S_OK, or the S_ERROR of the failed restart
        """
        url = self._getURL(serviceName, options)
        self.log.info("Pinging service", url)
        pingRes = Client().ping(url=url)
        if pingRes["OK"]:
            self.log.info("Service responded OK")
            return S_OK()

        # the service did not answer: do not claim it responded, try to restart it
        self.log.info("Failure pinging service: %s: %s" % (url, pingRes["Message"]))
        res = self.restartInstance(int(options["PID"]), serviceName, self.restartServices)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
            self.log.info("Service %s has been successfully restarted" % serviceName)
        return S_OK()

    def checkAgent(self, agentName, options):
        """Check the age of agent's log file, if it is too old then restart the agent.

        The log is considered too old if it was not modified for at least two hours, or
        for the agent's polling time plus one hour, whichever is larger.

        :param str agentName: name of the agent
        :param dict options: information about the agent, must contain 'PollingTime',
                             'LogFileLocation' and 'PID'
        :returns: S_OK, or S_ERROR if the log file cannot be accessed or the restart failed
        """
        pollingTime, currentLogLocation, pid = (options["PollingTime"], options["LogFileLocation"], options["PID"])
        self.log.info("Checking Agent: %s" % agentName)
        self.log.info("Polling Time: %s" % pollingTime)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        # total_seconds() and not .seconds: the latter wraps around every 24 hours, which
        # would make a log file that is stale for more than a day look recent
        ageSeconds = res["Value"].total_seconds()
        self.log.info("Current log file for %s is %d minutes old" % (agentName, (ageSeconds / MINUTES)))

        maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
        if ageSeconds < maxLogAge:
            return S_OK()

        self.log.info("Current log file is too old for Agent %s" % agentName)
        self.accounting[agentName]["LogAge"] = ageSeconds / MINUTES

        res = self.restartInstance(int(pid), agentName, self.restartAgents)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[agentName]["Treatment"] = "Successfully Restarted"
            self.log.info("Agent %s has been successfully restarted" % agentName)

        return S_OK()

    def checkExecutor(self, executor, options):
        """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors.

        :param str executor: name of the executor
        :param dict options: information about the executor, must contain 'LogFileLocation'
                             and 'PID'
        :returns: S_OK, S_OK(NO_RESTART) if there are no jobs waiting for this executor, or
                  S_ERROR if the log file cannot be accessed, the jobs cannot be obtained or
                  the restart failed
        """
        currentLogLocation = options["LogFileLocation"]
        pid = options["PID"]
        self.log.info("Checking executor: %s" % executor)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        # total_seconds() and not .seconds: the latter wraps around every 24 hours, which
        # would make a log file that is stale for more than a day look recent
        ageSeconds = res["Value"].total_seconds()
        self.log.info("Current log file for %s is %d minutes old" % (executor, (ageSeconds / MINUTES)))

        if ageSeconds < 2 * HOUR:
            return S_OK()

        self.log.info("Current log file is too old for Executor %s" % executor)
        self.accounting[executor]["LogAge"] = ageSeconds / MINUTES

        res = self.checkForCheckingJobs(executor)
        if not res["OK"]:
            return res
        if res["Value"] == NO_CHECKING_JOBS:
            # an idle executor legitimately writes no log: nothing to report for it
            self.accounting.pop(executor, None)
            return S_OK(NO_RESTART)

        res = self.restartInstance(int(pid), executor, self.restartExecutors)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[executor]["Treatment"] = "Successfully Restarted"
            self.log.info("Executor %s has been successfully restarted" % executor)

        return S_OK()

    def checkForCheckingJobs(self, executorName):
        """Check if there are checking jobs with the **executorName** as current MinorStatus.

        :param str executorName: name of the executor
        :returns: S_OK(CHECKING_JOBS) or S_OK(NO_CHECKING_JOBS), S_ERROR if the jobs cannot
                  be obtained
        """
        attrDict = {"Status": "Checking", "MinorStatus": executorName}

        # returns list of jobs IDs
        resJobs = self.jobMonClient.getJobs(attrDict)
        if not resJobs["OK"]:
            self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs["Message"]))
            return resJobs
        if resJobs["Value"]:
            self.log.info('Found %d jobs in "Checking" status for %s' % (len(resJobs["Value"]), executorName))
            return S_OK(CHECKING_JOBS)
        self.log.info('Found no jobs in "Checking" status for %s' % executorName)
        return S_OK(NO_CHECKING_JOBS)

    def componentControl(self):
        """Monitor and control component status as defined in the CS.

        Check for running and stopped components and ensure they have the proper status as defined in the CS
        Registry/Hosts/_HOST_/[Running|Stopped] sections

        :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
           :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
        """
        # get the current status of the components
        resCurrent = self._getCurrentComponentStatus()
        if not resCurrent["OK"]:
            return resCurrent
        currentStatus = resCurrent["Value"]

        resDefault = self._getDefaultComponentStatus()
        if not resDefault["OK"]:
            return resDefault
        defaultStatus = resDefault["Value"]

        # ensure instances are in the right state
        shouldBe = {}
        shouldBe["Run"] = defaultStatus["Run"].intersection(currentStatus["Down"])
        shouldBe["Down"] = defaultStatus["Down"].intersection(currentStatus["Run"])
        # instances which are either installed but not configured, or configured but not installed
        shouldBe["Unknown"] = defaultStatus["All"].symmetric_difference(currentStatus["All"])

        self._ensureComponentRunning(shouldBe["Run"])
        self._ensureComponentDown(shouldBe["Down"])

        for instance in shouldBe["Unknown"]:
            self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

        return S_OK()

    def _getCurrentComponentStatus(self):
        """Get current status for components.

        :returns: S_OK with a dict of sets of ``System__Instance`` identifiers for the keys
                  'Run', 'Down' and 'All', or S_ERROR if the overall status cannot be obtained
        """
        resOverall = self.sysAdminClient.getOverallStatus()
        if not resOverall["OK"]:
            return resOverall
        currentStatus = {"Down": set(), "Run": set(), "All": set()}
        informationDict = resOverall["Value"]
        for systemsDict in informationDict.values():
            for system, instancesDict in systemsDict.items():
                for instanceName, instanceInfoDict in instancesDict.items():
                    identifier = "%s__%s" % (system, instanceName)
                    runitStatus = instanceInfoDict.get("RunitStatus")
                    # instances with any other RunitStatus are ignored
                    if runitStatus in ("Run", "Down"):
                        currentStatus[runitStatus].add(identifier)

        currentStatus["All"] = currentStatus["Run"] | currentStatus["Down"]
        return S_OK(currentStatus)

    def _getDefaultComponentStatus(self):
        """Get the configured status of the components.

        :returns: S_OK with a dict of sets of instance identifiers for the keys 'Run', 'Down'
                  and 'All', or S_ERROR if the configuration cannot be read or an instance is
                  configured as both running and stopped
        """
        host = socket.getfqdn()
        defaultStatus = {"Down": set(), "Run": set(), "All": set()}
        resRunning = gConfig.getOptionsDict(Path.cfgPath("/Registry/Hosts/", host, "Running"))
        resStopped = gConfig.getOptionsDict(Path.cfgPath("/Registry/Hosts/", host, "Stopped"))
        if not resRunning["OK"]:
            return resRunning
        if not resStopped["OK"]:
            return resStopped
        defaultStatus["Run"] = set(resRunning["Value"])
        defaultStatus["Down"] = set(resStopped["Value"])
        defaultStatus["All"] = defaultStatus["Run"] | defaultStatus["Down"]

        if defaultStatus["Run"].intersection(defaultStatus["Down"]):
            self.logError("Overlap in configuration", str(defaultStatus["Run"].intersection(defaultStatus["Down"])))
            return S_ERROR("Bad host configuration")

        return S_OK(defaultStatus)

    def _ensureComponentRunning(self, shouldBeRunning):
        """Ensure the correct components are running.

        The components are only started if ``controlComponents`` is set, otherwise the
        required action is just recorded for the notification.

        :param shouldBeRunning: iterable of ``System__Instance`` identifiers
        """
        for instance in shouldBeRunning:
            self.log.info("Starting instance %s" % instance)
            system, name = instance.split("__")
            if self.controlComponents:
                res = self.sysAdminClient.startComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to start component:", "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance]["Treatment"] = "Instance was down, started instance"
            else:
                self.accounting[instance]["Treatment"] = "Instance is down, should be started"

    def _ensureComponentDown(self, shouldBeDown):
        """Ensure the correct components are not running.

        The components are only stopped if ``controlComponents`` is set, otherwise the
        required action is just recorded for the notification.

        :param shouldBeDown: iterable of ``System__Instance`` identifiers
        """
        for instance in shouldBeDown:
            self.log.info("Stopping instance %s" % instance)
            system, name = instance.split("__")
            if self.controlComponents:
                res = self.sysAdminClient.stopComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to stop component:", "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
            else:
                self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

    def checkURLs(self):
        """Ensure that the running services have their URL in the Config.

        :returns: S_OK, or S_ERROR if the services cannot be obtained or the commit to
                  the CS failed
        """
        self.log.info("Checking URLs")
        # get services again, in case they were started/stop in controlComponents
        gConfig.forceRefresh(fromMaster=True)

        # get port used for https based services
        try:
            tornadoSystemInstance = PathFinder.getSystemInstance(
                system="Tornado",
                setup=self.setup,
            )
            # NOTE(review): other system options live below "/Systems/", this path says
            # "/System/" -- looks like a typo which makes the default always win; confirm
            self._tornadoPort = gConfig.getValue(
                Path.cfgPath("/System/Tornado/", tornadoSystemInstance, "Port"),
                self._tornadoPort,
            )
        except RuntimeError:
            # keep the previously known port if the Tornado system instance cannot be resolved
            pass

        self.log.debug("Using Tornado Port:", self._tornadoPort)

        res = self.getRunningInstances(instanceType="Services", runitStatus="All")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]
        for service, options in sorted(self.services.items()):
            self.log.debug("Checking URL for %s with options %s" % (service, options))
            # ignore SystemAdministrator, does not have URLs
            if "SystemAdministrator" in service:
                continue
            self._checkServiceURL(service, options)

        if self.csAPI.csModified and self.commitURLs:
            self.log.info("Commiting changes to the CS")
            result = self.csAPI.commit()
            if not result["OK"]:
                self.logError("Commit to CS failed", result["Message"])
                return S_ERROR("Failed to commit to CS")
        return S_OK()

    def _checkServiceURL(self, serviceName, options):
        """Ensure service URL is properly configured in the CS.

        The URL of a running service is added to the configured URLs, the URL of a stopped
        service is removed. The modification is only staged in ``self.csAPI``, committing
        is left to the caller.

        :param str serviceName: name of the service
        :param dict options: information about the service, must contain 'System', 'Module'
                             and 'RunitStatus'
        """
        url = self._getURL(serviceName, options)
        system = options["System"]
        module = options["Module"]
        self.log.info("Checking URLs for %s/%s" % (system, module))
        urlsConfigPath = Path.cfgPath(PathFinder.getSystemURLSection(system=system, setup=self.setup), module)
        urls = gConfig.getValue(urlsConfigPath, [])
        self.log.debug("Found configured URLs for %s: %s" % (module, urls))
        self.log.debug("This URL is %s" % url)
        runitStatus = options["RunitStatus"]
        # without commitURLs this is a dry run, make that visible in the messages
        wouldHave = "Would have " if not self.commitURLs else ""
        if runitStatus == "Run" and url not in urls:
            urls.append(url)
            message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
        if runitStatus == "Down" and url in urls:
            urls.remove(url)
            message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

    def _getURL(self, serviceName, options):
        """Return URL for the service.

        :param str serviceName: name of the service
        :param dict options: information about the service, must contain 'System'; 'Port'
                             defaults to the tornado port and 'Protocol' to 'dips'
        :returns: URL of the service on this host
        """
        system = options["System"]
        port = options.get("Port", self._tornadoPort)
        host = socket.getfqdn()
        protocol = options.get("Protocol", "dips")
        url = "%s://%s:%s/%s/%s" % (protocol, host, port, system, serviceName)
        return url
change = False if newSite: gLogger.notice("Adding new site to CS: %s" % diracSiteName) csAPI.setOption("%s/Name" % cfgBase, gridSiteName) gLogger.notice("Adding CEs: %s" % ','.join(ces)) csAPI.setOption("%s/CE" % cfgBase, ','.join(ces)) change = True else: cesCS = set(gConfig.getValue("%s/CE" % cfgBase, [])) ces = set(ces) newCEs = ces - cesCS if newCEs: cesCS = cesCS.union(ces) gLogger.notice("Adding CEs %s" % ','.join(newCEs)) cesCS = cesCS.union(ces) csAPI.modifyValue("%s/CE" % cfgBase, ','.join(cesCS)) change = True if change: res = csAPI.commitChanges() if not res['OK']: gLogger.error("Failed to commit changes to CS", res['Message']) DIRACExit(-1) else: if newSite: gLogger.notice( "Successfully added site %s to the CS with name %s and CEs: %s" % (diracSiteName, gridSiteName, ','.join(ces))) else: gLogger.notice("Successfully added new CEs to site %s: %s" % (diracSiteName, ','.join(newCEs)))
class DiracAdmin(API):
  """ Administrative functionalities: thin client-side wrappers around the WMS
      administrator service, the proxy manager, the CS API, the notification
      client and the BDII query helpers.
  """

  #############################################################################
  def __init__(self):
    """Internal initialization of the DIRAC Admin API.
    """
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    # Debug flag is on when the configured log level (default 'DEBUG') is DEBUG
    self.dbg = gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG'

    self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp')
    self.currentDir = os.getcwd()

  #############################################################################
  def uploadProxy(self, group):
    """Upload a proxy to the DIRAC WMS.

       Example usage:

         >>> print diracAdmin.uploadProxy('lhcb_pilot')
         {'OK': True, 'Value': 0L}

       :param group: DIRAC Group
       :type group: string
       :return: S_OK,S_ERROR
    """
    return gProxyManager.uploadProxy(diracGroup=group)

  #############################################################################
  def setProxyPersistency(self, userDN, userGroup, persistent=True):
    """Set the persistence of a proxy in the Proxy Manager

       Example usage:

         >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )
         {'OK': True }

       :param userDN: User DN
       :type userDN: string
       :param userGroup: DIRAC Group
       :type userGroup: string
       :param persistent: Persistent flag
       :type persistent: boolean
       :return: S_OK,S_ERROR
    """
    return gProxyManager.setPersistency(userDN, userGroup, persistent)

  #############################################################################
  def checkProxyUploaded(self, userDN, userGroup, requiredTime):
    """Ask the Proxy Manager whether the user has a proxy uploaded
       (delegates to gProxyManager.userHasProxy).

       Example usage:

         >>> print diracAdmin.checkProxyUploaded( 'some DN', 'dirac group', 0 )
         {'OK': True, 'Value' : True/False }

       :param userDN: User DN
       :type userDN: string
       :param userGroup: DIRAC Group
       :type userGroup: string
       :param requiredTime: Required life time of the uploaded proxy
                            (presumably in seconds - TODO confirm)
       :return: S_OK,S_ERROR
    """
    return gProxyManager.userHasProxy(userDN, userGroup, requiredTime)

  #############################################################################
  def getSiteMask(self, printOutput=False):
    """Retrieve current site mask from WMS Administrator service.

       Example usage:

         >>> print diracAdmin.getSiteMask()
         {'OK': True, 'Value': 0L}

       :param printOutput: if True, sort the returned list in place and print
                           one site per line
       :type printOutput: boolean
       :return: S_OK,S_ERROR
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getSiteMask()
    if result['OK'] and printOutput:
      sites = result['Value']
      sites.sort()
      for site in sites:
        print(site)

    return result

  #############################################################################
  def getBannedSites(self, gridType=None, printOutput=False):
    """Retrieve current list of banned sites, i.e. the sites defined under
       /Resources/Sites that are not present in the site mask.

       Example usage:

         >>> print diracAdmin.getBannedSites()
         {'OK': True, 'Value': []}

       :param gridType: grid types to consider; when empty all the sections
                        found under /Resources/Sites are used
       :type gridType: list of strings
       :param printOutput: print the banned sites, one per line
       :type printOutput: boolean
       :return: S_OK,S_ERROR
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')

    result = wmsAdmin.getSiteMask()
    if not result['OK']:
      self.log.warn(result['Message'])
      return result
    allowedSites = set(result['Value'])

    if not gridType:
      result = gConfig.getSections('/Resources/Sites')
      if not result['OK']:
        return result
      gridType = result['Value']

    totalList = []
    for grid in gridType:
      result = gConfig.getSections('/Resources/Sites/%s' % grid)
      if not result['OK']:
        return result
      totalList += result['Value']

    bannedSites = sorted(site for site in totalList if site not in allowedSites)
    if printOutput:
      print('\n'.join(bannedSites))
    return S_OK(bannedSites)

  #############################################################################
  def getSiteSection(self, site, printOutput=False):
    """Simple utility to get the options of the CS section of a DIRAC site.
       The grid type is taken from the part of the site name before the first dot.

       Example usage:

         >>> print diracAdmin.getSiteSection('LCG.CERN.ch')
         {'OK': True, 'Value':}

       :param site: DIRAC site name
       :type site: string
       :param printOutput: pretty-print the options on success
       :type printOutput: boolean
       :return: S_OK,S_ERROR
    """
    gridType = site.split('.')[0]
    if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']:
      return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType))

    result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site))
    if printOutput and result['OK']:
      print(self.pPrint.pformat(result['Value']))
    return result

  #############################################################################
  def addSiteInMask(self, site, comment, printOutput=False):
    """Adds the site to the site mask.

       Example usage:

         >>> print diracAdmin.addSiteInMask()
         {'OK': True, 'Value': }

       :param site: DIRAC site name
       :param comment: comment passed to the allowSite call
       :param printOutput: print a confirmation message on success
       :return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    mask = self.getSiteMask()
    if not mask['OK']:
      return mask
    if site in mask['Value']:
      return S_ERROR('Site %s already in mask of allowed sites' % site)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.allowSite(site, comment)
    if not result['OK']:
      return result

    if printOutput:
      print('Allowing %s in site mask' % site)

    return result

  #############################################################################
  def getSiteMaskLogging(self, site=None, printOutput=False):
    """Retrieves site mask logging information.

       Example usage:

         >>> print diracAdmin.getSiteMaskLogging('LCG.AUVER.fr')
         {'OK': True, 'Value': }

       :param site: DIRAC site name; when not given the information for all
                    the sites is requested
       :param printOutput: print the logging records
       :return: S_OK,S_ERROR
    """
    # Only validate an explicitly requested site: validating None would always
    # fail and make the "all sites" mode unreachable
    if site:
      result = self.__checkSiteIsValid(site)
      if not result['OK']:
        return result

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getSiteMaskLogging(site)
    if not result['OK']:
      return result

    if site and site not in result['Value']:
      return S_ERROR('Site mask information not available for %s' % (site))

    if printOutput:
      if site:
        print('\nSite Mask Logging Info for %s\n' % site)
      else:
        print('\nAll Site Mask Logging Info\n')

      # A distinct loop variable keeps the 'site' argument intact, so that the
      # per-site header is printed when several sites are listed
      for siteName, tupleList in result['Value'].items():
        if not site:
          print('\n===> %s\n' % siteName)
        for tup in tupleList:
          print(str(tup[0]).ljust(8) + str(tup[1]).ljust(20) +
                '( ' + str(tup[2]) + ' ) "' + str(tup[3]) + '"')
        print(' ')
    return result

  #############################################################################
  def banSiteFromMask(self, site, comment, printOutput=False):
    """Removes the site from the site mask.

       Example usage:

         >>> print diracAdmin.banSiteFromMask()
         {'OK': True, 'Value': }

       :param site: DIRAC site name
       :param comment: comment passed to the banSite call
       :param printOutput: print a confirmation message on success
       :return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    mask = self.getSiteMask()
    if not mask['OK']:
      return mask
    if site not in mask['Value']:
      return S_ERROR('Site %s is already banned' % site)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.banSite(site, comment)
    if not result['OK']:
      return result

    if printOutput:
      print('Removing %s from site mask' % site)

    return result

  #############################################################################
  @classmethod
  def __checkSiteIsValid(cls, site):
    """Internal function to check that a site name is valid, i.e. that it is
       one of the keys returned by getSiteCEMapping().

       :param site: DIRAC site name
       :return: S_OK,S_ERROR
    """
    sites = getSiteCEMapping()
    if not sites['OK']:
      return S_ERROR('Could not get site CE mapping')
    if site not in sites['Value']:
      return S_ERROR('Specified site %s is not in list of defined sites' % site)

    return S_OK('%s is valid' % site)

  #############################################################################
  def clearMask(self):
    """Removes all sites from the site mask.  Should be used with care.

       Example usage:

         >>> print diracAdmin.clearMask()
         {'OK': True, 'Value':''}

       :return: S_OK,S_ERROR
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.clearMask()

  #############################################################################
  def getServicePorts(self, setup='', printOutput=False):
    """Checks the service ports for the specified setup.  If not given this is
       taken from the current installation (/DIRAC/Setup)

       Example usage:

         >>> print diracAdmin.getServicePorts()
         {'OK': True, 'Value':''}

       :param setup: DIRAC setup name
       :param printOutput: pretty-print the resulting dictionary
       :return: S_OK( { 'System/Service' : port } ), S_ERROR
    """
    if not setup:
      setup = gConfig.getValue('/DIRAC/Setup', '')

    setupList = gConfig.getSections('/DIRAC/Setups', [])
    if not setupList['OK']:
      return S_ERROR('Could not get /DIRAC/Setups sections')
    setupList = setupList['Value']
    if setup not in setupList:
      return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList)))

    serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
    if not serviceSetups['OK']:
      return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)
    serviceSetups = serviceSetups['Value']  # dict

    systemList = gConfig.getSections('/Systems')
    if not systemList['OK']:
      return S_ERROR('Could not get Systems sections')
    systemList = systemList['Value']

    result = {}
    for system in systemList:
      if system in serviceSetups:
        path = '/Systems/%s/%s/Services' % (system, serviceSetups[system])
        servicesList = gConfig.getSections(path)
        if not servicesList['OK']:
          self.log.warn('Could not get sections in %s' % path)
        else:
          servicesList = servicesList['Value']
          if not servicesList:
            servicesList = []
          self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList)))
          for service in servicesList:
            spath = '%s/%s/Port' % (path, service)
            servicePort = gConfig.getValue(spath, 0)
            if servicePort:
              self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort))
              result['%s/%s' % (system, service)] = servicePort
            else:
              self.log.warn('No port found for %s' % spath)
      else:
        self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup))

    if printOutput:
      print(self.pPrint.pformat(result))

    return S_OK(result)

  #############################################################################
  def getProxy(self, userDN, userGroup, validity=43200, limited=False):
    """Downloads a proxy from the Proxy Manager; 'validity' is passed as the
       required time left (default 43200).

       Example usage:

         >>> print diracAdmin.getProxy()
         {'OK': True, 'Value': }

       :param userDN: User DN
       :param userGroup: DIRAC Group
       :param validity: required time left for the proxy
       :param limited: request a limited proxy
       :return: S_OK,S_ERROR
    """
    return gProxyManager.downloadProxy(userDN, userGroup, limited=limited,
                                       requiredTimeLeft=validity)

  #############################################################################
  def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False):
    """Downloads a proxy with VOMS extensions from the Proxy Manager;
       'validity' is passed as the required time left (default 43200).

       Example usage:

         >>> print diracAdmin.getVOMSProxy()
         {'OK': True, 'Value': }

       :param userDN: User DN
       :param userGroup: DIRAC Group
       :param vomsAttr: required VOMS attribute
       :param validity: required time left for the proxy
       :param limited: request a limited proxy
       :return: S_OK,S_ERROR
    """
    return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited,
                                           requiredVOMSAttribute=vomsAttr,
                                           requiredTimeLeft=validity)

  #############################################################################
  def getPilotProxy(self, userDN, userGroup, validity=43200):
    """Retrieves a pilot proxy from the Proxy Manager; 'validity' is passed as
       the required time left (default 43200).

       Example usage:

         >>> print diracAdmin.getPilotProxy()
         {'OK': True, 'Value': }

       :param userDN: User DN
       :param userGroup: DIRAC Group
       :param validity: required time left for the proxy
       :return: S_OK,S_ERROR
    """
    return gProxyManager.getPilotProxyFromDIRACGroup(userDN, userGroup, requiredTimeLeft=validity)

  #############################################################################
  def resetJob(self, jobID):
    """Reset a job or list of jobs in the WMS.  This operation resets the reschedule
       counter for a job or list of jobs and allows them to run as new.

       Example::

         >>> print dirac.reset(12345)
         {'OK': True, 'Value': [12345]}

       :param jobID: JobID
       :type jobID: integer or list of integers
       :return: S_OK,S_ERROR
    """
    if isinstance(jobID, basestring):
      try:
        jobID = int(jobID)
      except (TypeError, ValueError) as x:
        return self._errorReport(str(x), 'Expected integer or convertible integer for existing jobID')
    elif isinstance(jobID, list):
      try:
        jobID = [int(job) for job in jobID]
      except (TypeError, ValueError) as x:
        return self._errorReport(str(x), 'Expected integer or convertible integer for existing jobIDs')

    jobManager = RPCClient('WorkloadManagement/JobManager', useCertificates=False)
    return jobManager.resetJob(jobID)

  #############################################################################
  def getJobPilotOutput(self, jobID, directory=''):
    """Retrieve the pilot output for an existing job in the WMS.
       The output is written to <directory>/pilot_<jobID>/std.out and std.err;
       the current working directory (at construction time) is used when no
       directory is given.  An already existing output directory is an error.

         >>> print dirac.getJobPilotOutput(12345)
         {'OK': True, StdOut:'',StdError:''}

       :param jobID: JobID
       :type jobID: integer or string
       :param directory: existing directory where the output directory is created
       :return: S_OK,S_ERROR
    """
    if not directory:
      directory = self.currentDir

    if not os.path.exists(directory):
      return self._errorReport('Directory %s does not exist' % directory)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getJobPilotOutput(jobID)
    if not result['OK']:
      return result

    outputPath = '%s/pilot_%s' % (directory, jobID)
    if os.path.exists(outputPath):
      self.log.info('Remove %s and retry to continue' % outputPath)
      return S_ERROR('Remove %s and retry to continue' % outputPath)

    self.log.verbose('Creating directory %s' % outputPath)
    os.mkdir(outputPath)

    outputs = result['Value']
    if 'StdOut' in outputs:
      stdout = '%s/std.out' % (outputPath)
      with open(stdout, 'w') as fopen:
        fopen.write(outputs['StdOut'])
      self.log.verbose('Standard output written to %s' % (stdout))
    else:
      self.log.warn('No standard output returned')

    if 'StdError' in outputs:
      stderr = '%s/std.err' % (outputPath)
      with open(stderr, 'w') as fopen:
        fopen.write(outputs['StdError'])
      self.log.verbose('Standard error written to %s' % (stderr))
    else:
      self.log.warn('No standard error returned')

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result

  #############################################################################
  def getPilotOutput(self, gridReference, directory=''):
    """Retrieve the pilot output (std.out and std.err) for a pilot reference.
       The output is written to <directory>/pilot_<last part of the reference>;
       an already existing output directory is an error.

         >>> print dirac.getPilotOutput('some pilot reference')
         {'OK': True, 'Value': {}}

       :param gridReference: Pilot Job Reference
       :type gridReference: string
       :param directory: existing directory where the output directory is created
       :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    if not directory:
      directory = self.currentDir

    if not os.path.exists(directory):
      return self._errorReport('Directory %s does not exist' % directory)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilotOutput(gridReference)
    if not result['OK']:
      return result

    # A reference ending with '/' yields an empty last component
    gridReferenceSmall = gridReference.split('/')[-1]
    if not gridReferenceSmall:
      gridReferenceSmall = 'reference'
    outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall)

    if os.path.exists(outputPath):
      self.log.info('Remove %s and retry to continue' % outputPath)
      return S_ERROR('Remove %s and retry to continue' % outputPath)

    self.log.verbose('Creating directory %s' % outputPath)
    os.mkdir(outputPath)

    outputs = result['Value']
    if 'StdOut' in outputs:
      stdout = '%s/std.out' % (outputPath)
      with open(stdout, 'w') as fopen:
        fopen.write(outputs['StdOut'])
      self.log.info('Standard output written to %s' % (stdout))
    else:
      self.log.warn('No standard output returned')

    if 'StdErr' in outputs:
      stderr = '%s/std.err' % (outputPath)
      with open(stderr, 'w') as fopen:
        fopen.write(outputs['StdErr'])
      self.log.info('Standard error written to %s' % (stderr))
    else:
      self.log.warn('No standard error returned')

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result

  #############################################################################
  def getPilotInfo(self, gridReference):
    """Retrieve info relative to a pilot reference

         >>> print dirac.getPilotInfo(12345)
         {'OK': True, 'Value': {}}

       :param gridReference: Pilot Job Reference
       :type gridReference: string
       :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.getPilotInfo(gridReference)

  #############################################################################
  def killPilot(self, gridReference):
    """Kill the pilot specified

         >>> print dirac.killPilot('some pilot reference')
         {'OK': True, 'Value': {}}

       :param gridReference: Pilot Job Reference
       :type gridReference: string
       :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.killPilot(gridReference)

  #############################################################################
  def getPilotLoggingInfo(self, gridReference):
    """Retrieve the pilot logging info for a pilot reference.

         >>> print dirac.getPilotLoggingInfo(12345)
         {'OK': True, 'Value': {"The output of the command"}}

       :param gridReference: Grid pilot job reference Id
       :type gridReference: string
       :return: S_OK,S_ERROR
    """
    # Same string check as the other pilot methods of this class
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.getPilotLoggingInfo(gridReference)

  #############################################################################
  def getJobPilots(self, jobID):
    """Extract the list of submitted pilots and their status for a given
       jobID from the WMS.  Useful information is printed to the screen.

         >>> print dirac.getJobPilots()
         {'OK': True, 'Value': {PilotID:{StatusDict}}}

       :param jobID: JobID
       :type jobID: integer or string
       :return: S_OK,S_ERROR
    """
    if isinstance(jobID, basestring):
      try:
        jobID = int(jobID)
      except (TypeError, ValueError) as x:
        return self._errorReport(str(x), 'Expected integer or string for existing jobID')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilots(jobID)
    if result['OK']:
      print(self.pPrint.pformat(result['Value']))
    return result

  #############################################################################
  def getPilotSummary(self, startDate='', endDate=''):
    """Retrieve the pilot summary from the WMS Administrator service and print
       it as a table with one line per CE; the full dictionary of results is
       also returned.

         >>> print dirac.getPilotSummary()
         {'OK': True, 'Value': {CE:{Status:Count}}}

       :param startDate: start date passed to the service
       :param endDate: end date passed to the service
       :return: S_OK,S_ERROR
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilotSummary(startDate, endDate)
    if not result['OK']:
      return result

    ceDict = result['Value']

    # One 'Status Count' column pair per state of the CE having the most states
    maxStates = max([len(summary) for summary in ceDict.values()] or [0])
    statusColumns = ('Status'.ljust(12) + 'Count'.ljust(12)) * maxStates
    print('CE'.ljust(28) + statusColumns)

    for ce, summary in ceDict.items():
      line = ce.ljust(28)
      for state in sorted(summary):
        line += state.ljust(12) + str(summary[state]).ljust(12)
      print(line)

    return result

  #############################################################################
  def selectRequests(self, jobID=None, requestID=None, requestName=None,
                     requestType=None, status=None, operation=None, ownerDN=None,
                     ownerGroup=None, requestStart=0, limit=100, printOutput=False):
    """Select requests from the request management system. A few notes on the selection criteria:

         - jobID is the WMS JobID for the request (if applicable)
         - requestID is assigned during submission of the request
         - requestName is the corresponding XML file name
         - requestType e.g. 'transfer'
         - status e.g. Done
         - operation e.g. replicateAndRegister
         - requestStart e.g. the first request to consider (start from 0 by default)
         - limit e.g. selection limit (default 100)

       Only the criteria with a non-empty value are sent, converted to strings.

         >>> dirac.selectRequests(jobID='4894')
         {'OK': True, 'Value': [[<Requests>]]}

       :return: S_OK,S_ERROR
    """
    options = {'RequestID': requestID, 'RequestName': requestName, 'JobID': jobID,
               'OwnerDN': ownerDN, 'OwnerGroup': ownerGroup, 'RequestType': requestType,
               'Status': status, 'Operation': operation}

    conditions = {}
    for key, value in options.items():
      if value:
        try:
          conditions[key] = str(value)
        except Exception as x:
          return self._errorReport(str(x), 'Expected string for %s field' % key)

    try:
      requestStart = int(requestStart)
      limit = int(limit)
    except Exception as x:
      return self._errorReport(str(x), 'Expected integer for %s field' % limit)

    self.log.verbose('Will select requests with the following conditions')
    self.log.verbose(self.pPrint.pformat(conditions))
    requestClient = RPCClient("RequestManagement/centralURL")
    result = requestClient.getRequestSummaryWeb(conditions, [], requestStart, limit)
    if not result['OK']:
      self.log.warn(result['Message'])
      return result

    requestIDs = result['Value']
    conds = ['%s = %s' % (key, value) for key, value in conditions.items() if value]
    summaryLine = '%s request(s) selected with conditions %s and limit %s' % (len(requestIDs['Records']),
                                                                               ', '.join(conds), limit)
    self.log.verbose(summaryLine)
    if printOutput:
      print(summaryLine)
      print(requestIDs['ParameterNames'])
      # Never show more than 'limit' records
      for request in requestIDs['Records'][:limit]:
        print(request)
    if not requestIDs:
      return S_ERROR('No requests selected for conditions: %s' % conditions)
    return result

  #############################################################################
  def getRequestSummary(self, printOutput=False):
    """ Get a summary of the requests in the request DB.

        :param printOutput: pretty-print the summary
        :return: S_OK,S_ERROR
    """
    requestClient = RPCClient("RequestManagement/centralURL", timeout=120)
    result = requestClient.getDBSummary()
    if not result['OK']:
      self.log.warn(result['Message'])
      return result

    if printOutput:
      print(self.pPrint.pformat(result['Value']))

    return result

  #############################################################################
  def getExternalPackageVersions(self):
    """ Simple function that attempts to obtain the external versions for
        the local DIRAC installation (frequently needed for debugging purposes).
        Import failures are only logged: S_OK() is returned in any case.
    """
    gLogger.info('DIRAC version v%dr%d build %d' % (DIRAC.majorVersion, DIRAC.minorVersion, DIRAC.patchLevel))
    try:
      import lcg_util
      infoStr = 'Using lcg_util from: \n%s' % lcg_util.__file__
      gLogger.info(infoStr)
      infoStr = "The version of lcg_utils is %s" % lcg_util.lcg_util_version()
      gLogger.info(infoStr)
    except Exception as x:
      errStr = "SRM2Storage.__init__: Failed to import lcg_util: %s" % (x)
      gLogger.exception(errStr)

    try:
      import gfalthr as gfal
      infoStr = "Using gfalthr from: \n%s" % gfal.__file__
      gLogger.info(infoStr)
      infoStr = "The version of gfalthr is %s" % gfal.gfal_version()
      gLogger.info(infoStr)
    except Exception as x:
      errStr = "SRM2Storage.__init__: Failed to import gfalthr: %s." % (x)
      gLogger.warn(errStr)
      # Fall back to the non-threaded gfal module
      try:
        import gfal
        infoStr = "Using gfal from: %s" % gfal.__file__
        gLogger.info(infoStr)
        infoStr = "The version of gfal is %s" % gfal.gfal_version()
        gLogger.info(infoStr)
      except Exception as x:
        errStr = "SRM2Storage.__init__: Failed to import gfal: %s" % (x)
        gLogger.exception(errStr)

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    gLogger.info('Default list of protocols are: %s' % (', '.join(defaultProtocols)))
    return S_OK()

  #############################################################################
  def getSiteProtocols(self, site, printOutput=False):
    """ Allows to check the defined protocols for each site SE.
        Only the SE sections with ProtocolName 'SRM2' are considered; the
        default protocols are reported for an SE without a ProtocolsList.

        :param site: DIRAC site name
        :param printOutput: print a summary table
        :return: S_OK( { SE : list of protocols } ), S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
      return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    # The comma matters: without it the message literal becomes the separator
    # of the join instead of the log message
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

    seInfo = {}
    siteSEs.sort()
    for se in siteSEs:
      sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
      if not sections['OK']:
        return sections
      for section in sections['Value']:
        if gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2':
          path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
          seProtocols = gConfig.getValue(path, [])
          if not seProtocols:
            seProtocols = defaultProtocols
          seInfo[se] = seProtocols

    if printOutput:
      print('\nSummary of protocols for StorageElements at site %s' % site)
      print('\nStorageElement'.ljust(30) + 'ProtocolsList'.ljust(30) + '\n')
      for se, protocols in seInfo.items():
        print(se.ljust(30) + ', '.join(protocols).ljust(30))

    return S_OK(seInfo)

  #############################################################################
  def setSiteProtocols(self, site, protocolsList, printOutput=False):
    """ Allows to set the defined protocols for each SE for a given site.
        Every requested protocol must be in the default list of protocols.
        The user is prompted for confirmation before the CS is modified; only
        the SE sections with ProtocolName 'SRM2' are updated.

        :param site: DIRAC site name
        :param protocolsList: protocols to set
        :type protocolsList: list of strings
        :param printOutput: print the outcome of the operation
        :return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
      return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

    for protocol in protocolsList:
      if protocol not in defaultProtocols:
        return S_ERROR('Requested to set protocol %s in list but %s is not '
                       'in default list of protocols:\n%s' % (protocol, protocol, ', '.join(defaultProtocols)))

    modifiedCS = False
    result = promptUser('Do you want to add the following default protocols:'
                        ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs)))
    if not result['OK']:
      return result
    if result['Value'].lower() != 'y':
      self.log.always('No protocols will be added')
      return S_OK()

    for se in siteSEs:
      sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
      if not sections['OK']:
        return sections
      for section in sections['Value']:
        if gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2':
          path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
          self.log.verbose('Setting %s to %s' % (path, ', '.join(protocolsList)))
          result = self.csSetOption(path, ', '.join(protocolsList))
          if not result['OK']:
            return result
          modifiedCS = True

    if modifiedCS:
      result = self.csCommitChanges(False)
      if not result['OK']:
        return S_ERROR('CS Commit failed with message = %s' % (result['Message']))
      if printOutput:
        print('Successfully committed changes to CS')
    elif printOutput:
      print('No modifications to CS required')

    return S_OK()

  #############################################################################
  def csSetOption(self, optionPath, optionValue):
    """
    Function to set the value of an option in the CS (delegates to CSAPI.setOption).
    """
    return self.csAPI.setOption(optionPath, optionValue)

  #############################################################################
  def csSetOptionComment(self, optionPath, comment):
    """
    Function to set the comment of an option in the CS.
    """
    return self.csAPI.setOptionComment(optionPath, comment)

  #############################################################################
  def csModifyValue(self, optionPath, newValue):
    """
    Function to modify an existing value in the CS.
    """
    return self.csAPI.modifyValue(optionPath, newValue)

  #############################################################################
  def csRegisterUser(self, username, properties):
    """
    Registers a user in the CS.

        - username: Username of the user (easy;)
        - properties: Dict containing:
            - DN
            - groups : list/tuple of groups the user belongs to
            - <others> : More properties of the user, like mail
    """
    return self.csAPI.addUser(username, properties)

  #############################################################################
  def csDeleteUser(self, user):
    """
    Deletes a user from the CS. Can take a list of users
    """
    return self.csAPI.deleteUsers(user)

  #############################################################################
  def csModifyUser(self, username, properties, createIfNonExistant=False):
    """
    Modify a user in the CS. Takes the same params as in addUser and
    applies the changes
    """
    return self.csAPI.modifyUser(username, properties, createIfNonExistant)

  #############################################################################
  def csListUsers(self, group=False):
    """
    Lists the users in the CS. If no group is specified return all users.
    """
    return self.csAPI.listUsers(group)

  #############################################################################
  def csDescribeUsers(self, mask=False):
    """
    List users and their properties in the CS.
    If a mask is given, only users in the mask will be returned
    """
    return self.csAPI.describeUsers(mask)

  #############################################################################
  def csModifyGroup(self, groupname, properties, createIfNonExistant=False):
    """
    Modify a group in the CS. Takes the same params as in addGroup and
    applies the changes
    """
    return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant)

  #############################################################################
  def csListHosts(self):
    """
    Lists the hosts in the CS
    """
    return self.csAPI.listHosts()

  #############################################################################
  def csDescribeHosts(self, mask=False):
    """
    Gets extended info for the hosts in the CS
    """
    return self.csAPI.describeHosts(mask)

  #############################################################################
  def csModifyHost(self, hostname, properties, createIfNonExistant=False):
    """
    Modify a host in the CS. Takes the same params as in addHost and
    applies the changes
    """
    return self.csAPI.modifyHost(hostname, properties, createIfNonExistant)

  #############################################################################
  def csListGroups(self):
    """
    Lists groups in the CS
    """
    return self.csAPI.listGroups()

  #############################################################################
  def csDescribeGroups(self, mask=False):
    """
    List groups and their properties in the CS.
    If a mask is given, only groups in the mask will be returned
    """
    return self.csAPI.describeGroups(mask)

  #############################################################################
  def csSyncUsersWithCFG(self, usersCFG):
    """
    Synchronize users in cfg with its contents
    """
    return self.csAPI.syncUsersWithCFG(usersCFG)

  #############################################################################
  def csCommitChanges(self, sortUsers=True):
    """
    Commit the changes in the CS
    """
    # NOTE(review): the sortUsers argument is ignored, the commit is always
    # done with sortUsers = False. Kept as is since honouring the argument
    # would change what callers relying on the default get - confirm intent.
    return self.csAPI.commitChanges(sortUsers=False)

  #############################################################################
  def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False):
    """
    Send mail to specified address with body.
    """
    notification = NotificationClient()
    return notification.sendMail(address, subject, body, fromAddress, localAttempt, html)

  #############################################################################
  def sendSMS(self, userName, body, fromAddress=None):
    """
    Send an SMS with the given body to the specified user.
    Bodies longer than 160 characters are rejected.
    """
    if len(body) > 160:
      return S_ERROR('Exceeded maximum SMS length of 160 characters')
    notification = NotificationClient()
    return notification.sendSMS(userName, body, fromAddress)

  #############################################################################
  def getBDIISite(self, site, host=None):
    """
    Get information about site from BDII at host
    """
    return ldapSite(site, host=host)

  #############################################################################
  def getBDIICluster(self, ce, host=None):
    """
    Get information about ce from BDII at host
    """
    return ldapCluster(ce, host=host)

  #############################################################################
  def getBDIICE(self, ce, host=None):
    """
    Get information about ce from BDII at host
    """
    return ldapCE(ce, host=host)

  #############################################################################
  def getBDIIService(self, ce, host=None):
    """
    Get information about ce from BDII at host
    """
    return ldapService(ce, host=host)

  #############################################################################
  def getBDIICEState(self, ce, useVO=voName, host=None):
    """
    Get information about ce state from BDII at host
    """
    return ldapCEState(ce, useVO, host=host)

  #############################################################################
  def getBDIICEVOView(self, ce, useVO=voName, host=None):
    """
    Get information about ce voview from BDII at host
    """
    return ldapCEVOView(ce, useVO, host=host)

  #############################################################################
  def getBDIISE(self, site, useVO=voName, host=None):
    """
    Get information about SA from BDII at host
    """
    return ldapSE(site, useVO, host=host)
class DiracAdmin(API):
  """ Administrative functionalities.

      Thin client-side facade grouping site mask management, proxy handling,
      pilot/job inspection, request selection, Configuration Service (CS) editing,
      notifications and BDII look-ups. Unless stated otherwise every method returns
      the S_OK / S_ERROR structure produced by the underlying call.
  """

  #############################################################################
  def __init__(self):
    """ Internal initialization of the DIRAC Admin API.
    """
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    # Debug flag is on when the LogLevel option equals 'DEBUG' (which is also the
    # fallback used when the option is not set).
    self.dbg = gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG'

    self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp')
    # Directory used as default target by the pilot output retrieval methods
    self.currentDir = os.getcwd()
    # Decides whether allowSite/banSite go through SiteStatus or the WMSAdministrator
    self.rssFlag = ResourceStatus().rssFlag
    self.sitestatus = SiteStatus()

  #############################################################################
  def uploadProxy(self, group):
    """ Upload a proxy to the DIRAC WMS for the given group.

        Example usage:

          >>> print diracAdmin.uploadProxy('lhcb_pilot')
          {'OK': True, 'Value': 0L}

        :param group: DIRAC Group
        :type group: string
        :return: S_OK,S_ERROR
    """
    return gProxyManager.uploadProxy(diracGroup=group)

  #############################################################################
  def setProxyPersistency(self, userDN, userGroup, persistent=True):
    """ Set the persistence of a proxy in the Proxy Manager

        Example usage:

          >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )
          {'OK': True }

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param persistent: Persistent flag
        :type persistent: boolean
        :return: S_OK,S_ERROR
    """
    return gProxyManager.setPersistency(userDN, userGroup, persistent)

  #############################################################################
  def checkProxyUploaded(self, userDN, userGroup, requiredTime):
    """ Ask the Proxy Manager whether the user has an uploaded proxy

        Example usage:

          >>> print diracAdmin.checkProxyUploaded( 'some DN', 'dirac group', 0 )
          {'OK': True, 'Value' : True/False }

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param requiredTime: Required life time of the uploaded proxy
        :return: S_OK,S_ERROR
    """
    return gProxyManager.userHasProxy(userDN, userGroup, requiredTime)

  #############################################################################
  def getSiteMask(self, printOutput=False, status='Active'):
    """ Retrieve the list of sites being in the given state.

        Example usage:

          >>> print diracAdmin.getSiteMask()
          {'OK': True, 'Value': 0L}

        :param printOutput: if True, sort the site list in place and print one site per line
        :param status: site state to select, 'Active' by default
        :return: S_OK,S_ERROR
    """
    result = self.sitestatus.getSites(siteState=status)
    if result['OK'] and printOutput:
      sites = result['Value']
      # In-place sort on purpose: the caller gets the same (sorted) list back
      sites.sort()
      for site in sites:
        print(site)

    return result

  #############################################################################
  def getBannedSites(self, gridType=None, printOutput=False):
    """ Retrieve current list of banned and probing sites.

        Example usage:

          >>> print diracAdmin.getBannedSites()
          {'OK': True, 'Value': []}

        :param gridType: not used by this method, kept for interface compatibility
        :param printOutput: if True, print the merged list, one site per line
        :return: S_OK( sorted list of Banned + Probing sites ),S_ERROR
    """
    bannedSites = self.sitestatus.getSites(siteState='Banned')
    if not bannedSites['OK']:
      return bannedSites

    probingSites = self.sitestatus.getSites(siteState='Probing')
    if not probingSites['OK']:
      return probingSites

    mergedList = sorted(bannedSites['Value'] + probingSites['Value'])

    if printOutput:
      print('\n'.join(mergedList))

    return S_OK(mergedList)

  #############################################################################
  def getSiteSection(self, site, printOutput=False):
    """ Simple utility to get the options of the CS section of a DIRAC site name.

        Example usage:

          >>> print diracAdmin.getSiteSection('LCG.CERN.ch')
          {'OK': True, 'Value':}

        :param site: DIRAC site name, the part before the first dot is taken as grid type
        :param printOutput: if True, pretty-print the options dictionary
        :return: S_OK,S_ERROR
    """
    gridType = site.split('.')[0]
    if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']:
      return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType))

    result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site))
    if printOutput and result['OK']:
      print(self.pPrint.pformat(result['Value']))
    return result

  #############################################################################
  def allowSite(self, site, comment, printOutput=False):
    """ Adds the site to the site mask.

        Example usage:

          >>> print diracAdmin.allowSite()
          {'OK': True, 'Value': }

        :param site: DIRAC site name, must be a defined site
        :param comment: comment forwarded together with the status change
        :param printOutput: if True, print what has been done
        :return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    result = self.getSiteMask(status='Active')
    if not result['OK']:
      return result

    # Nothing to do if the site is already in the Active mask
    if site in result['Value']:
      if printOutput:
        print('Site %s is already Active' % site)
      return S_OK('Site %s is already Active' % site)

    if self.rssFlag:
      result = self.sitestatus.setSiteStatus(site, 'Active', comment)
    else:
      wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
      result = wmsAdmin.allowSite(site, comment)
    if not result['OK']:
      return result

    if printOutput:
      print('Site %s status is set to Active' % site)

    return result

  #############################################################################
  def getSiteMaskLogging(self, site=None, printOutput=False):
    """ Retrieves site mask logging information.

        Example usage:

          >>> print diracAdmin.getSiteMaskLogging('LCG.AUVER.fr')
          {'OK': True, 'Value': }

        :param site: DIRAC site name; it is validated before the service is queried
        :param printOutput: if True, print the logging records
        :return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getSiteMaskLogging(site)
    if not result['OK']:
      return result

    if site and site not in result['Value']:
      return S_ERROR('Site mask information not available for %s' % (site))

    if printOutput:
      if site:
        print('\nSite Mask Logging Info for %s\n' % site)
      else:
        print('\nAll Site Mask Logging Info\n')

      # The loop variable must not be called "site": it would shadow the argument
      # and the per-site header below would test the wrong value.
      for siteName, tupleList in result['Value'].items():
        if not site:
          print('\n===> %s\n' % siteName)
        for tup in tupleList:
          print(str(tup[0]).ljust(8) + str(tup[1]).ljust(20) +
                '( ' + str(tup[2]) + ' )  "' + str(tup[3]) + '"')
        print(' ')

    return result

  #############################################################################
  def banSite(self, site, comment, printOutput=False):
    """ Removes the site from the site mask.

        Example usage:

          >>> print diracAdmin.banSite()
          {'OK': True, 'Value': }

        :param site: DIRAC site name, must be a defined site
        :param comment: comment forwarded together with the status change
        :param printOutput: if True, print what has been done
        :return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    mask = self.getSiteMask(status='Banned')
    if not mask['OK']:
      return mask

    # Nothing to do if the site is already in the Banned list
    if site in mask['Value']:
      if printOutput:
        print('Site %s is already Banned' % site)
      return S_OK('Site %s is already Banned' % site)

    if self.rssFlag:
      result = self.sitestatus.setSiteStatus(site, 'Banned', comment)
    else:
      wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
      result = wmsAdmin.banSite(site, comment)
    if not result['OK']:
      return result

    if printOutput:
      print('Site %s status is set to Banned' % site)

    return result

  #############################################################################
  def __checkSiteIsValid(self, site):
    """ Internal function to check that a site name is valid, i.e. that it is
        one of the keys of the site to CE mapping.

        :return: S_OK,S_ERROR
    """
    sites = getSiteCEMapping()
    if not sites['OK']:
      return S_ERROR('Could not get site CE mapping')
    if site not in sites['Value']:
      return S_ERROR('Specified site %s is not in list of defined sites' % site)

    return S_OK('%s is valid' % site)

  #############################################################################
  def clearMask(self):
    """ Removes all sites from the site mask.  Should be used with care.

        Example usage:

          >>> print diracAdmin.clearMask()
          {'OK': True, 'Value':''}

        :return: S_OK,S_ERROR
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.clearMask()

  #############################################################################
  def getServicePorts(self, setup='', printOutput=False):
    """ Checks the service ports for the specified setup.  If not given this is
        taken from the current installation (/DIRAC/Setup)

        Example usage:

          >>> print diracAdmin.getServicePorts()
          {'OK': True, 'Value':''}

        :param setup: setup name, defaults to the value of /DIRAC/Setup
        :param printOutput: if True, pretty-print the resulting dictionary
        :return: S_OK( { 'System/Service' : port } ),S_ERROR
    """
    if not setup:
      setup = gConfig.getValue('/DIRAC/Setup', '')

    setupList = gConfig.getSections('/DIRAC/Setups', [])
    if not setupList['OK']:
      return S_ERROR('Could not get /DIRAC/Setups sections')
    setupList = setupList['Value']
    if setup not in setupList:
      return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList)))

    serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
    if not serviceSetups['OK']:
      return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)
    serviceSetups = serviceSetups['Value']  # dict

    systemList = gConfig.getSections('/Systems')
    if not systemList['OK']:
      return S_ERROR('Could not get Systems sections')
    systemList = systemList['Value']

    result = {}
    for system in systemList:
      if system not in serviceSetups:
        self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup))
        continue

      path = '/Systems/%s/%s/Services' % (system, serviceSetups[system])
      servicesList = gConfig.getSections(path)
      if not servicesList['OK']:
        # A system without readable Services section is reported but not fatal
        self.log.warn('Could not get sections in %s' % path)
        continue

      servicesList = servicesList['Value'] or []
      self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList)))
      for service in servicesList:
        spath = '%s/%s/Port' % (path, service)
        servicePort = gConfig.getValue(spath, 0)
        if servicePort:
          self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort))
          result['%s/%s' % (system, service)] = servicePort
        else:
          self.log.warn('No port found for %s' % spath)

    if printOutput:
      print(self.pPrint.pformat(result))

    return S_OK(result)

  #############################################################################
  def getProxy(self, userDN, userGroup, validity=43200, limited=False):
    """ Downloads a proxy from the Proxy Manager; validity is passed as the
        required time left (default 43200).

        Example usage:

          >>> print diracAdmin.getProxy()
          {'OK': True, 'Value': }

        :return: S_OK,S_ERROR
    """
    return gProxyManager.downloadProxy(userDN, userGroup, limited=limited,
                                       requiredTimeLeft=validity)

  #############################################################################
  def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False):
    """ Downloads a proxy with VOMS extensions from the Proxy Manager; validity
        is passed as the required time left (default 43200).

        Example usage:

          >>> print diracAdmin.getVOMSProxy()
          {'OK': True, 'Value': }

        :return: S_OK,S_ERROR
    """
    return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited,
                                           requiredVOMSAttribute=vomsAttr,
                                           requiredTimeLeft=validity)

  #############################################################################
  def getPilotProxy(self, userDN, userGroup, validity=43200):
    """ Retrieves a pilot proxy from the Proxy Manager; validity is passed as
        the required time left (default 43200).

        Example usage:

          >>> print diracAdmin.getPilotProxy()
          {'OK': True, 'Value': }

        :return: S_OK,S_ERROR
    """
    return gProxyManager.getPilotProxyFromDIRACGroup(userDN, userGroup,
                                                     requiredTimeLeft=validity)

  #############################################################################
  def resetJob(self, jobID):
    """ Reset a job or list of jobs in the WMS.  This operation resets the reschedule
        counter for a job or list of jobs and allows them to run as new.

        Example::

          >>> print dirac.reset(12345)
          {'OK': True, 'Value': [12345]}

        :param job: JobID
        :type job: integer or list of integers
        :return: S_OK,S_ERROR
    """
    # Strings and lists are converted to integers, anything else is passed as is
    if isinstance(jobID, basestring):
      try:
        jobID = int(jobID)
      except Exception as x:
        return self._errorReport(str(x), 'Expected integer or convertible integer for existing jobID')
    elif isinstance(jobID, list):
      try:
        jobID = [int(job) for job in jobID]
      except Exception as x:
        return self._errorReport(str(x), 'Expected integer or convertible integer for existing jobIDs')

    jobManager = RPCClient('WorkloadManagement/JobManager', useCertificates=False)
    return jobManager.resetJob(jobID)

  #############################################################################
  def getJobPilotOutput(self, jobID, directory=''):
    """ Retrieve the pilot output for an existing job in the WMS.
        The output is written to <directory>/pilot_<jobID>/std.out and std.err;
        the current directory (at construction time) is used if none is given.

          >>> print dirac.getJobPilotOutput(12345)
          {'OK': True, StdOut:'',StdError:''}

        :param job: JobID
        :type job: integer or string
        :return: S_OK,S_ERROR
    """
    if not directory:
      directory = self.currentDir

    if not os.path.exists(directory):
      return self._errorReport('Directory %s does not exist' % directory)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getJobPilotOutput(jobID)
    if not result['OK']:
      return result

    # Never overwrite a previous retrieval: the user has to clean up first
    outputPath = '%s/pilot_%s' % (directory, jobID)
    if os.path.exists(outputPath):
      self.log.info('Remove %s and retry to continue' % outputPath)
      return S_ERROR('Remove %s and retry to continue' % outputPath)

    self.log.verbose('Creating directory %s' % outputPath)
    os.mkdir(outputPath)

    outputs = result['Value']
    if 'StdOut' in outputs:
      stdout = '%s/std.out' % (outputPath)
      with open(stdout, 'w') as fopen:
        fopen.write(outputs['StdOut'])
      self.log.verbose('Standard output written to %s' % (stdout))
    else:
      self.log.warn('No standard output returned')

    if 'StdError' in outputs:
      stderr = '%s/std.err' % (outputPath)
      with open(stderr, 'w') as fopen:
        fopen.write(outputs['StdError'])
      self.log.verbose('Standard error written to %s' % (stderr))
    else:
      self.log.warn('No standard error returned')

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result

  #############################################################################
  def getPilotOutput(self, gridReference, directory=''):
    """ Retrieve the pilot output (std.out and std.err) for a pilot reference.
        The output is written to <directory>/pilot_<last part of the reference>/;
        the current directory (at construction time) is used if none is given.

          >>> print dirac.getPilotOutput('https://some.pilot/reference')
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    if not directory:
      directory = self.currentDir

    if not os.path.exists(directory):
      return self._errorReport('Directory %s does not exist' % directory)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilotOutput(gridReference)
    if not result['OK']:
      return result

    # Last path element of the reference names the directory; a reference
    # ending with '/' gives an empty element, hence the fallback name.
    gridReferenceSmall = gridReference.split('/')[-1] or 'reference'
    outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall)

    # Never overwrite a previous retrieval: the user has to clean up first
    if os.path.exists(outputPath):
      self.log.info('Remove %s and retry to continue' % outputPath)
      return S_ERROR('Remove %s and retry to continue' % outputPath)

    self.log.verbose('Creating directory %s' % outputPath)
    os.mkdir(outputPath)

    outputs = result['Value']
    if 'StdOut' in outputs:
      stdout = '%s/std.out' % (outputPath)
      with open(stdout, 'w') as fopen:
        fopen.write(outputs['StdOut'])
      self.log.info('Standard output written to %s' % (stdout))
    else:
      self.log.warn('No standard output returned')

    if 'StdErr' in outputs:
      stderr = '%s/std.err' % (outputPath)
      with open(stderr, 'w') as fopen:
        fopen.write(outputs['StdErr'])
      self.log.info('Standard error written to %s' % (stderr))
    else:
      self.log.warn('No standard error returned')

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result

  #############################################################################
  def getPilotInfo(self, gridReference):
    """ Retrieve info relative to a pilot reference

          >>> print dirac.getPilotInfo(12345)
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.getPilotInfo(gridReference)

  #############################################################################
  def killPilot(self, gridReference):
    """ Kill the pilot specified

          >>> print dirac.killPilot('https://some.pilot/reference')
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.killPilot(gridReference)

  #############################################################################
  def getPilotLoggingInfo(self, gridReference):
    """ Retrieve the pilot logging info for a pilot reference.

          >>> print dirac.getPilotLoggingInfo(12345)
          {'OK': True, 'Value': {"The output of the command"}}

        :param gridReference: Grid pilot job reference Id
        :type gridReference: string
        :return: S_OK,S_ERROR
    """
    # Same string check as the other pilot methods of this class
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.getPilotLoggingInfo(gridReference)

  #############################################################################
  def getJobPilots(self, jobID):
    """ Extract the list of submitted pilots and their status for a given
        jobID from the WMS.  Useful information is printed to the screen.

          >>> print dirac.getJobPilots()
          {'OK': True, 'Value': {PilotID:{StatusDict}}}

        :param job: JobID
        :type job: integer or string
        :return: S_OK,S_ERROR
    """
    if isinstance(jobID, basestring):
      try:
        jobID = int(jobID)
      except Exception as x:
        return self._errorReport(str(x), 'Expected integer or string for existing jobID')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilots(jobID)
    if result['OK']:
      print(self.pPrint.pformat(result['Value']))
    return result

  #############################################################################
  def getPilotSummary(self, startDate='', endDate=''):
    """ Retrieve the pilot summary per CE from the WMS and print it as a table
        (one line per CE, one Status/Count column pair per state).
        The full dictionary of results is also returned.

          >>> print dirac.getPilotSummary()
          {'OK': True, 'Value': {CE:{Status:Count}}}

        :param startDate: forwarded as is to the WMSAdministrator service
        :param endDate: forwarded as is to the WMSAdministrator service
        :return: S_OK,S_ERROR
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilotSummary(startDate, endDate)
    if not result['OK']:
      return result

    ceDict = result['Value']

    # The header needs as many Status/Count pairs as the CE having most states;
    # the "or [0]" keeps an empty summary from breaking max().
    maxStates = max([len(summary) for summary in ceDict.values()] or [0])
    headers = 'CE'.ljust(28) + ('Status'.ljust(12) + 'Count'.ljust(12)) * maxStates
    print(headers)

    for ce, summary in ceDict.items():
      line = ce.ljust(28)
      for state in sorted(summary):
        line += state.ljust(12) + str(summary[state]).ljust(12)
      print(line)

    return result

  #############################################################################
  def selectRequests(self, jobID=None, requestID=None, requestName=None,
                     requestType=None, status=None, operation=None, ownerDN=None,
                     ownerGroup=None, requestStart=0, limit=100, printOutput=False):
    """ Select requests from the request management system. A few notes on the selection criteria:

          - jobID is the WMS JobID for the request (if applicable)
          - requestID is assigned during submission of the request
          - requestName is the corresponding XML file name
          - requestType e.g. 'transfer'
          - status e.g. Done
          - operation e.g. replicateAndRegister
          - requestStart e.g. the first request to consider (start from 0 by default)
          - limit e.g. selection limit (default 100)

          >>> dirac.selectRequests(jobID='4894')
          {'OK': True, 'Value': [[<Requests>]]}

        :param printOutput: if True, print the selected records
        :return: S_OK,S_ERROR
    """
    options = {'RequestID': requestID, 'RequestName': requestName, 'JobID': jobID,
               'OwnerDN': ownerDN, 'OwnerGroup': ownerGroup, 'RequestType': requestType,
               'Status': status, 'Operation': operation}

    # Only the criteria actually given are sent, all of them as strings
    conditions = {}
    for key, value in options.items():
      if value:
        try:
          conditions[key] = str(value)
        except Exception as x:
          return self._errorReport(str(x), 'Expected string for %s field' % key)

    try:
      requestStart = int(requestStart)
      limit = int(limit)
    except Exception as x:
      return self._errorReport(str(x), 'Expected integer for %s field' % limit)

    self.log.verbose('Will select requests with the following conditions')
    self.log.verbose(self.pPrint.pformat(conditions))
    requestClient = RPCClient("RequestManagement/centralURL")
    result = requestClient.getRequestSummaryWeb(conditions, [], requestStart, limit)
    if not result['OK']:
      self.log.warn(result['Message'])
      return result

    requestIDs = result['Value']
    records = requestIDs['Records']
    conds = ['%s = %s' % (key, value) for key, value in conditions.items() if value]
    summary = '%s request(s) selected with conditions %s and limit %s' % (len(records),
                                                                          ', '.join(conds),
                                                                          limit)
    self.log.verbose(summary)

    if printOutput:
      print(summary)
      print(requestIDs['ParameterNames'])
      # Never print more than "limit" records
      for request in records[:limit]:
        print(request)

    if not requestIDs:
      return S_ERROR('No requests selected for conditions: %s' % conditions)
    return result

  #############################################################################
  def getRequestSummary(self, printOutput=False):
    """ Get a summary of the requests in the request DB.

        :param printOutput: if True, pretty-print the summary
        :return: S_OK,S_ERROR
    """
    requestClient = RPCClient("RequestManagement/centralURL", timeout=120)
    result = requestClient.getDBSummary()
    if not result['OK']:
      self.log.warn(result['Message'])
      return result

    if printOutput:
      print(self.pPrint.pformat(result['Value']))

    return result

  #############################################################################
  def getExternalPackageVersions(self):
    """ Simple function that attempts to obtain the external versions for
        the local DIRAC installation (frequently needed for debugging purposes).

        Every import is tried independently, a failure is only logged.

        :return: S_OK
    """
    gLogger.info('DIRAC version v%dr%d build %d' % (DIRAC.majorVersion,
                                                    DIRAC.minorVersion,
                                                    DIRAC.patchLevel))
    try:
      import lcg_util
      infoStr = 'Using lcg_util from: \n%s' % lcg_util.__file__
      gLogger.info(infoStr)
      infoStr = "The version of lcg_utils is %s" % lcg_util.lcg_util_version()
      gLogger.info(infoStr)
    except Exception as x:
      errStr = "SRM2Storage.__init__: Failed to import lcg_util: %s" % (x)
      gLogger.exception(errStr)

    try:
      import gfalthr as gfal
      infoStr = "Using gfalthr from: \n%s" % gfal.__file__
      gLogger.info(infoStr)
      infoStr = "The version of gfalthr is %s" % gfal.gfal_version()
      gLogger.info(infoStr)
    except Exception as x:
      # The threaded flavour is optional: warn and fall back to plain gfal
      errStr = "SRM2Storage.__init__: Failed to import gfalthr: %s." % (x)
      gLogger.warn(errStr)
      try:
        import gfal
        infoStr = "Using gfal from: %s" % gfal.__file__
        gLogger.info(infoStr)
        infoStr = "The version of gfal is %s" % gfal.gfal_version()
        gLogger.info(infoStr)
      except Exception as x:
        errStr = "SRM2Storage.__init__: Failed to import gfal: %s" % (x)
        gLogger.exception(errStr)

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    gLogger.info('Default list of protocols are: %s' % (', '.join(defaultProtocols)))
    return S_OK()

  #############################################################################
  def getSiteProtocols(self, site, printOutput=False):
    """ Allows to check the defined protocols for each site SE.

        Only SE sections whose ProtocolName is 'SRM2' are considered; an SE
        without ProtocolsList gets the default list of protocols.

        :param site: DIRAC site name, must be a defined site
        :param printOutput: if True, print a table SE / protocols
        :return: S_OK( { SE : list of protocols } ),S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
      return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    # Message and value are two arguments, as in setSiteProtocols
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

    seInfo = {}
    siteSEs.sort()
    for se in siteSEs:
      sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
      if not sections['OK']:
        return sections
      for section in sections['Value']:
        protocolName = gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '')
        if protocolName == 'SRM2':
          path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
          seInfo[se] = gConfig.getValue(path, []) or defaultProtocols

    if printOutput:
      print('\nSummary of protocols for StorageElements at site %s' % site)
      print('\nStorageElement'.ljust(30) + 'ProtocolsList'.ljust(30) + '\n')
      for se, protocols in seInfo.items():
        print(se.ljust(30) + ', '.join(protocols).ljust(30))

    return S_OK(seInfo)

  #############################################################################
  def setSiteProtocols(self, site, protocolsList, printOutput=False):
    """ Allows to set the defined protocols for each SE for a given site.

        Every requested protocol must be in the default list of protocols. The
        user is prompted for confirmation before the CS is modified, and the
        changes are committed only if at least one option has been set.

        :param site: DIRAC site name, must be a defined site
        :param protocolsList: list of protocol names to set
        :param printOutput: if True, print the outcome of the commit
        :return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
      return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

    for protocol in protocolsList:
      if protocol not in defaultProtocols:
        return S_ERROR('Requested to set protocol %s in list but %s is not '
                       'in default list of protocols:\n%s' % (protocol, protocol,
                                                              ', '.join(defaultProtocols)))

    result = promptUser('Do you want to add the following default protocols:'
                        ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs)))
    if not result['OK']:
      return result
    if result['Value'].lower() != 'y':
      self.log.always('No protocols will be added')
      return S_OK()

    modifiedCS = False
    newValue = ', '.join(protocolsList)
    for se in siteSEs:
      sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
      if not sections['OK']:
        return sections
      for section in sections['Value']:
        protocolName = gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '')
        if protocolName == 'SRM2':
          path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
          self.log.verbose('Setting %s to %s' % (path, newValue))
          result = self.csSetOption(path, newValue)
          if not result['OK']:
            return result
          modifiedCS = True

    if not modifiedCS:
      if printOutput:
        print('No modifications to CS required')
      return S_OK()

    result = self.csCommitChanges(False)
    if not result['OK']:
      return S_ERROR('CS Commit failed with message = %s' % (result['Message']))
    if printOutput:
      print('Successfully committed changes to CS')

    return S_OK()

  #############################################################################
  def csSetOption(self, optionPath, optionValue):
    """ Function to set the value of an option in the CS.
    """
    return self.csAPI.setOption(optionPath, optionValue)

  #############################################################################
  def csSetOptionComment(self, optionPath, comment):
    """ Function to set the comment of an option in the CS.
    """
    return self.csAPI.setOptionComment(optionPath, comment)

  #############################################################################
  def csModifyValue(self, optionPath, newValue):
    """ Function to modify an existing value in the CS.
    """
    return self.csAPI.modifyValue(optionPath, newValue)

  #############################################################################
  def csRegisterUser(self, username, properties):
    """ Registers a user in the CS.

          - username: Username of the user (easy;)
          - properties: Dict containing:

            - DN
            - groups : list/tuple of groups the user belongs to
            - <others> : More properties of the user, like mail
    """
    return self.csAPI.addUser(username, properties)

  #############################################################################
  def csDeleteUser(self, user):
    """ Deletes a user from the CS. Can take a list of users
    """
    return self.csAPI.deleteUsers(user)

  #############################################################################
  def csModifyUser(self, username, properties, createIfNonExistant=False):
    """ Modify a user in the CS. Takes the same params as in addUser and
        applies the changes
    """
    return self.csAPI.modifyUser(username, properties, createIfNonExistant)

  #############################################################################
  def csListUsers(self, group=False):
    """ Lists the users in the CS. If no group is specified return all users.
    """
    return self.csAPI.listUsers(group)

  #############################################################################
  def csDescribeUsers(self, mask=False):
    """ List users and their properties in the CS.
        If a mask is given, only users in the mask will be returned
    """
    return self.csAPI.describeUsers(mask)

  #############################################################################
  def csModifyGroup(self, groupname, properties, createIfNonExistant=False):
    """ Modify a group in the CS. Takes the same params as in addGroup and
        applies the changes
    """
    return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant)

  #############################################################################
  def csListHosts(self):
    """ Lists the hosts in the CS
    """
    return self.csAPI.listHosts()

  #############################################################################
  def csDescribeHosts(self, mask=False):
    """ Gets extended info for the hosts in the CS
    """
    return self.csAPI.describeHosts(mask)

  #############################################################################
  def csModifyHost(self, hostname, properties, createIfNonExistant=False):
    """ Modify a host in the CS. Takes the same params as in addHost and
        applies the changes
    """
    return self.csAPI.modifyHost(hostname, properties, createIfNonExistant)

  #############################################################################
  def csListGroups(self):
    """ Lists groups in the CS
    """
    return self.csAPI.listGroups()

  #############################################################################
  def csDescribeGroups(self, mask=False):
    """ List groups and their properties in the CS.
        If a mask is given, only groups in the mask will be returned
    """
    return self.csAPI.describeGroups(mask)

  #############################################################################
  def csSyncUsersWithCFG(self, usersCFG):
    """ Synchronize users in cfg with its contents
    """
    return self.csAPI.syncUsersWithCFG(usersCFG)

  #############################################################################
  def csCommitChanges(self, sortUsers=True):
    """ Commit the changes in the CS

        :param sortUsers: accepted but currently not forwarded, see the note below
    """
    # NOTE(review): the sortUsers argument is ignored, the commit is always done
    # with sortUsers=False. Left untouched so that callers relying on the
    # default see no behaviour change - confirm whether it should be honoured.
    return self.csAPI.commitChanges(sortUsers=False)

  #############################################################################
  def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False):
    """ Send mail to specified address with body.
    """
    notification = NotificationClient()
    return notification.sendMail(address, subject, body, fromAddress, localAttempt, html)

  #############################################################################
  def sendSMS(self, userName, body, fromAddress=None):
    """ Send an SMS with the given body to the specified user.
        Bodies longer than 160 characters are refused with S_ERROR.
    """
    if len(body) > 160:
      return S_ERROR('Exceeded maximum SMS length of 160 characters')
    notification = NotificationClient()
    return notification.sendSMS(userName, body, fromAddress)

  #############################################################################
  def getBDIISite(self, site, host=None):
    """ Get information about site from BDII at host
    """
    return ldapSite(site, host=host)

  #############################################################################
  def getBDIICluster(self, ce, host=None):
    """ Get information about the cluster of ce from BDII at host
    """
    return ldapCluster(ce, host=host)

  #############################################################################
  def getBDIICE(self, ce, host=None):
    """ Get information about ce from BDII at host
    """
    return ldapCE(ce, host=host)

  #############################################################################
  def getBDIIService(self, ce, host=None):
    """ Get information about the service of ce from BDII at host
    """
    return ldapService(ce, host=host)

  #############################################################################
  def getBDIICEState(self, ce, useVO=voName, host=None):
    """ Get information about ce state from BDII at host
    """
    return ldapCEState(ce, useVO, host=host)

  #############################################################################
  def getBDIICEVOView(self, ce, useVO=voName, host=None):
    """ Get information about ce voview from BDII at host
    """
    return ldapCEVOView(ce, useVO, host=host)

  #############################################################################
  def getBDIISE(self, site, useVO=voName, host=None):
    """ Get information about SA from BDII at host
    """
    return ldapSE(site, useVO, host=host)
class Bdii2CSAgent(AgentModule):
    """Agent comparing the CE information published in the BDII with the CS.

    On every cycle (when ProcessCEs is enabled) it

    * reports, in the log and optionally by mail, CEs found in the BDII that
      are not yet known to the CS, and
    * collects the Site/CE/queue changes returned by getSiteUpdates and writes
      them to the CS, unless DryRun is set.
    """

    def __init__(self, *args, **kwargs):
        """Defines default parameters"""
        super(Bdii2CSAgent, self).__init__(*args, **kwargs)

        self.addressTo = ''
        self.addressFrom = ''
        self.voName = []
        self.subject = self.am_getModuleParam('fullName')
        self.alternativeBDIIs = []
        # Per-cycle cache of BDII CE information, keyed by VO (reset in execute)
        self.voBdiiCEDict = {}
        self.voBdiiSEDict = {}
        self.host = 'lcg-bdii.cern.ch:2170'
        self.glue2URLs = []
        self.glue2Only = True

        self.csAPI = None

        # What to get
        self.processCEs = True
        self.selectedSites = []

        # Update the CS or not?
        self.dryRun = False

    def initialize(self):
        """Gets run parameters from the configuration

        :return: S_ERROR if no VO could be determined, otherwise the result of
                 CSAPI.initialize()
        """
        self.addressTo = self.am_getOption('MailTo', self.addressTo)
        self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)
        self.host = self.am_getOption('Host', self.host)
        self.glue2URLs = self.am_getOption('GLUE2URLs', self.glue2URLs)
        self.glue2Only = self.am_getOption('GLUE2Only', self.glue2Only)

        # Create a list of alternative bdii urls. If an url is not followed by
        # a port number, the default 2170 is appended. A new list is built so
        # the list object handed out by am_getOption is not modified in place.
        self.alternativeBDIIs = [
            url if url.split(':')[-1].isdigit() else url + ':2170'
            for url in self.am_getOption('AlternativeBDIIs', self.alternativeBDIIs)
        ]

        if self.addressTo and self.addressFrom:
            self.log.info("MailTo", self.addressTo)
            self.log.info("MailFrom", self.addressFrom)
        if self.alternativeBDIIs:
            self.log.info("AlternativeBDII URLs:", self.alternativeBDIIs)

        self.processCEs = self.am_getOption('ProcessCEs', self.processCEs)
        self.selectedSites = self.am_getOption('SelectedSites', [])
        self.dryRun = self.am_getOption('DryRun', self.dryRun)

        self.voName = self.am_getOption('VirtualOrganization', self.voName)
        if not self.voName:
            self.voName = self.am_getOption('VO', [])
        if not self.voName or (len(self.voName) == 1 and self.voName[0].lower() == 'all'):
            # Get all VOs defined in the configuration
            self.voName = []
            result = getVOs()
            if result['OK']:
                for vo in result['Value']:
                    vomsVO = getVOOption(vo, "VOMSName")
                    if vomsVO:
                        self.voName.append(vomsVO)

        if self.voName:
            self.log.info("Agent will manage VO(s) %s" % self.voName)
        else:
            message = "VirtualOrganization option not defined for agent"
            self.log.fatal(message)
            return S_ERROR(message)

        self.csAPI = CSAPI()
        return self.csAPI.initialize()

    def execute(self):
        """General agent execution method

        :return: always S_OK(); the sub-steps log their own failures
        """
        # Drop the BDII information cached during the previous cycle
        self.voBdiiCEDict = {}

        # Get a "fresh" copy of the CS data
        result = self.csAPI.downloadCSData()
        if not result['OK']:
            self.log.warn("Could not download a fresh copy of the CS data", result['Message'])

        # Refresh the configuration from the master server
        gConfig.forceRefresh(fromMaster=True)

        if self.processCEs:
            self.__lookForNewCEs()
            self.__updateCEs()
        return S_OK()

    def __lookForNewCEs(self):
        """Look up BDII for CEs not yet present in the DIRAC CS

        For every managed VO a report is logged and, when MailTo and MailFrom
        are set, mailed.

        :return: the failed getQueues result if the known queues of a VO cannot
                 be obtained, otherwise S_OK()
        """
        bannedCEs = self.am_getOption('BannedCEs', [])

        for vo in self.voName:
            # get the known CEs for a given VO, so we can know the unknowns, or no longer supported,
            # for a VO
            res = getQueues(community=vo)
            if not res['OK']:
                self.log.error('Failed to get the known queues for VO', "%s: %s" % (vo, res['Message']))
                return res

            knownCEs = set()
            for ces in res['Value'].values():
                knownCEs.update(ces)
            knownCEs.update(bannedCEs)

            result = self.__getBdiiCEInfo(vo)
            if not result['OK']:
                continue
            bdiiInfo = result['Value']

            result = getGridCEs(vo, bdiiInfo=bdiiInfo, ceBlackList=knownCEs)
            if not result['OK']:
                self.log.error('Failed to get unused CEs', result['Message'])
                continue  # next VO

            siteDict = result['Value']
            unknownCEs = set(result['UnknownCEs']) - set(bannedCEs)

            body = ''
            # Sites and CEs are visited in sorted order so that the generated
            # report is deterministic from one cycle to the next.
            for site in sorted(siteDict):
                newCEs = set(siteDict[site])  # pylint: disable=no-member
                if not newCEs:
                    continue

                ceString = ''
                for ce in sorted(newCEs):
                    ceInfo = bdiiInfo[site]['CEs'][ce]
                    newCEString = "CE: %s, GOCDB Site Name: %s" % (ce, site)
                    osString = "%s_%s_%s" % tuple(siteDict[site][ce]['System'])
                    newCEString = "\n%s\n%s\n" % (newCEString, osString)

                    # Only queues whose status mentions "production" are reported
                    queueString = ''
                    for queue, queueInfo in ceInfo['Queues'].items():
                        queueStatus = queueInfo.get('GlueCEStateStatus', 'UnknownStatus')
                        if 'production' in queueStatus.lower():
                            ceType = queueInfo.get('GlueCEImplementationName', '')
                            queueString += " %s %s %s\n" % (queue, queueStatus, ceType)

                    # A CE without any production queue is not reported at all
                    if queueString:
                        ceString += newCEString
                        ceString += "Queues:\n"
                        ceString += queueString

                if ceString:
                    body += ceString

            if siteDict:
                body = "\nWe are glad to inform You about new CE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about CE add its name to BannedCEs list.\n"
                body += "Add new Sites/CEs for vo %s with the command:\n" % vo
                body += "dirac-admin-add-resources --vo %s --ce\n" % vo

            if unknownCEs:
                body += '\n\n'
                body += 'There is no (longer) information about the following CEs for the %s VO.\n' % vo
                body += '\n'.join(sorted(unknownCEs))
                body += '\n\n'

            if body:
                self.log.info(body)
                if self.addressTo and self.addressFrom:
                    notification = NotificationClient()
                    result = notification.sendMail(self.addressTo, self.subject, body, self.addressFrom,
                                                   localAttempt=False, avoidSpam=True)
                    if not result['OK']:
                        self.log.error('Can not send new site notification mail', result['Message'])

        return S_OK()

    def __getBdiiCEInfo(self, vo):
        """Get the CE information of a VO from the configured BDII hosts.

        The result is cached in self.voBdiiCEDict. Information from the
        alternative BDIIs (and, unless GLUE2Only is set, from the GLUE2 URLs)
        is merged first; the default host is merged last, so its entries
        override the others.

        :param vo: VO name
        :return: S_OK(dict), or S_ERROR if nothing was retrieved and at least
                 one request failed
        """
        if vo in self.voBdiiCEDict:
            return S_OK(self.voBdiiCEDict[vo])

        self.log.info("Check for available CEs for VO", vo)
        totalResult = S_OK({})
        message = ''

        mainResult = getBdiiCEInfo(vo, host=self.host, glue2=self.glue2Only)
        if not mainResult['OK']:
            self.log.error("Failed getting information from default bdii", mainResult['Message'])
            message = mainResult['Message']

        for bdii in reversed(self.alternativeBDIIs):
            resultAlt = getBdiiCEInfo(vo, host=bdii, glue2=self.glue2Only)
            if resultAlt['OK']:
                totalResult['Value'].update(resultAlt['Value'])
            else:
                self.log.error("Failed getting information from %s " % bdii, resultAlt['Message'])
                message = (message + "\n" + resultAlt['Message']).strip()

        # The dedicated GLUE2 URLs are only consulted when not in GLUE2-only mode
        if not self.glue2Only:
            for glue2URL in self.glue2URLs:
                resultGlue2 = getBdiiCEInfo(vo, host=glue2URL, glue2=True)
                if resultGlue2['OK']:
                    totalResult['Value'].update(resultGlue2['Value'])
                else:
                    self.log.error("Failed getting GLUE2 information for",
                                   "%s, %s: %s" % (glue2URL, vo, resultGlue2['Message']))
                    message = (message + "\n" + resultGlue2['Message']).strip()

        if mainResult['OK']:
            totalResult['Value'].update(mainResult['Value'])

        if not totalResult['Value'] and message:  # Dict is empty and we have an error message
            self.log.error("Error during BDII request", message)
            totalResult = S_ERROR(message)
        else:
            self.voBdiiCEDict[vo] = totalResult['Value']
            self.__purgeSites(totalResult['Value'])

        return totalResult

    def __updateCEs(self):
        """Update the Site/CE/queue settings in the CS if they were changed in the BDII

        :return: the result of __updateCS for the changes collected over all VOs
        """
        bdiiChangeSet = set()

        for vo in self.voName:
            result = self.__getBdiiCEInfo(vo)
            if not result['OK']:
                continue
            ceBdiiDict = result['Value']
            result = getSiteUpdates(vo, bdiiInfo=ceBdiiDict, log=self.log)
            if not result['OK']:
                continue
            bdiiChangeSet.update(result['Value'])

        # We have collected all the changes, consolidate VO settings
        return self.__updateCS(bdiiChangeSet)

    def __purgeSites(self, ceBdiiDict):
        """Remove all sites that are not in self.selectedSites.

        Modifies the ceBdiiDict! Nothing is removed when no sites are selected.
        """
        if not self.selectedSites:
            return

        # Iterate over a copy of the keys because entries are popped on the way
        for site in list(ceBdiiDict):
            ces = list(ceBdiiDict[site]['CEs'])
            if not ces:
                self.log.error("No CE information for site:", site)
                continue

            # The DIRAC site name is taken from the first CE found in the CS
            siteInCS = 'Not_In_CS'
            for ce in ces:
                res = getCESiteMapping(ce)
                if not res['OK']:
                    self.log.error("Failed to get DIRAC site name for ce", "%s: %s" % (ce, res['Message']))
                    continue
                # if the ce is not in the CS the returned value will be empty
                if ce in res['Value']:
                    siteInCS = res['Value'][ce]
                    break

            self.log.debug("Checking site %s (%s), aka %s" % (site, ces, siteInCS))
            if siteInCS in self.selectedSites:
                continue
            self.log.info("Dropping site %s, aka %s" % (site, siteInCS))
            ceBdiiDict.pop(site)
        return

    def __updateCS(self, bdiiChangeSet):
        """Consolidate the collected changes, report them and write them to the CS.

        "VO" entries for the same section are merged into a single entry holding
        the union of all VOs.

        :param bdiiChangeSet: iterable of (section, option, value, new_value) tuples
        :return: S_OK() when there is nothing to do or in dry-run mode,
                 otherwise the result of CSAPI.commit()
        """
        queueVODict = {}
        changeSet = set()
        for entry in bdiiChangeSet:
            section, option, _value, new_value = entry
            if option == "VO":
                queueVODict.setdefault(section, set()).update(new_value.split(','))
            else:
                changeSet.add(entry)
        for section, VOs in queueVODict.items():
            # Sorted so that the value written to the CS does not depend on set ordering
            changeSet.add((section, 'VO', '', ','.join(sorted(VOs))))

        if not changeSet:
            self.log.info("No changes found")
            return S_OK()

        changeList = sorted(changeSet)
        body = '\n'.join(["%s/%s %s -> %s" % entry for entry in changeList])
        if body and self.addressTo and self.addressFrom:
            notification = NotificationClient()
            result = notification.sendMail(self.addressTo, self.subject, body, self.addressFrom,
                                           localAttempt=False, avoidSpam=True)
            if not result['OK']:
                self.log.error('Can not send configuration change notification mail', result['Message'])

        if body:
            self.log.info('The following configuration changes were detected:')
            self.log.info(body)

        for section, option, value, new_value in changeList:
            # An option without a previous value is created rather than modified
            if value == 'Unknown' or not value:
                self.csAPI.setOption(cfgPath(section, option), new_value)
            else:
                self.csAPI.modifyValue(cfgPath(section, option), new_value)

        if self.dryRun:
            self.log.info("Dry Run: CS won't be updated")
            self.csAPI.showDiff()
            # Callers expect a result dictionary on every path
            return S_OK()

        result = self.csAPI.commit()
        if not result['OK']:
            self.log.error("Error while committing to CS", result['Message'])
        else:
            self.log.info("Successfully committed %d changes to CS" % len(changeList))
        return result
class CE2CSAgent(AgentModule):
    """Agent comparing the Site/CE/queue description in the CS with the BDII.

    On every cycle it reports CEs published in the BDII that are not listed
    under /Resources/Sites (nor in BannedCEs), and updates site, CE and queue
    options in the CS from the values found in the BDII.
    """

    addressTo = ""
    addressFrom = ""
    voName = ""
    subject = "CE2CSAgent"
    alternativeBDIIs = []

    def initialize(self):
        """Read the agent options and set up the CSAPI.

        :return: S_ERROR if no VO could be determined, otherwise the result of
                 CSAPI.initialize()
        """
        # TODO: Have no default and if no mail is found then use the diracAdmin group
        # and resolve all associated mail addresses.
        self.addressTo = self.am_getOption("MailTo", self.addressTo)
        self.addressFrom = self.am_getOption("MailFrom", self.addressFrom)

        # Create a list of alternative bdii urls. If an url is not followed by
        # a port number, the default 2170 is appended. A new list is built so
        # the list object handed out by am_getOption is not modified in place.
        self.alternativeBDIIs = [
            url if url.split(":")[-1].isdigit() else url + ":2170"
            for url in self.am_getOption("AlternativeBDIIs", [])
        ]

        if self.addressTo and self.addressFrom:
            self.log.info("MailTo", self.addressTo)
            self.log.info("MailFrom", self.addressFrom)
        if self.alternativeBDIIs:
            self.log.info("AlternativeBDII URLs:", self.alternativeBDIIs)
        self.subject = "CE2CSAgent"

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/TestManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "TestManager")

        self.voName = self.am_getOption("VirtualOrganization", [])
        if not self.voName:
            vo = getVO()
            if vo:
                self.voName = [vo]

        if self.voName:
            self.log.info("Agent will manage VO(s) %s" % self.voName)
        else:
            message = "VirtualOrganization option not defined for agent"
            self.log.fatal(message)
            return S_ERROR(message)

        self.csAPI = CSAPI()
        return self.csAPI.initialize()

    def execute(self):
        """Agent cycle: look for new CEs, then update the CS from the BDII.

        :return: the failed getProxyInfo result, otherwise S_OK()
        """
        self.log.info("Start Execution")
        result = getProxyInfo()
        if not result["OK"]:
            return result
        infoDict = result["Value"]
        self.log.info(formatProxyInfoAsString(infoDict))

        # Get a "fresh" copy of the CS data
        result = self.csAPI.downloadCSData()
        if not result["OK"]:
            self.log.warn("Could not download a fresh copy of the CS data", result["Message"])

        self.__lookForCE()
        self.__infoFromCE()
        self.log.info("End Execution")
        return S_OK()

    def __checkAlternativeBDIISite(self, fun, *args):
        """Call fun against each alternative BDII until one of them succeeds.

        :param fun: query function taking a ``host`` keyword (and a ``vo``
                    keyword when two positional arguments are given)
        :param args: one argument, or two arguments the second being the VO
        :return: the first successful result; otherwise the last failed result,
                 or an S_ERROR when no alternative BDII is configured. A result
                 dictionary is returned on every path so callers can test
                 ``result["OK"]`` unconditionally.
        """
        if len(args) not in (1, 2):
            return S_ERROR("Unsupported number of arguments for alternative BDII query: %d" % len(args))
        if not self.alternativeBDIIs:
            return S_ERROR("No alternative BDII sites configured")

        self.log.warn("Trying to use alternative BDII sites")
        result = S_ERROR("No alternative BDII site could be contacted")
        for site in self.alternativeBDIIs:
            self.log.info("Trying to contact alternative BDII", site)
            if len(args) == 1:
                result = fun(args[0], host=site)
            else:
                result = fun(args[0], vo=args[1], host=site)
            if result["OK"]:
                return result
            self.log.error("Problem contacting alternative BDII", result["Message"])

        self.log.warn("Also checking alternative BDII sites failed")
        return result

    def __writeOption(self, section, oldValue, newValue):
        """Create the option when its old value is "Unknown", modify it otherwise."""
        if oldValue == "Unknown":
            self.csAPI.setOption(section, newValue)
        else:
            self.csAPI.modifyValue(section, newValue)

    def __updateOption(self, section, oldValue, newValue):
        """Log the change "old -> new" for section, then write the new value."""
        self.log.info(section, " -> ".join((oldValue, newValue)))
        self.__writeOption(section, oldValue, newValue)

    def __lookForCE(self):
        """Report CEs published in the BDII that are neither in the CS nor banned.

        :return: a failed result if the site sections cannot be read or the
                 BDII cannot be queried for a VO, otherwise S_OK()
        """
        # A fresh set: fast membership tests, and the list returned by
        # am_getOption is left untouched.
        knownCEs = set(self.am_getOption("BannedCEs", []))

        result = gConfig.getSections("/Resources/Sites")
        if not result["OK"]:
            return result
        grids = result["Value"]

        for grid in grids:
            result = gConfig.getSections("/Resources/Sites/%s" % grid)
            if not result["OK"]:
                return result
            sites = result["Value"]

            for site in sites:
                result = gConfig.getOptionsDict("/Resources/Sites/%s/%s" % (grid, site))
                if not result["OK"]:
                    self.log.warn("Failed to get options of site %s" % site, result["Message"])
                    continue
                knownCEs.update(List.fromChar(result["Value"].get("CE", "")))

        for vo in self.voName:
            self.log.info("Check for available CEs for VO", vo)
            response = ldapCEState("", vo)
            if not response["OK"]:
                self.log.error("Error during BDII request", response["Message"])
                response = self.__checkAlternativeBDIISite(ldapCEState, "", vo)
                if not response["OK"]:
                    return response

            newCEs = set()
            for queue in response["Value"]:
                try:
                    queueName = queue["GlueCEUniqueID"]
                except Exception:  # malformed BDII entry, nothing to extract
                    continue

                ceName = queueName.split(":")[0]
                if ceName not in knownCEs:
                    newCEs.add(ceName)
                    self.log.debug("New CE", ceName)

            body = ""
            possibleNewSites = []
            # Sorted so that the generated report is deterministic
            for ce in sorted(newCEs):
                response = ldapCluster(ce)
                if not response["OK"]:
                    self.log.warn("Error during BDII request", response["Message"])
                    response = self.__checkAlternativeBDIISite(ldapCluster, ce)
                    if not response["OK"]:
                        continue
                clusters = response["Value"]
                if len(clusters) != 1:
                    self.log.warn("Error in cluster length", " CE %s Length %d" % (ce, len(clusters)))
                if len(clusters) == 0:
                    continue
                cluster = clusters[0]

                # The site name is taken from the GlueSiteUniqueID foreign key
                fkey = cluster.get("GlueForeignKey", [])
                if isinstance(fkey, str):
                    fkey = [fkey]
                nameBDII = None
                for entry in fkey:
                    if entry.count("GlueSiteUniqueID"):
                        nameBDII = entry.split("=")[1]
                        break
                if not nameBDII:
                    continue

                ceString = "CE: %s, GOCDB Name: %s" % (ce, nameBDII)
                self.log.info(ceString)

                response = ldapCE(ce)
                if not response["OK"]:
                    self.log.warn("Error during BDII request", response["Message"])
                    response = self.__checkAlternativeBDIISite(ldapCE, ce)
                    if not response["OK"]:
                        continue

                ceInfos = response["Value"]
                ceInfo = ceInfos[0] if len(ceInfos) else {}
                systemName = ceInfo.get("GlueHostOperatingSystemName", "Unknown")
                systemVersion = ceInfo.get("GlueHostOperatingSystemVersion", "Unknown")
                systemRelease = ceInfo.get("GlueHostOperatingSystemRelease", "Unknown")

                osString = "SystemName: %s, SystemVersion: %s, SystemRelease: %s" % (
                    systemName,
                    systemVersion,
                    systemRelease,
                )
                self.log.info(osString)

                response = ldapCEState(ce, vo)
                if not response["OK"]:
                    self.log.warn("Error during BDII request", response["Message"])
                    response = self.__checkAlternativeBDIISite(ldapCEState, ce, vo)
                    if not response["OK"]:
                        continue

                newCEString = "\n\n%s\n%s" % (ceString, osString)
                usefull = False
                for ceState in response["Value"]:
                    queueName = ceState.get("GlueCEUniqueID", "UnknownName")
                    queueStatus = ceState.get("GlueCEStateStatus", "UnknownStatus")

                    queueString = "%s %s" % (queueName, queueStatus)
                    self.log.info(queueString)
                    newCEString += "\n%s" % queueString
                    if queueStatus.count("Production"):
                        usefull = True

                # Only CEs with at least one queue in Production are reported
                if usefull:
                    body += newCEString
                    possibleNewSites.append("dirac-admin-add-site DIRACSiteName %s %s" % (nameBDII, ce))

            if body:
                body = "We are glad to inform You about new CE(s) possibly suitable for %s:\n" % vo + body
                body += "\n\nTo suppress information about CE add its name to BannedCEs list."
                for possibleNewSite in possibleNewSites:
                    body = "%s\n%s" % (body, possibleNewSite)
                self.log.info(body)
                if self.addressTo and self.addressFrom:
                    notification = NotificationClient()
                    result = notification.sendMail(
                        self.addressTo, self.subject, body, self.addressFrom, localAttempt=False
                    )
                    if not result["OK"]:
                        self.log.error("Can not send new CE notification mail", result["Message"])

        return S_OK()

    def __updateSiteInfo(self, siteSection, name, opt):
        """Update the Coordinates and Mail options of a site from the BDII.

        :param siteSection: CS path of the site section
        :param name: site name used for the BDII query (the "Name" option)
        :param opt: options dictionary of the site section
        :return: True if at least one option was changed
        """
        coor = opt.get("Coordinates", "Unknown")
        mail = opt.get("Mail", "Unknown")

        result = ldapSite(name)
        if not result["OK"]:
            self.log.warn("BDII site %s: %s" % (name, result["Message"]))
            result = self.__checkAlternativeBDIISite(ldapSite, name)
        if not result["OK"]:
            return False

        bdiiSites = result["Value"]
        if len(bdiiSites) == 0:
            self.log.warn(name, "Error in BDII: leng = 0")
            return False
        if len(bdiiSites) != 1:
            self.log.warn(name, "Warning in BDII: leng = %d" % len(bdiiSites))
        bdiiSite = bdiiSites[0]

        try:
            longitude = bdiiSite["GlueSiteLongitude"]
            latitude = bdiiSite["GlueSiteLatitude"]
            newcoor = "%s:%s" % (longitude, latitude)
        except Exception:
            self.log.warn("Error in BDII coordinates")
            newcoor = "Unknown"

        try:
            newmail = bdiiSite["GlueSiteSysAdminContact"].split(":")[-1].strip()
        except Exception:
            self.log.warn("Error in BDII mail")
            newmail = "Unknown"

        self.log.debug("%s %s %s" % (name, newcoor, newmail))

        changed = False
        # As for the CE options, a value that could not be read from the BDII
        # ("Unknown") never replaces what is stored in the CS.
        if newcoor != coor and newcoor != "Unknown":
            self.log.info("%s" % (name), "%s -> %s" % (coor, newcoor))
            self.__writeOption(cfgPath(siteSection, "Coordinates"), coor, newcoor)
            changed = True

        if newmail != mail and newmail != "Unknown":
            self.log.info("%s" % (name), "%s -> %s" % (mail, newmail))
            self.__writeOption(cfgPath(siteSection, "Mail"), mail, newmail)
            changed = True

        return changed

    def __infoFromCE(self):
        """Update site, CE and queue options in the CS from the BDII.

        :return: a failed result if the site sections cannot be read, the
                 result of CSAPI.commit() if anything changed, S_OK() otherwise
        """
        sitesSection = cfgPath("Resources", "Sites")
        result = gConfig.getSections(sitesSection)
        if not result["OK"]:
            return result
        grids = result["Value"]

        changed = False
        # Mail body; only operating system changes are reported by mail
        body = ""

        for grid in grids:
            gridSection = cfgPath(sitesSection, grid)
            result = gConfig.getSections(gridSection)
            if not result["OK"]:
                return result
            sites = result["Value"]

            for site in sites:
                siteSection = cfgPath(gridSection, site)
                result = gConfig.getOptionsDict(siteSection)
                if not result["OK"]:
                    self.log.warn("Failed to get options of %s" % siteSection, result["Message"])
                    continue
                opt = result["Value"]

                name = opt.get("Name", "")
                if name and self.__updateSiteInfo(siteSection, name, opt):
                    changed = True

                ceList = List.fromChar(opt.get("CE", ""))
                if not ceList:
                    self.log.warn(site, "Empty site list")
                    continue

                for ce in ceList:
                    ceSection = cfgPath(siteSection, "CEs", ce)
                    result = gConfig.getOptionsDict(ceSection)
                    if result["OK"]:
                        ceopt = result["Value"]
                    else:
                        # No CE section yet: every current value counts as "Unknown"
                        self.log.debug("Section CE", result["Message"])
                        ceopt = {}
                    wnTmpDir = ceopt.get("wnTmpDir", "Unknown")
                    arch = ceopt.get("architecture", "Unknown")
                    osName = ceopt.get("OS", "Unknown")
                    si00 = ceopt.get("SI00", "Unknown")
                    pilot = ceopt.get("Pilot", "Unknown")
                    ceType = ceopt.get("CEType", "Unknown")

                    result = ldapCE(ce)
                    if not result["OK"]:
                        self.log.warn("Error in BDII for %s" % ce, result["Message"])
                        result = self.__checkAlternativeBDIISite(ldapCE, ce)
                        if not result["OK"]:
                            continue
                    try:
                        bdiiCE = result["Value"][0]
                    except Exception:
                        self.log.warn("Error in BDII for %s" % ce, result)
                        bdiiCE = None

                    if bdiiCE:
                        newWNTmpDir = bdiiCE.get("GlueSubClusterWNTmpDir", "Unknown")
                        if wnTmpDir != newWNTmpDir and newWNTmpDir != "Unknown":
                            self.__updateOption(cfgPath(ceSection, "wnTmpDir"), wnTmpDir, newWNTmpDir)
                            changed = True

                        newArch = bdiiCE.get("GlueHostArchitecturePlatformType", "Unknown")
                        if arch != newArch and newArch != "Unknown":
                            self.__updateOption(cfgPath(ceSection, "architecture"), arch, newArch)
                            changed = True

                        try:
                            newOS = "_".join(
                                (
                                    bdiiCE["GlueHostOperatingSystemName"],
                                    bdiiCE["GlueHostOperatingSystemVersion"],
                                    bdiiCE["GlueHostOperatingSystemRelease"],
                                )
                            )
                        except Exception:
                            newOS = "Unknown"
                        if osName != newOS and newOS != "Unknown":
                            self.__updateOption(cfgPath(ceSection, "OS"), osName, newOS)
                            changed = True
                            body = body + "OS was changed %s -> %s for %s at %s\n" % (osName, newOS, ce, site)

                        newSI00 = bdiiCE.get("GlueHostBenchmarkSI00", "Unknown")
                        if si00 != newSI00 and newSI00 != "Unknown":
                            self.__updateOption(cfgPath(ceSection, "SI00"), si00, newSI00)
                            changed = True

                        # The value computed for the last VO in self.voName wins
                        newPilot = "Unknown"
                        try:
                            rte = bdiiCE["GlueHostApplicationSoftwareRunTimeEnvironment"]
                            for vo in self.voName:
                                if vo.lower() == "lhcb":
                                    if "VO-lhcb-pilot" in rte:
                                        newPilot = "True"
                                    else:
                                        newPilot = "False"
                                else:
                                    newPilot = "Unknown"
                        except Exception:
                            newPilot = "Unknown"
                        if pilot != newPilot and newPilot != "Unknown":
                            self.__updateOption(cfgPath(ceSection, "Pilot"), pilot, newPilot)
                            changed = True

                    # newVO carries the VO list built in the previous queue
                    # iteration, which may not have reached the CS yet.
                    newVO = ""
                    for vo in self.voName:
                        result = ldapCEState(ce, vo)  # getBDIICEVOView
                        if not result["OK"]:
                            self.log.warn("Error in BDII for queue %s" % ce, result["Message"])
                            result = self.__checkAlternativeBDIISite(ldapCEState, ce, vo)
                            if not result["OK"]:
                                continue
                        queues = result.get("Value")
                        if queues is None:
                            self.log.warn(
                                "Error in BDII for queue %s" % ce, result.get("Message", "No value returned")
                            )
                            continue

                        # The CE type is the implementation name of the first
                        # queue that publishes one; disagreeing queues are
                        # only reported.
                        newCEType = "Unknown"
                        for queue in queues:
                            queueType = queue.get("GlueCEImplementationName", "Unknown")
                            if newCEType == "Unknown":
                                newCEType = queueType
                            elif queueType != newCEType:
                                self.log.warn(
                                    "Error in BDII for CE %s " % ce,
                                    "different CE types %s %s" % (newCEType, queueType),
                                )

                        if newCEType == "ARC-CE":
                            newCEType = "ARC"

                        if ceType != newCEType and newCEType != "Unknown":
                            self.__updateOption(cfgPath(ceSection, "CEType"), ceType, newCEType)
                            changed = True

                        for queue in queues:
                            try:
                                queueName = queue["GlueCEUniqueID"].split("/")[-1]
                            except Exception:
                                self.log.warn("Error in queueName ", queue)
                                continue

                            newMaxCPUTime = queue.get("GlueCEPolicyMaxCPUTime")

                            newSI00 = None
                            try:
                                caps = queue["GlueCECapability"]
                                if isinstance(caps, str):
                                    caps = [caps]
                                for cap in caps:
                                    if cap.count("CPUScalingReferenceSI00"):
                                        newSI00 = cap.split("=")[-1]
                            except Exception:
                                newSI00 = None

                            queueSection = cfgPath(ceSection, "Queues", queueName)
                            result = gConfig.getOptionsDict(queueSection)
                            if not result["OK"]:
                                self.log.warn("Section Queues", result["Message"])
                                maxCPUTime = "Unknown"
                                si00 = "Unknown"
                                allowedVOs = [""]
                            else:
                                queueOpt = result["Value"]
                                maxCPUTime = queueOpt.get("maxCPUTime", "Unknown")
                                si00 = queueOpt.get("SI00", "Unknown")
                                if newVO == "":  # Remember previous iteration, if none - read from conf
                                    allowedVOs = queueOpt.get("VO", "").split(",")
                                else:  # Else use newVO, as it can contain changes, which aren't in conf yet
                                    allowedVOs = newVO.split(",")

                            if newMaxCPUTime and (maxCPUTime != newMaxCPUTime):
                                self.__updateOption(cfgPath(queueSection, "maxCPUTime"), maxCPUTime, newMaxCPUTime)
                                changed = True

                            if newSI00 and (si00 != newSI00):
                                self.__updateOption(cfgPath(queueSection, "SI00"), si00, newSI00)
                                changed = True

                            modifyVO = True  # Flag saying if we need VO option to change
                            newVO = ""
                            if allowedVOs != [""]:
                                for allowedVO in allowedVOs:
                                    allowedVO = allowedVO.strip()  # Get rid of spaces
                                    newVO += allowedVO
                                    if allowedVO == vo:  # Current VO has been already in list
                                        newVO = ""
                                        modifyVO = False  # Don't change anything
                                        break  # Skip next 'if', proceed to next VO
                                    newVO += ", "

                            if modifyVO:
                                section = cfgPath(queueSection, "VO")
                                newVO += vo
                                self.log.info(section, " -> ".join(("%s" % allowedVOs, newVO)))
                                if allowedVOs == [""]:
                                    self.csAPI.setOption(section, newVO)
                                else:
                                    self.csAPI.modifyValue(section, newVO)
                                changed = True

        if not changed:
            self.log.info("No changes found")
            return S_OK()

        self.log.info(body)
        if body and self.addressTo and self.addressFrom:
            notification = NotificationClient()
            result = notification.sendMail(self.addressTo, self.subject, body, self.addressFrom, localAttempt=False)
            if not result["OK"]:
                self.log.error("Can not send CS update notification mail", result["Message"])

        return self.csAPI.commit()