class DiracAdmin(API): """ Administrative functionalities """ ############################################################################# def __init__(self): """Internal initialization of the DIRAC Admin API. """ super(DiracAdmin, self).__init__() self.csAPI = CSAPI() self.dbg = False if gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG': self.dbg = True self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp') self.currentDir = os.getcwd() self.rssFlag = ResourceStatus().rssFlag self.sitestatus = SiteStatus() ############################################################################# def uploadProxy(self, group): """Upload a proxy to the DIRAC WMS. This method Example usage: >>> print diracAdmin.uploadProxy('lhcb_pilot') {'OK': True, 'Value': 0L} :param group: DIRAC Group :type job: string :return: S_OK,S_ERROR :param permanent: Indefinitely update proxy :type permanent: boolean """ return gProxyManager.uploadProxy(diracGroup=group) ############################################################################# def setProxyPersistency(self, userDN, userGroup, persistent=True): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True ) {'OK': True } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param persistent: Persistent flag :type persistent: boolean :return: S_OK,S_ERROR """ return gProxyManager.setPersistency(userDN, userGroup, persistent) ############################################################################# def checkProxyUploaded(self, userDN, userGroup, requiredTime): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True ) {'OK': True, 'Value' : True/False } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param requiredTime: Required life time of the uploaded proxy :type requiredTime: boolean :return: S_OK,S_ERROR """ return gProxyManager.userHasProxy(userDN, userGroup, requiredTime) ############################################################################# def getSiteMask(self, printOutput=False, status='Active'): """Retrieve current site mask from WMS Administrator service. Example usage: >>> print diracAdmin.getSiteMask() {'OK': True, 'Value': 0L} :return: S_OK,S_ERROR """ result = self.sitestatus.getSites(siteState=status) if result['OK']: sites = result['Value'] if printOutput: sites.sort() for site in sites: print site return result ############################################################################# def getBannedSites(self, gridType=[], printOutput=False): """Retrieve current list of banned and probing sites. Example usage: >>> print diracAdmin.getBannedSites() {'OK': True, 'Value': []} :return: S_OK,S_ERROR """ bannedSites = self.sitestatus.getSites(siteState='Banned') if not bannedSites['OK']: return bannedSites probingSites = self.sitestatus.getSites(siteState='Probing') if not probingSites['OK']: return probingSites mergedList = bannedSites['Value'] + probingSites['Value'] mergedList.sort() if printOutput: print '\n'.join(mergedList) return S_OK(mergedList) ############################################################################# def getSiteSection(self, site, printOutput=False): """Simple utility to get the list of CEs for DIRAC site name. Example usage: >>> print diracAdmin.getSiteSection('LCG.CERN.ch') {'OK': True, 'Value':} :return: S_OK,S_ERROR """ gridType = site.split('.')[0] if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']: return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType)) result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site)) if printOutput and result['OK']: print self.pPrint.pformat(result['Value']) return result ############################################################################# def allowSite(self, site, comment, printOutput=False): """Adds the site to the site mask. Example usage: >>> print diracAdmin.allowSite() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result result = self.getSiteMask(status='Active') if not result['OK']: return result siteMask = result['Value'] if site in siteMask: if printOutput: print 'Site %s is already Active' % site return S_OK('Site %s is already Active' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Active', comment) else: wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.allowSite(site, comment) if not result['OK']: return result if printOutput: print 'Site %s status is set to Active' % site return result ############################################################################# def getSiteMaskLogging(self, site=None, printOutput=False): """Retrieves site mask logging information. Example usage: >>> print diracAdmin.getSiteMaskLogging('LCG.AUVER.fr') {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getSiteMaskLogging(site) if not result['OK']: return result if site: if not result['Value'].has_key(site): return S_ERROR('Site mask information not available for %s' % (site)) if printOutput: if site: print '\nSite Mask Logging Info for %s\n' % site else: print '\nAll Site Mask Logging Info\n' siteDict = result['Value'] for site, tupleList in siteDict.iteritems(): if not site: print '\n===> %s\n' % site for tup in tupleList: print str( tup[0] ).ljust( 8 ) + str( tup[1] ).ljust( 20 ) + \ '( ' + str( tup[2] ).ljust( len( str( tup[2] ) ) ) + ' ) "' + str( tup[3] ) + '"' print ' ' return result ############################################################################# def banSite(self, site, comment, printOutput=False): """Removes the site from the site mask. Example usage: >>> print diracAdmin.banSite() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result mask = self.getSiteMask(status='Banned') if not mask['OK']: return mask siteMask = mask['Value'] if site in siteMask: if printOutput: print 'Site %s is already Banned' % site return S_OK('Site %s is already Banned' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Banned', comment) else: wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.banSite(site, comment) if not result['OK']: return result if printOutput: print 'Site %s status is set to Banned' % site return result ############################################################################# def __checkSiteIsValid(self, site): """Internal function to check that a site name is valid. """ sites = getSiteCEMapping() if not sites['OK']: return S_ERROR('Could not get site CE mapping') siteList = sites['Value'].keys() if not site in siteList: return S_ERROR( 'Specified site %s is not in list of defined sites' % site) return S_OK('%s is valid' % site) ############################################################################# def clearMask(self): """Removes all sites from the site mask. Should be used with care. Example usage: >>> print diracAdmin.clearMask() {'OK': True, 'Value':''} :return: S_OK,S_ERROR """ wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.clearMask() return result ############################################################################# def getServicePorts(self, setup='', printOutput=False): """Checks the service ports for the specified setup. If not given this is taken from the current installation (/DIRAC/Setup) Example usage: >>> print diracAdmin.getServicePorts() {'OK': True, 'Value':''} :return: S_OK,S_ERROR """ if not setup: setup = gConfig.getValue('/DIRAC/Setup', '') setupList = gConfig.getSections('/DIRAC/Setups', []) if not setupList['OK']: return S_ERROR('Could not get /DIRAC/Setups sections') setupList = setupList['Value'] if not setup in setupList: return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList))) serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup) if not serviceSetups['OK']: return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup) serviceSetups = serviceSetups['Value'] # dict systemList = gConfig.getSections('/Systems') if not systemList['OK']: return S_ERROR('Could not get Systems sections') systemList = systemList['Value'] result = {} for system in systemList: if serviceSetups.has_key(system): path = '/Systems/%s/%s/Services' % (system, serviceSetups[system]) servicesList = gConfig.getSections(path) if not servicesList['OK']: self.log.warn('Could not get sections in %s' % path) else: servicesList = servicesList['Value'] if not servicesList: servicesList = [] self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList))) for service in servicesList: spath = '%s/%s/Port' % (path, service) servicePort = gConfig.getValue(spath, 0) if servicePort: self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort)) result['%s/%s' % (system, service)] = servicePort else: self.log.warn('No port found for %s' % spath) else: self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup)) if printOutput: print self.pPrint.pformat(result) return S_OK(result) ############################################################################# def getProxy(self, userDN, userGroup, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and stores this in a file in the local directory by default. Example usage: >>> print diracAdmin.getProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadProxy(userDN, userGroup, limited=limited, requiredTimeLeft=validity) ############################################################################# def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and VOMS extensions and stores this in a file in the local directory by default. Example usage: >>> print diracAdmin.getVOMSProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited, requiredVOMSAttribute=vomsAttr, requiredTimeLeft=validity) ############################################################################# def getPilotProxy(self, userDN, userGroup, validity=43200): """Retrieves a pilot proxy with default 12hr validity and stores this in a file in the local directory by default. Example usage: >>> print diracAdmin.getVOMSProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.getPilotProxyFromDIRACGroup( userDN, userGroup, requiredTimeLeft=validity) ############################################################################# def resetJob(self, jobID): """Reset a job or list of jobs in the WMS. This operation resets the reschedule counter for a job or list of jobs and allows them to run as new. Example:: >>> print dirac.reset(12345) {'OK': True, 'Value': [12345]} :param job: JobID :type job: integer or list of integers :return: S_OK,S_ERROR """ if isinstance(jobID, basestring): try: jobID = int(jobID) except Exception as x: return self._errorReport( str(x), 'Expected integer or convertible integer for existing jobID' ) elif isinstance(jobID, list): try: jobID = [int(job) for job in jobID] except Exception as x: return self._errorReport( str(x), 'Expected integer or convertible integer for existing jobIDs' ) jobManager = RPCClient('WorkloadManagement/JobManager', useCertificates=False) result = jobManager.resetJob(jobID) return result ############################################################################# def getJobPilotOutput(self, jobID, directory=''): """Retrieve the pilot output for an existing job in the WMS. The output will be retrieved in a local directory unless otherwise specified. >>> print dirac.getJobPilotOutput(12345) {'OK': True, StdOut:'',StdError:''} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getJobPilotOutput(jobID) if not result['OK']: return result outputPath = '%s/pilot_%s' % (directory, jobID) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if outputs.has_key('StdOut'): stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.verbose('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if outputs.has_key('StdError'): stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdError']) self.log.verbose('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotOutput(self, gridReference, directory=''): """Retrieve the pilot output (std.out and std.err) for an existing job in the WMS. >>> print dirac.getJobPilotOutput(12345) {'OK': True, 'Value': {}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotOutput(gridReference) if not result['OK']: return result gridReferenceSmall = gridReference.split('/')[-1] if not gridReferenceSmall: gridReferenceSmall = 'reference' outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if outputs.has_key('StdOut'): stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.info('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if outputs.has_key('StdErr'): stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdErr']) self.log.info('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotInfo(self, gridReference): """Retrieve info relative to a pilot reference >>> print dirac.getPilotInfo(12345) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :type gridReference: string :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotInfo(gridReference) return result ############################################################################# def killPilot(self, gridReference): """Kill the pilot specified >>> print dirac.getPilotInfo(12345) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.killPilot(gridReference) return result ############################################################################# def getPilotLoggingInfo(self, gridReference): """Retrieve the pilot logging info for an existing job in the WMS. >>> print dirac.getPilotLoggingInfo(12345) {'OK': True, 'Value': {"The output of the command"}} :param gridReference: Gridp pilot job reference Id :type gridReference: string :return: S_OK,S_ERROR """ if type(gridReference) not in types.StringTypes: return self._errorReport('Expected string for pilot reference') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') return wmsAdmin.getPilotLoggingInfo(gridReference) ############################################################################# def getJobPilots(self, jobID): """Extract the list of submitted pilots and their status for a given jobID from the WMS. Useful information is printed to the screen. >>> print dirac.getJobPilots() {'OK': True, 'Value': {PilotID:{StatusDict}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if isinstance(jobID, basestring): try: jobID = int(jobID) except Exception as x: return self._errorReport( str(x), 'Expected integer or string for existing jobID') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilots(jobID) if result['OK']: print self.pPrint.pformat(result['Value']) return result ############################################################################# def getPilotSummary(self, startDate='', endDate=''): """Retrieve the pilot output for an existing job in the WMS. Summary is printed at INFO level, full dictionary of results also returned. >>> print dirac.getPilotSummary() {'OK': True, 'Value': {CE:{Status:Count}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotSummary(startDate, endDate) if not result['OK']: return result ceDict = result['Value'] headers = 'CE'.ljust(28) i = 0 for ce, summary in ceDict.iteritems(): states = summary.keys() if len(states) > i: i = len(states) for i in xrange(i): headers += 'Status'.ljust(12) + 'Count'.ljust(12) print headers for ce, summary in ceDict.iteritems(): line = ce.ljust(28) states = summary.keys() states.sort() for state in states: count = str(summary[state]) line += state.ljust(12) + count.ljust(12) print line return result ############################################################################# def selectRequests(self, jobID=None, requestID=None, requestName=None, requestType=None, status=None, operation=None, ownerDN=None, ownerGroup=None, requestStart=0, limit=100, printOutput=False): """Select requests from the request management system. A few notes on the selection criteria: - jobID is the WMS JobID for the request (if applicable) - requestID is assigned during submission of the request - requestName is the corresponding XML file name - requestType e.g. 'transfer' - status e.g. Done - operation e.g. replicateAndRegister - requestStart e.g. the first request to consider (start from 0 by default) - limit e.g. selection limit (default 100) >>> dirac.selectRequests(jobID='4894') {'OK': True, 'Value': [[<Requests>]]} """ options = { 'RequestID': requestID, 'RequestName': requestName, 'JobID': jobID, 'OwnerDN': ownerDN, 'OwnerGroup': ownerGroup, 'RequestType': requestType, 'Status': status, 'Operation': operation } conditions = {} for key, value in options.iteritems(): if value: try: conditions[key] = str(value) except Exception as x: return self._errorReport( str(x), 'Expected string for %s field' % key) try: requestStart = int(requestStart) limit = int(limit) except Exception as x: return self._errorReport(str(x), 'Expected integer for %s field' % limit) self.log.verbose('Will select requests with the following conditions') self.log.verbose(self.pPrint.pformat(conditions)) requestClient = RPCClient("RequestManagement/centralURL") result = requestClient.getRequestSummaryWeb(conditions, [], requestStart, limit) if not result['OK']: self.log.warn(result['Message']) return result requestIDs = result['Value'] conds = [] for key, value in conditions.iteritems(): if value: conds.append('%s = %s' % (key, value)) self.log.verbose( '%s request(s) selected with conditions %s and limit %s' % (len(requestIDs['Records']), ', '.join(conds), limit)) if printOutput: requests = [] if len(requestIDs['Records']) > limit: requestList = requestIDs['Records'] requests = requestList[:limit] else: requests = requestIDs['Records'] print '%s request(s) selected with conditions %s and limit %s' % ( len(requestIDs['Records']), ', '.join(conds), limit) print requestIDs['ParameterNames'] for request in requests: print request if not requestIDs: return S_ERROR('No requests selected for conditions: %s' % conditions) else: return result ############################################################################# def getRequestSummary(self, printOutput=False): """ Get a summary of the requests in the request DB. """ requestClient = RPCClient("RequestManagement/centralURL", timeout=120) result = requestClient.getDBSummary() if not result['OK']: self.log.warn(result['Message']) return result if printOutput: print self.pPrint.pformat(result['Value']) return result ############################################################################# def getExternalPackageVersions(self): """ Simple function that attempts to obtain the external versions for the local DIRAC installation (frequently needed for debugging purposes). """ gLogger.info( 'DIRAC version v%dr%d build %d' % (DIRAC.majorVersion, DIRAC.minorVersion, DIRAC.patchLevel)) try: import lcg_util infoStr = 'Using lcg_util from: \n%s' % lcg_util.__file__ gLogger.info(infoStr) infoStr = "The version of lcg_utils is %s" % lcg_util.lcg_util_version( ) gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import lcg_util: %s" % ( x) gLogger.exception(errStr) try: import gfalthr as gfal infoStr = "Using gfalthr from: \n%s" % gfal.__file__ gLogger.info(infoStr) infoStr = "The version of gfalthr is %s" % gfal.gfal_version() gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import gfalthr: %s." % ( x) gLogger.warn(errStr) try: import gfal infoStr = "Using gfal from: %s" % gfal.__file__ gLogger.info(infoStr) infoStr = "The version of gfal is %s" % gfal.gfal_version() gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import gfal: %s" % ( x) gLogger.exception(errStr) defaultProtocols = gConfig.getValue( '/Resources/StorageElements/DefaultProtocols', []) gLogger.info('Default list of protocols are: %s' % (', '.join(defaultProtocols))) return S_OK() ############################################################################# def getSiteProtocols(self, site, printOutput=False): """ Allows to check the defined protocols for each site SE. """ result = self.__checkSiteIsValid(site) if not result['OK']: return result siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site) siteSEs = gConfig.getValue(siteSection, []) if not siteSEs: return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection)) defaultProtocols = gConfig.getValue( '/Resources/StorageElements/DefaultProtocols', []) self.log.verbose('Default list of protocols are' ', '.join(defaultProtocols)) seInfo = {} siteSEs.sort() for se in siteSEs: sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se)) if not sections['OK']: return sections for section in sections['Value']: if gConfig.getValue( '/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2': path = '/Resources/StorageElements/%s/%s/ProtocolsList' % ( se, section) seProtocols = gConfig.getValue(path, []) if not seProtocols: seProtocols = defaultProtocols seInfo[se] = seProtocols if printOutput: print '\nSummary of protocols for StorageElements at site %s' % site print '\nStorageElement'.ljust(30) + 'ProtocolsList'.ljust( 30) + '\n' for se, protocols in seInfo.iteritems(): print se.ljust(30) + ', '.join(protocols).ljust(30) return S_OK(seInfo) ############################################################################# def setSiteProtocols(self, site, protocolsList, printOutput=False): """ Allows to set the defined protocols for each SE for a given site. """ result = self.__checkSiteIsValid(site) if not result['OK']: return result siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site) siteSEs = gConfig.getValue(siteSection, []) if not siteSEs: return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection)) defaultProtocols = gConfig.getValue( '/Resources/StorageElements/DefaultProtocols', []) self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols)) for protocol in protocolsList: if not protocol in defaultProtocols: return S_ERROR( 'Requested to set protocol %s in list but %s is not ' 'in default list of protocols:\n%s' % (protocol, protocol, ', '.join(defaultProtocols))) modifiedCS = False result = promptUser( 'Do you want to add the following default protocols:' ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs))) if not result['OK']: return result if result['Value'].lower() != 'y': self.log.always('No protocols will be added') return S_OK() for se in siteSEs: sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se)) if not sections['OK']: return sections for section in sections['Value']: if gConfig.getValue( '/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2': path = '/Resources/StorageElements/%s/%s/ProtocolsList' % ( se, section) self.log.verbose('Setting %s to %s' % (path, ', '.join(protocolsList))) result = self.csSetOption(path, ', '.join(protocolsList)) if not result['OK']: return result modifiedCS = True if modifiedCS: result = self.csCommitChanges(False) if not result['OK']: return S_ERROR('CS Commit failed with message = %s' % (result['Message'])) else: if printOutput: print 'Successfully committed changes to CS' else: if printOutput: print 'No modifications to CS required' return S_OK() ############################################################################# def csSetOption(self, optionPath, optionValue): """ Function to modify an existing value in the CS. """ return self.csAPI.setOption(optionPath, optionValue) ############################################################################# def csSetOptionComment(self, optionPath, comment): """ Function to modify an existing value in the CS. """ return self.csAPI.setOptionComment(optionPath, comment) ############################################################################# def csModifyValue(self, optionPath, newValue): """ Function to modify an existing value in the CS. """ return self.csAPI.modifyValue(optionPath, newValue) ############################################################################# def csRegisterUser(self, username, properties): """ Registers a user in the CS. - username: Username of the user (easy;) - properties: Dict containing: - DN - groups : list/tuple of groups the user belongs to - <others> : More properties of the user, like mail """ return self.csAPI.addUser(username, properties) ############################################################################# def csDeleteUser(self, user): """ Deletes a user from the CS. Can take a list of users """ return self.csAPI.deleteUsers(user) ############################################################################# def csModifyUser(self, username, properties, createIfNonExistant=False): """ Modify a user in the CS. Takes the same params as in addUser and applies the changes """ return self.csAPI.modifyUser(username, properties, createIfNonExistant) ############################################################################# def csListUsers(self, group=False): """ Lists the users in the CS. If no group is specified return all users. """ return self.csAPI.listUsers(group) ############################################################################# def csDescribeUsers(self, mask=False): """ List users and their properties in the CS. If a mask is given, only users in the mask will be returned """ return self.csAPI.describeUsers(mask) ############################################################################# def csModifyGroup(self, groupname, properties, createIfNonExistant=False): """ Modify a user in the CS. Takes the same params as in addGroup and applies the changes """ return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant) ############################################################################# def csListHosts(self): """ Lists the hosts in the CS """ return self.csAPI.listHosts() ############################################################################# def csDescribeHosts(self, mask=False): """ Gets extended info for the hosts in the CS """ return self.csAPI.describeHosts(mask) ############################################################################# def csModifyHost(self, hostname, properties, createIfNonExistant=False): """ Modify a host in the CS. Takes the same params as in addHost and applies the changes """ return self.csAPI.modifyHost(hostname, properties, createIfNonExistant) ############################################################################# def csListGroups(self): """ Lists groups in the CS """ return self.csAPI.listGroups() ############################################################################# def csDescribeGroups(self, mask=False): """ List groups and their properties in the CS. If a mask is given, only groups in the mask will be returned """ return self.csAPI.describeGroups(mask) ############################################################################# def csSyncUsersWithCFG(self, usersCFG): """ Synchronize users in cfg with its contents """ return self.csAPI.syncUsersWithCFG(usersCFG) ############################################################################# def csCommitChanges(self, sortUsers=True): """ Commit the changes in the CS """ return self.csAPI.commitChanges(sortUsers=False) ############################################################################# def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False): """ Send mail to specified address with body. """ notification = NotificationClient() return notification.sendMail(address, subject, body, fromAddress, localAttempt, html) ############################################################################# def sendSMS(self, userName, body, fromAddress=None): """ Send mail to specified address with body. """ if len(body) > 160: return S_ERROR('Exceeded maximum SMS length of 160 characters') notification = NotificationClient() return notification.sendSMS(userName, body, fromAddress) ############################################################################# def getBDIISite(self, site, host=None): """ Get information about site from BDII at host """ return ldapSite(site, host=host) ############################################################################# def getBDIICluster(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapCluster(ce, host=host) ############################################################################# def getBDIICE(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapCE(ce, host=host) ############################################################################# def getBDIIService(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapService(ce, host=host) ############################################################################# def getBDIICEState(self, ce, useVO=voName, host=None): """ Get information about ce state from BDII at host """ return ldapCEState(ce, useVO, host=host) ############################################################################# def getBDIICEVOView(self, ce, useVO=voName, host=None): """ Get information about ce voview from BDII at host """ return ldapCEVOView(ce, useVO, host=host) ############################################################################# def getBDIISE(self, site, useVO=voName, host=None): """ Get information about SA from BDII at host """ return ldapSE(site, useVO, host=host)
class SiteInspectorAgent(AgentModule): """ SiteInspectorAgent The SiteInspectorAgent agent is an agent that is used to get the all the site names and trigger PEP to evaluate their status. """ # Max number of worker threads by default __maxNumberOfThreads = 15 # Inspection freqs, defaults, the lower, the higher priority to be checked. # Error state usually means there is a glitch somewhere, so it has the highest # priority. __checkingFreqs = { 'Active': 20, 'Degraded': 20, 'Probing': 20, 'Banned': 15, 'Unknown': 10, 'Error': 5 } def __init__(self, *args, **kwargs): AgentModule.__init__(self, *args, **kwargs) # ElementType, to be defined among Site, Resource or Node self.sitesToBeChecked = None self.threadPool = None self.siteClient = None self.clients = {} def initialize(self): """ Standard initialize. """ maxNumberOfThreads = self.am_getOption('maxNumberOfThreads', self.__maxNumberOfThreads) self.threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads) self.siteClient = SiteStatus() self.clients['SiteStatus'] = self.siteClient self.clients['ResourceManagementClient'] = ResourceManagementClient() return S_OK() def execute(self): """ execute This is the main method of the agent. It gets the sites from the Database, calculates how many threads should be started and spawns them. Each thread will get a site from the queue until it is empty. At the end, the method will join the queue such that the agent will not terminate a cycle until all sites have been processed. """ # Gets sites to be checked ( returns a Queue ) sitesToBeChecked = self.getSitesToBeChecked() if not sitesToBeChecked['OK']: self.log.error(sitesToBeChecked['Message']) return sitesToBeChecked self.sitesToBeChecked = sitesToBeChecked['Value'] queueSize = self.sitesToBeChecked.qsize() pollingTime = self.am_getPollingTime() # Assigns number of threads on the fly such that we exhaust the PollingTime # without having to spawn too many threads. We assume 10 seconds per element # to be processed ( actually, it takes something like 1 sec per element ): # numberOfThreads = elements * 10(s/element) / pollingTime numberOfThreads = int(math.ceil(queueSize * 10. / pollingTime)) self.log.info('Needed %d threads to process %d elements' % (numberOfThreads, queueSize)) for _x in xrange(numberOfThreads): jobUp = self.threadPool.generateJobAndQueueIt(self._execute) if not jobUp['OK']: self.log.error(jobUp['Message']) self.log.info('blocking until all sites have been processed') # block until all tasks are done self.sitesToBeChecked.join() self.log.info('done') return S_OK() def getSitesToBeChecked(self): """ getElementsToBeChecked This method gets all the site names from the SiteStatus table, after that it get the details of each site (status, name, etc..) and adds them to a queue. """ toBeChecked = Queue.Queue() res = self.siteClient.getSites('All') if not res['OK']: return res # get the current status res = self.siteClient.getSiteStatuses(res['Value']) if not res['OK']: return res # filter elements for site in res['Value']: status = res['Value'].get(site, 'Unknown') toBeChecked.put({ 'status': status, 'name': site, 'site': site, 'element': 'Site', 'statusType': 'all', 'elementType': 'Site' }) return S_OK(toBeChecked) # Private methods ............................................................ def _execute(self): """ Method run by each of the thread that is in the ThreadPool. It enters a loop until there are no sites on the queue. On each iteration, it evaluates the policies for such site and enforces the necessary actions. If there are no more sites in the queue, the loop is finished. """ pep = PEP(clients=self.clients) while True: try: site = self.sitesToBeChecked.get_nowait() except Queue.Empty: return S_OK() resEnforce = pep.enforce(site) if not resEnforce['OK']: self.log.error('Failed policy enforcement', resEnforce['Message']) self.sitesToBeChecked.task_done() continue # Used together with join ! self.sitesToBeChecked.task_done()
class DiracAdmin(API): """ Administrative functionalities """ ############################################################################# def __init__(self): """Internal initialization of the DIRAC Admin API. """ super(DiracAdmin, self).__init__() self.csAPI = CSAPI() self.dbg = False if gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG': self.dbg = True self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp') self.currentDir = os.getcwd() self.rssFlag = ResourceStatus().rssFlag self.sitestatus = SiteStatus() self._siteSet = set(getSites().get('Value', [])) ############################################################################# def uploadProxy(self): """Upload a proxy to the DIRAC WMS. This method Example usage: >>> print diracAdmin.uploadProxy('dteam_pilot') {'OK': True, 'Value': 0L} :return: S_OK,S_ERROR :param permanent: Indefinitely update proxy :type permanent: boolean """ return gProxyManager.uploadProxy() ############################################################################# def setProxyPersistency(self, userDN, userGroup, persistent=True): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> gLogger.notice(diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )) {'OK': True } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param persistent: Persistent flag :type persistent: boolean :return: S_OK,S_ERROR """ return gProxyManager.setPersistency(userDN, userGroup, persistent) ############################################################################# def checkProxyUploaded(self, userDN, userGroup, requiredTime): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> gLogger.notice(diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )) {'OK': True, 'Value' : True/False } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param requiredTime: Required life time of the uploaded proxy :type requiredTime: boolean :return: S_OK,S_ERROR """ return gProxyManager.userHasProxy(userDN, userGroup, requiredTime) ############################################################################# def getSiteMask(self, printOutput=False, status='Active'): """Retrieve current site mask from WMS Administrator service. Example usage: >>> gLogger.notice(diracAdmin.getSiteMask()) {'OK': True, 'Value': 0L} :return: S_OK,S_ERROR """ result = self.sitestatus.getSites(siteState=status) if result['OK']: sites = result['Value'] if printOutput: sites.sort() for site in sites: gLogger.notice(site) return result ############################################################################# def getBannedSites(self, printOutput=False): """Retrieve current list of banned and probing sites. Example usage: >>> gLogger.notice(diracAdmin.getBannedSites()) {'OK': True, 'Value': []} :return: S_OK,S_ERROR """ bannedSites = self.sitestatus.getSites(siteState='Banned') if not bannedSites['OK']: return bannedSites probingSites = self.sitestatus.getSites(siteState='Probing') if not probingSites['OK']: return probingSites mergedList = sorted(bannedSites['Value'] + probingSites['Value']) if printOutput: gLogger.notice('\n'.join(mergedList)) return S_OK(mergedList) ############################################################################# def getSiteSection(self, site, printOutput=False): """Simple utility to get the list of CEs for DIRAC site name. Example usage: >>> gLogger.notice(diracAdmin.getSiteSection('LCG.CERN.ch')) {'OK': True, 'Value':} :return: S_OK,S_ERROR """ gridType = site.split('.')[0] if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']: return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType)) result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site)) if printOutput and result['OK']: gLogger.notice(self.pPrint.pformat(result['Value'])) return result ############################################################################# def allowSite(self, site, comment, printOutput=False): """Adds the site to the site mask. Example usage: >>> gLogger.notice(diracAdmin.allowSite()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result result = self.getSiteMask(status='Active') if not result['OK']: return result siteMask = result['Value'] if site in siteMask: if printOutput: gLogger.notice('Site %s is already Active' % site) return S_OK('Site %s is already Active' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Active', comment) else: result = WMSAdministratorClient().allowSite(site, comment) if not result['OK']: return result if printOutput: gLogger.notice('Site %s status is set to Active' % site) return result ############################################################################# def getSiteMaskLogging(self, site=None, printOutput=False): """Retrieves site mask logging information. Example usage: >>> gLogger.notice(diracAdmin.getSiteMaskLogging('LCG.AUVER.fr')) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result if self.rssFlag: result = ResourceStatusClient().selectStatusElement('Site', 'History', name=site) else: result = WMSAdministratorClient().getSiteMaskLogging(site) if not result['OK']: return result if printOutput: if site: gLogger.notice('\nSite Mask Logging Info for %s\n' % site) else: gLogger.notice('\nAll Site Mask Logging Info\n') sitesLogging = result['Value'] if isinstance(sitesLogging, dict): for siteName, tupleList in sitesLogging.items( ): # can be an iterator if not siteName: gLogger.notice('\n===> %s\n' % siteName) for tup in tupleList: stup = str(tup[0]).ljust(8) + str(tup[1]).ljust(20) stup += '( ' + str(tup[2]).ljust(len(str( tup[2]))) + ' ) "' + str(tup[3]) + '"' gLogger.notice(stup) gLogger.notice(' ') elif isinstance(sitesLogging, list): sitesLoggingList = [(sl[1], sl[3], sl[4]) for sl in sitesLogging] for siteLog in sitesLoggingList: gLogger.notice(siteLog) return S_OK() ############################################################################# def banSite(self, site, comment, printOutput=False): """Removes the site from the site mask. Example usage: >>> gLogger.notice(diracAdmin.banSite()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result mask = self.getSiteMask(status='Banned') if not mask['OK']: return mask siteMask = mask['Value'] if site in siteMask: if printOutput: gLogger.notice('Site %s is already Banned' % site) return S_OK('Site %s is already Banned' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Banned', comment) else: result = WMSAdministratorClient().banSite(site, comment) if not result['OK']: return result if printOutput: gLogger.notice('Site %s status is set to Banned' % site) return result ############################################################################# def __checkSiteIsValid(self, site): """Internal function to check that a site name is valid. """ if isinstance(site, (list, set, dict)): site = set(site) - self._siteSet if not site: return S_OK() elif site in self._siteSet: return S_OK() return S_ERROR('Specified site %s is not in list of defined sites' % str(site)) ############################################################################# def getServicePorts(self, setup='', printOutput=False): """Checks the service ports for the specified setup. If not given this is taken from the current installation (/DIRAC/Setup) Example usage: >>> gLogger.notice(diracAdmin.getServicePorts()) {'OK': True, 'Value':''} :return: S_OK,S_ERROR """ if not setup: setup = gConfig.getValue('/DIRAC/Setup', '') setupList = gConfig.getSections('/DIRAC/Setups', []) if not setupList['OK']: return S_ERROR('Could not get /DIRAC/Setups sections') setupList = setupList['Value'] if setup not in setupList: return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList))) serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup) if not serviceSetups['OK']: return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup) serviceSetups = serviceSetups['Value'] # dict systemList = gConfig.getSections('/Systems') if not systemList['OK']: return S_ERROR('Could not get Systems sections') systemList = systemList['Value'] result = {} for system in systemList: if system in serviceSetups: path = '/Systems/%s/%s/Services' % (system, serviceSetups[system]) servicesList = gConfig.getSections(path) if not servicesList['OK']: self.log.warn('Could not get sections in %s' % path) else: servicesList = servicesList['Value'] if not servicesList: servicesList = [] self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList))) for service in servicesList: spath = '%s/%s/Port' % (path, service) servicePort = gConfig.getValue(spath, 0) if servicePort: self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort)) result['%s/%s' % (system, service)] = servicePort else: self.log.warn('No port found for %s' % spath) else: self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup)) if printOutput: gLogger.notice(self.pPrint.pformat(result)) return S_OK(result) ############################################################################# def getProxy(self, userDN, userGroup, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and stores this in a file in the local directory by default. Example usage: >>> gLogger.notice(diracAdmin.getProxy()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadProxy(userDN, userGroup, limited=limited, requiredTimeLeft=validity) ############################################################################# def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and VOMS extensions and stores this in a file in the local directory by default. Example usage: >>> gLogger.notice(diracAdmin.getVOMSProxy()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited, requiredVOMSAttribute=vomsAttr, requiredTimeLeft=validity) ############################################################################# def getPilotProxy(self, userDN, userGroup, validity=43200): """Retrieves a pilot proxy with default 12hr validity and stores this in a file in the local directory by default. Example usage: >>> gLogger.notice(diracAdmin.getVOMSProxy()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.getPilotProxyFromDIRACGroup( userDN, userGroup, requiredTimeLeft=validity) ############################################################################# def resetJob(self, jobID): """Reset a job or list of jobs in the WMS. This operation resets the reschedule counter for a job or list of jobs and allows them to run as new. Example:: >>> gLogger.notice(dirac.reset(12345)) {'OK': True, 'Value': [12345]} :param job: JobID :type job: integer or list of integers :return: S_OK,S_ERROR """ if isinstance(jobID, six.string_types): try: jobID = int(jobID) except Exception as x: return self._errorReport( str(x), 'Expected integer or convertible integer for existing jobID' ) elif isinstance(jobID, list): try: jobID = [int(job) for job in jobID] except Exception as x: return self._errorReport( str(x), 'Expected integer or convertible integer for existing jobIDs' ) result = JobManagerClient(useCertificates=False).resetJob(jobID) return result ############################################################################# def getJobPilotOutput(self, jobID, directory=''): """Retrieve the pilot output for an existing job in the WMS. The output will be retrieved in a local directory unless otherwise specified. >>> gLogger.notice(dirac.getJobPilotOutput(12345)) {'OK': True, StdOut:'',StdError:''} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) result = WMSAdministratorClient().getJobPilotOutput(jobID) if not result['OK']: return result outputPath = '%s/pilot_%s' % (directory, jobID) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if 'StdOut' in outputs: stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.verbose('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if 'StdError' in outputs: stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdError']) self.log.verbose('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotOutput(self, gridReference, directory=''): """Retrieve the pilot output (std.out and std.err) for an existing job in the WMS. >>> gLogger.notice(dirac.getJobPilotOutput(12345)) {'OK': True, 'Value': {}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not isinstance(gridReference, six.string_types): return self._errorReport('Expected string for pilot reference') if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) result = PilotManagerClient().getPilotOutput(gridReference) if not result['OK']: return result gridReferenceSmall = gridReference.split('/')[-1] if not gridReferenceSmall: gridReferenceSmall = 'reference' outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if 'StdOut' in outputs: stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.info('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if 'StdErr' in outputs: stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdErr']) self.log.info('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotInfo(self, gridReference): """Retrieve info relative to a pilot reference >>> gLogger.notice(dirac.getPilotInfo(12345)) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :type gridReference: string :return: S_OK,S_ERROR """ if not isinstance(gridReference, six.string_types): return self._errorReport('Expected string for pilot reference') result = PilotManagerClient().getPilotInfo(gridReference) return result ############################################################################# def killPilot(self, gridReference): """Kill the pilot specified >>> gLogger.notice(dirac.getPilotInfo(12345)) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :return: S_OK,S_ERROR """ if not isinstance(gridReference, six.string_types): return self._errorReport('Expected string for pilot reference') result = PilotManagerClient().killPilot(gridReference) return result ############################################################################# def getPilotLoggingInfo(self, gridReference): """Retrieve the pilot logging info for an existing job in the WMS. >>> gLogger.notice(dirac.getPilotLoggingInfo(12345)) {'OK': True, 'Value': {"The output of the command"}} :param gridReference: Gridp pilot job reference Id :type gridReference: string :return: S_OK,S_ERROR """ if not isinstance(gridReference, six.string_types): return self._errorReport('Expected string for pilot reference') return PilotManagerClient().getPilotLoggingInfo(gridReference) ############################################################################# def getJobPilots(self, jobID): """Extract the list of submitted pilots and their status for a given jobID from the WMS. Useful information is printed to the screen. >>> gLogger.notice(dirac.getJobPilots()) {'OK': True, 'Value': {PilotID:{StatusDict}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if isinstance(jobID, six.string_types): try: jobID = int(jobID) except Exception as x: return self._errorReport( str(x), 'Expected integer or string for existing jobID') result = PilotManagerClient().getPilots(jobID) if result['OK']: gLogger.notice(self.pPrint.pformat(result['Value'])) return result ############################################################################# def getPilotSummary(self, startDate='', endDate=''): """Retrieve the pilot output for an existing job in the WMS. Summary is printed at INFO level, full dictionary of results also returned. >>> gLogger.notice(dirac.getPilotSummary()) {'OK': True, 'Value': {CE:{Status:Count}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ result = PilotManagerClient().getPilotSummary(startDate, endDate) if not result['OK']: return result ceDict = result['Value'] headers = 'CE'.ljust(28) i = 0 for ce, summary in ceDict.iteritems(): states = summary.keys() if len(states) > i: i = len(states) for i in xrange(i): headers += 'Status'.ljust(12) + 'Count'.ljust(12) gLogger.notice(headers) for ce, summary in ceDict.iteritems(): line = ce.ljust(28) states = sorted(summary) for state in states: count = str(summary[state]) line += state.ljust(12) + count.ljust(12) gLogger.notice(line) return result ############################################################################# def setSiteProtocols(self, site, protocolsList, printOutput=False): """ Allows to set the defined protocols for each SE for a given site. """ result = self.__checkSiteIsValid(site) if not result['OK']: return result siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site) siteSEs = gConfig.getValue(siteSection, []) if not siteSEs: return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection)) defaultProtocols = gConfig.getValue( '/Resources/StorageElements/DefaultProtocols', []) self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols)) for protocol in protocolsList: if protocol not in defaultProtocols: return S_ERROR( 'Requested to set protocol %s in list but %s is not ' 'in default list of protocols:\n%s' % (protocol, protocol, ', '.join(defaultProtocols))) modifiedCS = False result = promptUser( 'Do you want to add the following default protocols:' ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs))) if not result['OK']: return result if result['Value'].lower() != 'y': self.log.always('No protocols will be added') return S_OK() for se in siteSEs: sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se)) if not sections['OK']: return sections for section in sections['Value']: if gConfig.getValue( '/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2': path = '/Resources/StorageElements/%s/%s/ProtocolsList' % ( se, section) self.log.verbose('Setting %s to %s' % (path, ', '.join(protocolsList))) result = self.csSetOption(path, ', '.join(protocolsList)) if not result['OK']: return result modifiedCS = True if modifiedCS: result = self.csCommitChanges(False) if not result['OK']: return S_ERROR('CS Commit failed with message = %s' % (result['Message'])) else: if printOutput: gLogger.notice('Successfully committed changes to CS') else: if printOutput: gLogger.notice('No modifications to CS required') return S_OK() ############################################################################# def csSetOption(self, optionPath, optionValue): """ Function to modify an existing value in the CS. """ return self.csAPI.setOption(optionPath, optionValue) ############################################################################# def csSetOptionComment(self, optionPath, comment): """ Function to modify an existing value in the CS. """ return self.csAPI.setOptionComment(optionPath, comment) ############################################################################# def csModifyValue(self, optionPath, newValue): """ Function to modify an existing value in the CS. """ return self.csAPI.modifyValue(optionPath, newValue) ############################################################################# def csRegisterUser(self, username, properties): """ Registers a user in the CS. - username: Username of the user (easy;) - properties: Dict containing: - DN - groups : list/tuple of groups the user belongs to - <others> : More properties of the user, like mail """ return self.csAPI.addUser(username, properties) ############################################################################# def csDeleteUser(self, user): """ Deletes a user from the CS. Can take a list of users """ return self.csAPI.deleteUsers(user) ############################################################################# def csModifyUser(self, username, properties, createIfNonExistant=False): """ Modify a user in the CS. Takes the same params as in addUser and applies the changes """ return self.csAPI.modifyUser(username, properties, createIfNonExistant) ############################################################################# def csListUsers(self, group=False): """ Lists the users in the CS. If no group is specified return all users. """ return self.csAPI.listUsers(group) ############################################################################# def csDescribeUsers(self, mask=False): """ List users and their properties in the CS. If a mask is given, only users in the mask will be returned """ return self.csAPI.describeUsers(mask) ############################################################################# def csModifyGroup(self, groupname, properties, createIfNonExistant=False): """ Modify a user in the CS. Takes the same params as in addGroup and applies the changes """ return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant) ############################################################################# def csListHosts(self): """ Lists the hosts in the CS """ return self.csAPI.listHosts() ############################################################################# def csDescribeHosts(self, mask=False): """ Gets extended info for the hosts in the CS """ return self.csAPI.describeHosts(mask) ############################################################################# def csModifyHost(self, hostname, properties, createIfNonExistant=False): """ Modify a host in the CS. Takes the same params as in addHost and applies the changes """ return self.csAPI.modifyHost(hostname, properties, createIfNonExistant) ############################################################################# def csListGroups(self): """ Lists groups in the CS """ return self.csAPI.listGroups() ############################################################################# def csDescribeGroups(self, mask=False): """ List groups and their properties in the CS. If a mask is given, only groups in the mask will be returned """ return self.csAPI.describeGroups(mask) ############################################################################# def csSyncUsersWithCFG(self, usersCFG): """ Synchronize users in cfg with its contents """ return self.csAPI.syncUsersWithCFG(usersCFG) ############################################################################# def csCommitChanges(self, sortUsers=True): """ Commit the changes in the CS """ return self.csAPI.commitChanges(sortUsers=False) ############################################################################# def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False): """ Send mail to specified address with body. """ notification = NotificationClient() return notification.sendMail(address, subject, body, fromAddress, localAttempt, html) ############################################################################# def sendSMS(self, userName, body, fromAddress=None): """ Send mail to specified address with body. """ if len(body) > 160: return S_ERROR('Exceeded maximum SMS length of 160 characters') notification = NotificationClient() return notification.sendSMS(userName, body, fromAddress) ############################################################################# def getBDIISite(self, site, host=None): """ Get information about site from BDII at host """ return ldapSite(site, host=host) ############################################################################# def getBDIICluster(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapCluster(ce, host=host) ############################################################################# def getBDIICE(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapCE(ce, host=host) ############################################################################# def getBDIIService(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapService(ce, host=host) ############################################################################# def getBDIICEState(self, ce, useVO=voName, host=None): """ Get information about ce state from BDII at host """ return ldapCEState(ce, useVO, host=host) ############################################################################# def getBDIICEVOView(self, ce, useVO=voName, host=None): """ Get information about ce voview from BDII at host """ return ldapCEVOView(ce, useVO, host=host)
class PilotDirector( object ): """ Base Pilot Director class. Derived classes must implement: * __init__( self, submitPool ): that must call the parent class __init__ method and then do its own initialization * configure( self, csSection, submitPool ): that must call the parent class configure method and the do its own configuration * _submitPilot( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ) * _listMatch( self, proxy, jdl, taskQueueID, rb ) * _getChildrenReferences( self, proxy, parentReference, taskQueueID ) Derived classes might implement: * configureFromSection( self, mySection ): to reload from a CS section the additional datamembers they might have defined. If additional datamembers are defined, they must: - be declared in the __init__ - be reconfigured in the configureFromSection method by executing self.reloadConfiguration( csSection, submitPool ) in their configure method """ gridMiddleware = '' def __init__( self, submitPool ): """ Define the logger and some defaults """ if submitPool == self.gridMiddleware: self.log = gLogger.getSubLogger( '%sPilotDirector' % self.gridMiddleware ) else: self.log = gLogger.getSubLogger( '%sPilotDirector/%s' % ( self.gridMiddleware, submitPool ) ) self.pilot = DIRAC_PILOT self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool self.extraPilotOptions = [] self.installVersion = DIRAC_VERSION self.installProject = DIRAC_PROJECT self.installation = DIRAC_INSTALLATION self.pilotExtensionsList = [] self.virtualOrganization = VIRTUAL_ORGANIZATION self.install = DIRAC_INSTALL self.extraModules = DIRAC_MODULES self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE self.targetGrids = [ self.gridMiddleware ] self.enableListMatch = ENABLE_LISTMATCH self.listMatchDelay = LISTMATCH_DELAY self.listMatchCache = DictCache() self.privatePilotFraction = PRIVATE_PILOT_FRACTION self.errorClearTime = ERROR_CLEAR_TIME self.errorTicketTime = ERROR_TICKET_TIME self.errorMailAddress = DIRAC.errorMail self.alarmMailAddress = DIRAC.alarmMail self.mailFromAddress = FROM_MAIL self.siteClient = SiteStatus() if not 'log' in self.__dict__: self.log = gLogger.getSubLogger( 'PilotDirector' ) self.log.info( 'Initialized' ) def configure( self, csSection, submitPool ): """ Here goes common configuration for all PilotDirectors """ self.configureFromSection( csSection ) self.reloadConfiguration( csSection, submitPool ) # Get the defaults for the Setup where the Director is running opsHelper = Operations() self.installVersion = opsHelper.getValue( cfgPath( 'Pilot', 'Version' ), [ self.installVersion ] )[0] self.installProject = opsHelper.getValue( cfgPath( 'Pilot', 'Project' ), self.installProject ) self.installation = opsHelper.getValue( cfgPath( 'Pilot', 'Installation' ), self.installation ) self.pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", self.pilotExtensionsList ) self.log.info( '===============================================' ) self.log.info( 'Configuration:' ) self.log.info( '' ) self.log.info( ' Target Grids: ', ', '.join( self.targetGrids ) ) self.log.info( ' Install script: ', self.install ) self.log.info( ' Pilot script: ', self.pilot ) self.log.info( ' Pilot modules', self.extraModules ) self.log.info( ' Install Ver: ', self.installVersion ) if self.installProject: self.log.info( ' Project: ', self.installProject ) if self.installation: self.log.info( ' Installation: ', self.installation ) if self.extraPilotOptions: self.log.info( ' Extra Options: ', ' '.join( self.extraPilotOptions ) ) self.log.info( ' ListMatch: ', self.enableListMatch ) self.log.info( ' Private %: ', self.privatePilotFraction * 100 ) if self.enableListMatch: self.log.info( ' ListMatch Delay:', self.listMatchDelay ) self.listMatchCache.purgeExpired() def reloadConfiguration( self, csSection, submitPool ): """ Common Configuration can be overwriten for each GridMiddleware """ mySection = csSection + '/' + self.gridMiddleware self.configureFromSection( mySection ) # And Again for each SubmitPool mySection = csSection + '/' + submitPool self.configureFromSection( mySection ) def configureFromSection( self, mySection ): """ reload from CS """ self.pilot = gConfig.getValue( mySection + '/PilotScript' , self.pilot ) self.installVersion = gConfig.getValue( mySection + '/Version' , self.installVersion ) self.extraPilotOptions = gConfig.getValue( mySection + '/ExtraPilotOptions' , self.extraPilotOptions ) self.install = gConfig.getValue( mySection + '/InstallScript' , self.install ) self.extraModules = gConfig.getValue( mySection + '/ExtraPilotModules' , [] ) + self.extraModules self.installProject = gConfig.getValue( mySection + '/Project' , self.installProject ) self.installation = gConfig.getValue( mySection + '/Installation' , self.installation ) self.maxJobsInFillMode = gConfig.getValue( mySection + '/MaxJobsInFillMode' , self.maxJobsInFillMode ) self.targetGrids = gConfig.getValue( mySection + '/TargetGrids' , self.targetGrids ) self.enableListMatch = gConfig.getValue( mySection + '/EnableListMatch' , self.enableListMatch ) self.listMatchDelay = gConfig.getValue( mySection + '/ListMatchDelay' , self.listMatchDelay ) self.errorClearTime = gConfig.getValue( mySection + '/ErrorClearTime' , self.errorClearTime ) self.errorTicketTime = gConfig.getValue( mySection + '/ErrorTicketTime' , self.errorTicketTime ) self.errorMailAddress = gConfig.getValue( mySection + '/ErrorMailAddress' , self.errorMailAddress ) self.alarmMailAddress = gConfig.getValue( mySection + '/AlarmMailAddress' , self.alarmMailAddress ) self.mailFromAddress = gConfig.getValue( mySection + '/MailFromAddress' , self.mailFromAddress ) self.privatePilotFraction = gConfig.getValue( mySection + '/PrivatePilotFraction' , self.privatePilotFraction ) virtualOrganization = gConfig.getValue( mySection + '/VirtualOrganization' , '' ) if not virtualOrganization: virtualOrganization = getVOForGroup( 'NonExistingGroup' ) if not virtualOrganization: virtualOrganization = self.virtualOrganization self.virtualOrganization = virtualOrganization def _resolveCECandidates( self, taskQueueDict ): """ Return a list of CEs for this TaskQueue """ # assume user knows what they're doing and avoid site mask e.g. sam jobs if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']: self.log.info( 'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( taskQueueDict['GridCEs'] ) ) return taskQueueDict['GridCEs'] # Get the mask ret = self.siteClient.getSites() if not ret['OK']: self.log.error( 'Can not retrieve site Mask from DB:', ret['Message'] ) return [] siteMask = ret['Value'] if not siteMask: self.log.error( 'Site mask is empty' ) return [] self.log.verbose( 'Site Mask: %s' % ', '.join( siteMask ) ) # remove banned sites from siteMask if 'BannedSites' in taskQueueDict: for site in taskQueueDict['BannedSites']: if site in siteMask: siteMask.remove( site ) self.log.verbose( 'Removing banned site %s from site Mask' % site ) # remove from the mask if a Site is given siteMask = [ site for site in siteMask if 'Sites' not in taskQueueDict or site in taskQueueDict['Sites'] ] if not siteMask: # pilot can not be submitted self.log.info( 'No Valid Site Candidate in Mask for TaskQueue %s' % taskQueueDict['TaskQueueID'] ) return [] self.log.info( 'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) ) # Get CE's associates to the given site Names ceMask = [] for grid in self.targetGrids: section = '/Resources/Sites/%s' % grid ret = gConfig.getSections( section ) if not ret['OK']: # this is hack, maintained until LCG is added as TargetGrid for the gLite SubmitPool section = '/Resources/Sites/LCG' ret = gConfig.getSections( section ) if not ret['OK']: self.log.error( 'Could not obtain CEs from CS', ret['Message'] ) continue gridSites = ret['Value'] for siteName in gridSites: if siteName in siteMask: ret = gConfig.getValue( '%s/%s/CE' % ( section, siteName ), [] ) for ce in ret: submissionMode = gConfig.getValue( '%s/%s/CEs/%s/SubmissionMode' % ( section, siteName, ce ), 'gLite' ) if submissionMode == self.gridMiddleware and ce not in ceMask: ceMask.append( ce ) if not ceMask: self.log.info( 'No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) ) self.log.verbose( 'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( ceMask ) ) return ceMask def _getPilotOptions( self, taskQueueDict, pilotsToSubmit ): # Need to limit the maximum number of pilots to submit at once # For generic pilots this is limited by the number of use of the tokens and the # maximum number of jobs in Filling mode, but for private Jobs we need an extra limitation: pilotsToSubmit = max( min( pilotsToSubmit, int( 50 / self.maxJobsInFillMode ) ), 1 ) pilotOptions = [] privateIfGenericTQ = self.privatePilotFraction > random.random() privateTQ = ( 'PilotTypes' in taskQueueDict and 'private' in [ t.lower() for t in taskQueueDict['PilotTypes'] ] ) forceGeneric = 'ForceGeneric' in taskQueueDict submitPrivatePilot = ( privateIfGenericTQ or privateTQ ) and not forceGeneric if submitPrivatePilot: self.log.verbose( 'Submitting private pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] ) ownerDN = taskQueueDict['OwnerDN'] ownerGroup = taskQueueDict['OwnerGroup'] # User Group requirement pilotOptions.append( '-G %s' % taskQueueDict['OwnerGroup'] ) # check if group allows jobsharing ownerGroupProperties = getPropertiesForGroup( ownerGroup ) if not 'JobSharing' in ownerGroupProperties: # Add Owner requirement to pilot pilotOptions.append( "-O '%s'" % ownerDN ) if privateTQ: pilotOptions.append( '-o /Resources/Computing/CEDefaults/PilotType=private' ) maxJobsInFillMode = self.maxJobsInFillMode else: #For generic jobs we'll submit mixture of generic and private pilots self.log.verbose( 'Submitting generic pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] ) #ADRI: Find the generic group result = findGenericPilotCredentials( group = taskQueueDict[ 'OwnerGroup' ] ) if not result[ 'OK' ]: self.log.error( ERROR_GENERIC_CREDENTIALS, result[ 'Message' ] ) return S_ERROR( ERROR_GENERIC_CREDENTIALS ) ownerDN, ownerGroup = result[ 'Value' ] result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) ) if not result[ 'OK' ]: self.log.error( ERROR_TOKEN, result['Message'] ) return S_ERROR( ERROR_TOKEN ) ( token, numberOfUses ) = result[ 'Value' ] pilotsToSubmit = min( numberOfUses, pilotsToSubmit ) pilotOptions.append( '-o /Security/ProxyToken=%s' % token ) pilotsToSubmit = max( 1, ( pilotsToSubmit - 1 ) / self.maxJobsInFillMode + 1 ) maxJobsInFillMode = int( numberOfUses / pilotsToSubmit ) # Use Filling mode pilotOptions.append( '-M %s' % maxJobsInFillMode ) # Debug pilotOptions.append( '-d' ) # Setup. pilotOptions.append( '-S %s' % taskQueueDict['Setup'] ) # CS Servers csServers = gConfig.getServersList() if len( csServers ) > 3: # Remove the master master = gConfigurationData.getMasterServer() if master in csServers: csServers.remove( master ) pilotOptions.append( '-C %s' % ",".join( csServers ) ) # DIRAC Extensions to be used in pilots # ubeda: I'm not entirely sure if we can use here the same opsHelper as in line # line +352 pilotExtensionsList = Operations().getValue( "Pilot/Extensions", [] ) extensionsList = [] if pilotExtensionsList: if pilotExtensionsList[0] != 'None': extensionsList = pilotExtensionsList else: extensionsList = getCSExtensions() if extensionsList: pilotOptions.append( '-e %s' % ",".join( extensionsList ) ) #Get DIRAC version and project, There might be global Setup defaults and per VO/Setup defaults (from configure) opsHelper = Operations( group = taskQueueDict['OwnerGroup'], setup = taskQueueDict['Setup'] ) # Requested version of DIRAC (it can be a list, so we take the fist one) version = opsHelper.getValue( cfgPath( 'Pilot', 'Version' ) , [ self.installVersion ] )[0] pilotOptions.append( '-r %s' % version ) # Requested Project to install installProject = opsHelper.getValue( cfgPath( 'Pilot', 'Project' ) , self.installProject ) if installProject: pilotOptions.append( '-l %s' % installProject ) installation = opsHelper.getValue( cfgPath( 'Pilot', 'Installation' ), self.installation ) if installation: pilotOptions.append( "-V %s" % installation ) # Requested CPU time pilotOptions.append( '-T %s' % taskQueueDict['CPUTime'] ) if self.submitPoolOption not in self.extraPilotOptions: pilotOptions.append( self.submitPoolOption ) if self.extraPilotOptions: pilotOptions.extend( self.extraPilotOptions ) return S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup, submitPrivatePilot, privateTQ ) ) def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ): """ This method must be implemented on the Backend specific derived class. This is problem with the Director, not with the Job so we must return S_OK Return S_ERROR if not defined. """ self.log.error( '_submitPilots method not implemented' ) return S_OK() def _submitPilot( self, proxy, pilotsToSubmit, jdl, taskQueueID, rb ): """ Submit pilot and get back the reference """ self.log.error( '_submitPilot method not implemented' ) return S_OK() def _listMatch( self, proxy, jdl, taskQueueID, rb ): """ This method must be implemented on the Backend specific derived class. """ self.log.error( '_listMatch method not implemented' ) return S_OK() def _getChildrenReferences( self, proxy, parentReference, taskQueueID ): """ This method must be implemented on the Backend specific derived class. """ self.log.error( '_getChildrenReferences method not implemented' ) return S_OK() def submitPilots( self, taskQueueDict, pilotsToSubmit, workDir = None ): """ Submit pilot for the given TaskQueue, this method just insert the request in the corresponding ThreadPool, the submission is done from the Thread Pool job """ try: taskQueueID = taskQueueDict['TaskQueueID'] self.log.verbose( 'Submitting Pilot' ) ceMask = self._resolveCECandidates( taskQueueDict ) if not ceMask: return S_ERROR( 'No CE available for TaskQueue %d' % int( taskQueueID ) ) result = self._getPilotOptions( taskQueueDict, pilotsToSubmit ) if not result['OK']: return result ( pilotOptions, pilotsPerJob, ownerDN, ownerGroup, submitPrivatePilot, privateTQ ) = result['Value'] # get a valid proxy, submit with a long proxy to avoid renewal ret = self._getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft = 86400 * 5 ) if not ret['OK']: self.log.error( ret['Message'] ) self.log.error( 'No proxy Available', 'User "%s", Group "%s"' % ( ownerDN, ownerGroup ) ) return S_ERROR( ERROR_PROXY ) proxy = ret['Value'] # Now call a Grid Specific method to handle the final submission of the pilots return self._submitPilots( workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ) except Exception: self.log.exception( 'Error in Pilot Submission' ) return S_OK( 0 ) def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ): """ To be overwritten if a given Pilot does not require a full proxy """ self.log.info( "Downloading %s@%s proxy" % ( ownerDN, ownerGroup ) ) return gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft ) def exceptionCallBack( self, threadedJob, exceptionInfo ): self.log.exception( 'Error in Pilot Submission' )
class SiteInspectorAgent( AgentModule ): """ SiteInspectorAgent The SiteInspectorAgent agent is an agent that is used to get the all the site names and trigger PEP to evaluate their status. """ # Max number of worker threads by default __maxNumberOfThreads = 15 # Inspection freqs, defaults, the lower, the higher priority to be checked. # Error state usually means there is a glitch somewhere, so it has the highest # priority. __checkingFreqs = {'Active' : 20, 'Degraded' : 20, 'Probing' : 20, 'Banned' : 15, 'Unknown' : 10, 'Error' : 5} def __init__( self, *args, **kwargs ): AgentModule.__init__( self, *args, **kwargs ) # ElementType, to be defined among Site, Resource or Node self.sitesToBeChecked = None self.threadPool = None self.siteClient = None self.clients = {} def initialize( self ): """ Standard initialize. """ maxNumberOfThreads = self.am_getOption( 'maxNumberOfThreads', self.__maxNumberOfThreads ) self.threadPool = ThreadPool( maxNumberOfThreads, maxNumberOfThreads ) self.siteClient = SiteStatus() self.clients['SiteStatus'] = self.siteClient self.clients['ResourceManagementClient'] = ResourceManagementClient() return S_OK() def execute( self ): """ execute This is the main method of the agent. It gets the sites from the Database, calculates how many threads should be started and spawns them. Each thread will get a site from the queue until it is empty. At the end, the method will join the queue such that the agent will not terminate a cycle until all sites have been processed. """ # Gets sites to be checked ( returns a Queue ) sitesToBeChecked = self.getSitesToBeChecked() if not sitesToBeChecked['OK']: self.log.error( sitesToBeChecked['Message'] ) return sitesToBeChecked self.sitesToBeChecked = sitesToBeChecked['Value'] queueSize = self.sitesToBeChecked.qsize() pollingTime = self.am_getPollingTime() # Assigns number of threads on the fly such that we exhaust the PollingTime # without having to spawn too many threads. We assume 10 seconds per element # to be processed ( actually, it takes something like 1 sec per element ): # numberOfThreads = elements * 10(s/element) / pollingTime numberOfThreads = int( math.ceil( queueSize * 10. / pollingTime ) ) self.log.info( 'Needed %d threads to process %d elements' % ( numberOfThreads, queueSize ) ) for _x in xrange( numberOfThreads ): jobUp = self.threadPool.generateJobAndQueueIt( self._execute ) if not jobUp['OK']: self.log.error( jobUp['Message'] ) self.log.info( 'blocking until all sites have been processed' ) # block until all tasks are done self.sitesToBeChecked.join() self.log.info( 'done') return S_OK() def getSitesToBeChecked( self ): """ getElementsToBeChecked This method gets all the site names from the SiteStatus table, after that it get the details of each site (status, name, etc..) and adds them to a queue. """ toBeChecked = Queue.Queue() res = self.siteClient.getSites('All') if not res['OK']: return res # get the current status res = self.siteClient.getSiteStatuses( res['Value'] ) if not res['OK']: return res # filter elements for site in res['Value']: status = res['Value'].get(site, 'Unknown') toBeChecked.put( { 'status': status, 'name': site, 'site' : site, 'element' : 'Site', 'statusType': 'all', 'elementType': 'Site' } ) return S_OK( toBeChecked ) # Private methods ............................................................ def _execute( self ): """ Method run by each of the thread that is in the ThreadPool. It enters a loop until there are no sites on the queue. On each iteration, it evaluates the policies for such site and enforces the necessary actions. If there are no more sites in the queue, the loop is finished. """ pep = PEP( clients = self.clients ) while True: try: site = self.sitesToBeChecked.get_nowait() except Queue.Empty: return S_OK() resEnforce = pep.enforce( site ) if not resEnforce['OK']: self.log.error( 'Failed policy enforcement', resEnforce['Message'] ) self.sitesToBeChecked.task_done() continue # Used together with join ! self.sitesToBeChecked.task_done()
class DiracAdmin(API): """ Administrative functionalities """ ############################################################################# def __init__(self): """Internal initialization of the DIRAC Admin API. """ super(DiracAdmin, self).__init__() self.csAPI = CSAPI() self.dbg = False if gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG': self.dbg = True self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp') self.currentDir = os.getcwd() self.rssFlag = ResourceStatus().rssFlag self.sitestatus = SiteStatus() ############################################################################# def uploadProxy(self, group): """Upload a proxy to the DIRAC WMS. This method Example usage: >>> print diracAdmin.uploadProxy('lhcb_pilot') {'OK': True, 'Value': 0L} :param group: DIRAC Group :type job: string :return: S_OK,S_ERROR :param permanent: Indefinitely update proxy :type permanent: boolean """ return gProxyManager.uploadProxy(diracGroup=group) ############################################################################# def setProxyPersistency(self, userDN, userGroup, persistent=True): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True ) {'OK': True } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param persistent: Persistent flag :type persistent: boolean :return: S_OK,S_ERROR """ return gProxyManager.setPersistency(userDN, userGroup, persistent) ############################################################################# def checkProxyUploaded(self, userDN, userGroup, requiredTime): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True ) {'OK': True, 'Value' : True/False } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param requiredTime: Required life time of the uploaded proxy :type requiredTime: boolean :return: S_OK,S_ERROR """ return gProxyManager.userHasProxy(userDN, userGroup, requiredTime) ############################################################################# def getSiteMask(self, printOutput=False, status='Active'): """Retrieve current site mask from WMS Administrator service. Example usage: >>> print diracAdmin.getSiteMask() {'OK': True, 'Value': 0L} :return: S_OK,S_ERROR """ result = self.sitestatus.getSites(siteState=status) if result['OK']: sites = result['Value'] if printOutput: sites.sort() for site in sites: print site return result ############################################################################# def getBannedSites(self, printOutput=False): """Retrieve current list of banned and probing sites. Example usage: >>> print diracAdmin.getBannedSites() {'OK': True, 'Value': []} :return: S_OK,S_ERROR """ bannedSites = self.sitestatus.getSites(siteState='Banned') if not bannedSites['OK']: return bannedSites probingSites = self.sitestatus.getSites(siteState='Probing') if not probingSites['OK']: return probingSites mergedList = sorted(bannedSites['Value'] + probingSites['Value']) if printOutput: print '\n'.join(mergedList) return S_OK(mergedList) ############################################################################# def getSiteSection(self, site, printOutput=False): """Simple utility to get the list of CEs for DIRAC site name. Example usage: >>> print diracAdmin.getSiteSection('LCG.CERN.ch') {'OK': True, 'Value':} :return: S_OK,S_ERROR """ gridType = site.split('.')[0] if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']: return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType)) result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site)) if printOutput and result['OK']: print self.pPrint.pformat(result['Value']) return result ############################################################################# def allowSite(self, site, comment, printOutput=False): """Adds the site to the site mask. Example usage: >>> print diracAdmin.allowSite() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result result = self.getSiteMask(status='Active') if not result['OK']: return result siteMask = result['Value'] if site in siteMask: if printOutput: print 'Site %s is already Active' % site return S_OK('Site %s is already Active' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Active', comment) else: wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.allowSite(site, comment) if not result['OK']: return result if printOutput: print 'Site %s status is set to Active' % site return result ############################################################################# def getSiteMaskLogging(self, site=None, printOutput=False): """Retrieves site mask logging information. Example usage: >>> print diracAdmin.getSiteMaskLogging('LCG.AUVER.fr') {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result if self.rssFlag: result = ResourceStatusClient().selectStatusElement('Site', 'History', name=site) else: result = RPCClient('WorkloadManagement/WMSAdministrator').getSiteMaskLogging(site) if not result['OK']: return result if printOutput: if site: print '\nSite Mask Logging Info for %s\n' % site else: print '\nAll Site Mask Logging Info\n' sitesLogging = result['Value'] if isinstance(sitesLogging, dict): for siteName, tupleList in sitesLogging.iteritems(): if not siteName: print '\n===> %s\n' % siteName for tup in tupleList: print str(tup[0]).ljust(8) + str(tup[1]).ljust(20) + \ '( ' + str(tup[2]).ljust(len(str(tup[2]))) + ' ) "' + str(tup[3]) + '"' print ' ' elif isinstance(sitesLogging, list): result = [(sl[1], sl[3], sl[4]) for sl in sitesLogging] return result ############################################################################# def banSite(self, site, comment, printOutput=False): """Removes the site from the site mask. Example usage: >>> print diracAdmin.banSite() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result mask = self.getSiteMask(status='Banned') if not mask['OK']: return mask siteMask = mask['Value'] if site in siteMask: if printOutput: print 'Site %s is already Banned' % site return S_OK('Site %s is already Banned' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Banned', comment) else: wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.banSite(site, comment) if not result['OK']: return result if printOutput: print 'Site %s status is set to Banned' % site return result ############################################################################# def __checkSiteIsValid(self, site): """Internal function to check that a site name is valid. """ sites = getSiteCEMapping() if not sites['OK']: return S_ERROR('Could not get site CE mapping') siteList = sites['Value'].keys() if site not in siteList: return S_ERROR('Specified site %s is not in list of defined sites' % site) return S_OK('%s is valid' % site) ############################################################################# def clearMask(self): """Removes all sites from the site mask. Should be used with care. Example usage: >>> print diracAdmin.clearMask() {'OK': True, 'Value':''} :return: S_OK,S_ERROR """ wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.clearMask() return result ############################################################################# def getServicePorts(self, setup='', printOutput=False): """Checks the service ports for the specified setup. If not given this is taken from the current installation (/DIRAC/Setup) Example usage: >>> print diracAdmin.getServicePorts() {'OK': True, 'Value':''} :return: S_OK,S_ERROR """ if not setup: setup = gConfig.getValue('/DIRAC/Setup', '') setupList = gConfig.getSections('/DIRAC/Setups', []) if not setupList['OK']: return S_ERROR('Could not get /DIRAC/Setups sections') setupList = setupList['Value'] if setup not in setupList: return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList))) serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup) if not serviceSetups['OK']: return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup) serviceSetups = serviceSetups['Value'] # dict systemList = gConfig.getSections('/Systems') if not systemList['OK']: return S_ERROR('Could not get Systems sections') systemList = systemList['Value'] result = {} for system in systemList: if system in serviceSetups: path = '/Systems/%s/%s/Services' % (system, serviceSetups[system]) servicesList = gConfig.getSections(path) if not servicesList['OK']: self.log.warn('Could not get sections in %s' % path) else: servicesList = servicesList['Value'] if not servicesList: servicesList = [] self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList))) for service in servicesList: spath = '%s/%s/Port' % (path, service) servicePort = gConfig.getValue(spath, 0) if servicePort: self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort)) result['%s/%s' % (system, service)] = servicePort else: self.log.warn('No port found for %s' % spath) else: self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup)) if printOutput: print self.pPrint.pformat(result) return S_OK(result) ############################################################################# def getProxy(self, userDN, userGroup, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and stores this in a file in the local directory by default. Example usage: >>> print diracAdmin.getProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadProxy(userDN, userGroup, limited=limited, requiredTimeLeft=validity) ############################################################################# def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and VOMS extensions and stores this in a file in the local directory by default. Example usage: >>> print diracAdmin.getVOMSProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited, requiredVOMSAttribute=vomsAttr, requiredTimeLeft=validity) ############################################################################# def getPilotProxy(self, userDN, userGroup, validity=43200): """Retrieves a pilot proxy with default 12hr validity and stores this in a file in the local directory by default. Example usage: >>> print diracAdmin.getVOMSProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.getPilotProxyFromDIRACGroup(userDN, userGroup, requiredTimeLeft=validity) ############################################################################# def resetJob(self, jobID): """Reset a job or list of jobs in the WMS. This operation resets the reschedule counter for a job or list of jobs and allows them to run as new. Example:: >>> print dirac.reset(12345) {'OK': True, 'Value': [12345]} :param job: JobID :type job: integer or list of integers :return: S_OK,S_ERROR """ if isinstance(jobID, basestring): try: jobID = int(jobID) except Exception as x: return self._errorReport(str(x), 'Expected integer or convertible integer for existing jobID') elif isinstance(jobID, list): try: jobID = [int(job) for job in jobID] except Exception as x: return self._errorReport(str(x), 'Expected integer or convertible integer for existing jobIDs') jobManager = RPCClient('WorkloadManagement/JobManager', useCertificates=False) result = jobManager.resetJob(jobID) return result ############################################################################# def getJobPilotOutput(self, jobID, directory=''): """Retrieve the pilot output for an existing job in the WMS. The output will be retrieved in a local directory unless otherwise specified. >>> print dirac.getJobPilotOutput(12345) {'OK': True, StdOut:'',StdError:''} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getJobPilotOutput(jobID) if not result['OK']: return result outputPath = '%s/pilot_%s' % (directory, jobID) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if 'StdOut' in outputs: stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.verbose('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if 'StdError' in outputs: stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdError']) self.log.verbose('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotOutput(self, gridReference, directory=''): """Retrieve the pilot output (std.out and std.err) for an existing job in the WMS. >>> print dirac.getJobPilotOutput(12345) {'OK': True, 'Value': {}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotOutput(gridReference) if not result['OK']: return result gridReferenceSmall = gridReference.split('/')[-1] if not gridReferenceSmall: gridReferenceSmall = 'reference' outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if 'StdOut' in outputs: stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.info('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if 'StdErr' in outputs: stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdErr']) self.log.info('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotInfo(self, gridReference): """Retrieve info relative to a pilot reference >>> print dirac.getPilotInfo(12345) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :type gridReference: string :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotInfo(gridReference) return result ############################################################################# def killPilot(self, gridReference): """Kill the pilot specified >>> print dirac.getPilotInfo(12345) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.killPilot(gridReference) return result ############################################################################# def getPilotLoggingInfo(self, gridReference): """Retrieve the pilot logging info for an existing job in the WMS. >>> print dirac.getPilotLoggingInfo(12345) {'OK': True, 'Value': {"The output of the command"}} :param gridReference: Gridp pilot job reference Id :type gridReference: string :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') return wmsAdmin.getPilotLoggingInfo(gridReference) ############################################################################# def getJobPilots(self, jobID): """Extract the list of submitted pilots and their status for a given jobID from the WMS. Useful information is printed to the screen. >>> print dirac.getJobPilots() {'OK': True, 'Value': {PilotID:{StatusDict}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if isinstance(jobID, basestring): try: jobID = int(jobID) except Exception as x: return self._errorReport(str(x), 'Expected integer or string for existing jobID') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilots(jobID) if result['OK']: print self.pPrint.pformat(result['Value']) return result ############################################################################# def getPilotSummary(self, startDate='', endDate=''): """Retrieve the pilot output for an existing job in the WMS. Summary is printed at INFO level, full dictionary of results also returned. >>> print dirac.getPilotSummary() {'OK': True, 'Value': {CE:{Status:Count}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotSummary(startDate, endDate) if not result['OK']: return result ceDict = result['Value'] headers = 'CE'.ljust(28) i = 0 for ce, summary in ceDict.iteritems(): states = summary.keys() if len(states) > i: i = len(states) for i in xrange(i): headers += 'Status'.ljust(12) + 'Count'.ljust(12) print headers for ce, summary in ceDict.iteritems(): line = ce.ljust(28) states = sorted(summary) for state in states: count = str(summary[state]) line += state.ljust(12) + count.ljust(12) print line return result ############################################################################# def selectRequests(self, jobID=None, requestID=None, requestName=None, requestType=None, status=None, operation=None, ownerDN=None, ownerGroup=None, requestStart=0, limit=100, printOutput=False): """Select requests from the request management system. A few notes on the selection criteria: - jobID is the WMS JobID for the request (if applicable) - requestID is assigned during submission of the request - requestName is the corresponding XML file name - requestType e.g. 'transfer' - status e.g. Done - operation e.g. replicateAndRegister - requestStart e.g. the first request to consider (start from 0 by default) - limit e.g. selection limit (default 100) >>> dirac.selectRequests(jobID='4894') {'OK': True, 'Value': [[<Requests>]]} """ options = {'RequestID': requestID, 'RequestName': requestName, 'JobID': jobID, 'OwnerDN': ownerDN, 'OwnerGroup': ownerGroup, 'RequestType': requestType, 'Status': status, 'Operation': operation} conditions = {} for key, value in options.iteritems(): if value: try: conditions[key] = str(value) except Exception as x: return self._errorReport(str(x), 'Expected string for %s field' % key) try: requestStart = int(requestStart) limit = int(limit) except Exception as x: return self._errorReport(str(x), 'Expected integer for %s field' % limit) self.log.verbose('Will select requests with the following conditions') self.log.verbose(self.pPrint.pformat(conditions)) requestClient = RPCClient("RequestManagement/centralURL") result = requestClient.getRequestSummaryWeb(conditions, [], requestStart, limit) if not result['OK']: self.log.warn(result['Message']) return result requestIDs = result['Value'] conds = [] for key, value in conditions.iteritems(): if value: conds.append('%s = %s' % (key, value)) self.log.verbose('%s request(s) selected with conditions %s and limit %s' % (len(requestIDs['Records']), ', '.join(conds), limit)) if printOutput: requests = [] if len(requestIDs['Records']) > limit: requestList = requestIDs['Records'] requests = requestList[:limit] else: requests = requestIDs['Records'] print '%s request(s) selected with conditions %s and limit %s' % (len(requestIDs['Records']), ', '.join(conds), limit) print requestIDs['ParameterNames'] for request in requests: print request if not requestIDs: return S_ERROR('No requests selected for conditions: %s' % conditions) else: return result ############################################################################# def getRequestSummary(self, printOutput=False): """ Get a summary of the requests in the request DB. """ requestClient = RPCClient("RequestManagement/centralURL", timeout=120) result = requestClient.getDBSummary() if not result['OK']: self.log.warn(result['Message']) return result if printOutput: print self.pPrint.pformat(result['Value']) return result ############################################################################# def getExternalPackageVersions(self): """ Simple function that attempts to obtain the external versions for the local DIRAC installation (frequently needed for debugging purposes). """ gLogger.info('DIRAC version v%dr%d build %d' % (DIRAC.majorVersion, DIRAC.minorVersion, DIRAC.patchLevel)) try: import lcg_util # pylint: disable=import-error infoStr = 'Using lcg_util from: \n%s' % lcg_util.__file__ gLogger.info(infoStr) infoStr = "The version of lcg_utils is %s" % lcg_util.lcg_util_version() gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import lcg_util: %s" % (x) gLogger.exception(errStr) try: import gfalthr as gfal # pylint: disable=import-error infoStr = "Using gfalthr from: \n%s" % gfal.__file__ gLogger.info(infoStr) infoStr = "The version of gfalthr is %s" % gfal.gfal_version() gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import gfalthr: %s." % (x) gLogger.warn(errStr) try: import gfal # pylint: disable=import-error infoStr = "Using gfal from: %s" % gfal.__file__ gLogger.info(infoStr) infoStr = "The version of gfal is %s" % gfal.gfal_version() gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import gfal: %s" % (x) gLogger.exception(errStr) defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', []) gLogger.info('Default list of protocols are: %s' % (', '.join(defaultProtocols))) return S_OK() ############################################################################# def getSiteProtocols(self, site, printOutput=False): """ Allows to check the defined protocols for each site SE. """ result = self.__checkSiteIsValid(site) if not result['OK']: return result siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site) siteSEs = gConfig.getValue(siteSection, []) if not siteSEs: return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection)) defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', []) self.log.verbose('Default list of protocols are' ', '.join(defaultProtocols)) seInfo = {} siteSEs.sort() for se in siteSEs: sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se)) if not sections['OK']: return sections for section in sections['Value']: if gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2': path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section) seProtocols = gConfig.getValue(path, []) if not seProtocols: seProtocols = defaultProtocols seInfo[se] = seProtocols if printOutput: print '\nSummary of protocols for StorageElements at site %s' % site print '\nStorageElement'.ljust(30) + 'ProtocolsList'.ljust(30) + '\n' for se, protocols in seInfo.iteritems(): print se.ljust(30) + ', '.join(protocols).ljust(30) return S_OK(seInfo) ############################################################################# def setSiteProtocols(self, site, protocolsList, printOutput=False): """ Allows to set the defined protocols for each SE for a given site. """ result = self.__checkSiteIsValid(site) if not result['OK']: return result siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site) siteSEs = gConfig.getValue(siteSection, []) if not siteSEs: return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection)) defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', []) self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols)) for protocol in protocolsList: if protocol not in defaultProtocols: return S_ERROR('Requested to set protocol %s in list but %s is not ' 'in default list of protocols:\n%s' % (protocol, protocol, ', '.join(defaultProtocols))) modifiedCS = False result = promptUser('Do you want to add the following default protocols:' ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs))) if not result['OK']: return result if result['Value'].lower() != 'y': self.log.always('No protocols will be added') return S_OK() for se in siteSEs: sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se)) if not sections['OK']: return sections for section in sections['Value']: if gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2': path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section) self.log.verbose('Setting %s to %s' % (path, ', '.join(protocolsList))) result = self.csSetOption(path, ', '.join(protocolsList)) if not result['OK']: return result modifiedCS = True if modifiedCS: result = self.csCommitChanges(False) if not result['OK']: return S_ERROR('CS Commit failed with message = %s' % (result['Message'])) else: if printOutput: print 'Successfully committed changes to CS' else: if printOutput: print 'No modifications to CS required' return S_OK() ############################################################################# def csSetOption(self, optionPath, optionValue): """ Function to modify an existing value in the CS. """ return self.csAPI.setOption(optionPath, optionValue) ############################################################################# def csSetOptionComment(self, optionPath, comment): """ Function to modify an existing value in the CS. """ return self.csAPI.setOptionComment(optionPath, comment) ############################################################################# def csModifyValue(self, optionPath, newValue): """ Function to modify an existing value in the CS. """ return self.csAPI.modifyValue(optionPath, newValue) ############################################################################# def csRegisterUser(self, username, properties): """ Registers a user in the CS. - username: Username of the user (easy;) - properties: Dict containing: - DN - groups : list/tuple of groups the user belongs to - <others> : More properties of the user, like mail """ return self.csAPI.addUser(username, properties) ############################################################################# def csDeleteUser(self, user): """ Deletes a user from the CS. Can take a list of users """ return self.csAPI.deleteUsers(user) ############################################################################# def csModifyUser(self, username, properties, createIfNonExistant=False): """ Modify a user in the CS. Takes the same params as in addUser and applies the changes """ return self.csAPI.modifyUser(username, properties, createIfNonExistant) ############################################################################# def csListUsers(self, group=False): """ Lists the users in the CS. If no group is specified return all users. """ return self.csAPI.listUsers(group) ############################################################################# def csDescribeUsers(self, mask=False): """ List users and their properties in the CS. If a mask is given, only users in the mask will be returned """ return self.csAPI.describeUsers(mask) ############################################################################# def csModifyGroup(self, groupname, properties, createIfNonExistant=False): """ Modify a user in the CS. Takes the same params as in addGroup and applies the changes """ return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant) ############################################################################# def csListHosts(self): """ Lists the hosts in the CS """ return self.csAPI.listHosts() ############################################################################# def csDescribeHosts(self, mask=False): """ Gets extended info for the hosts in the CS """ return self.csAPI.describeHosts(mask) ############################################################################# def csModifyHost(self, hostname, properties, createIfNonExistant=False): """ Modify a host in the CS. Takes the same params as in addHost and applies the changes """ return self.csAPI.modifyHost(hostname, properties, createIfNonExistant) ############################################################################# def csListGroups(self): """ Lists groups in the CS """ return self.csAPI.listGroups() ############################################################################# def csDescribeGroups(self, mask=False): """ List groups and their properties in the CS. If a mask is given, only groups in the mask will be returned """ return self.csAPI.describeGroups(mask) ############################################################################# def csSyncUsersWithCFG(self, usersCFG): """ Synchronize users in cfg with its contents """ return self.csAPI.syncUsersWithCFG(usersCFG) ############################################################################# def csCommitChanges(self, sortUsers=True): """ Commit the changes in the CS """ return self.csAPI.commitChanges(sortUsers=False) ############################################################################# def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False): """ Send mail to specified address with body. """ notification = NotificationClient() return notification.sendMail(address, subject, body, fromAddress, localAttempt, html) ############################################################################# def sendSMS(self, userName, body, fromAddress=None): """ Send mail to specified address with body. """ if len(body) > 160: return S_ERROR('Exceeded maximum SMS length of 160 characters') notification = NotificationClient() return notification.sendSMS(userName, body, fromAddress) ############################################################################# def getBDIISite(self, site, host=None): """ Get information about site from BDII at host """ return ldapSite(site, host=host) ############################################################################# def getBDIICluster(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapCluster(ce, host=host) ############################################################################# def getBDIICE(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapCE(ce, host=host) ############################################################################# def getBDIIService(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapService(ce, host=host) ############################################################################# def getBDIICEState(self, ce, useVO=voName, host=None): """ Get information about ce state from BDII at host """ return ldapCEState(ce, useVO, host=host) ############################################################################# def getBDIICEVOView(self, ce, useVO=voName, host=None): """ Get information about ce voview from BDII at host """ return ldapCEVOView(ce, useVO, host=host) ############################################################################# def getBDIISE(self, site, useVO=voName, host=None): """ Get information about SA from BDII at host """ return ldapSE(site, useVO, host=host)
class Matcher(object): """ Logic for matching """ def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None): """ c'tor """ if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() self.log = gLogger.getSubLogger("Matcher") self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper) self.siteClient = SiteStatus() def selectJob(self, resourceDescription, credDict): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self._getResourceDict(resourceDescription, credDict) # Make a nice print of the resource matching parameters toPrintDict = dict(resourceDict) if "MaxRAM" in resourceDescription: toPrintDict['MaxRAM'] = resourceDescription['MaxRAM'] if "NumberOfProcessors" in resourceDescription: toPrintDict['NumberOfProcessors'] = resourceDescription[ 'NumberOfProcessors'] toPrintDict['Tag'] = [] if "Tag" in resourceDict: for tag in resourceDict['Tag']: if not tag.endswith('GB') and not tag.endswith('Processors'): toPrintDict['Tag'].append(tag) if not toPrintDict['Tag']: toPrintDict.pop('Tag') gLogger.info('Resource description for matching', printDict(toPrintDict)) negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site']) result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond) if not result['OK']: raise RuntimeError(result['Message']) result = result['Value'] if not result['matchFound']: self.log.info("No match found") return {} jobID = result['jobId'] resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status']) if not resAtt['OK']: raise RuntimeError('Could not retrieve job attributes') if not resAtt['Value']: raise RuntimeError("No attributes returned for job") if not resAtt['Value']['Status'] == 'Waiting': self.log.error('Job matched by the TQ is not in Waiting state', str(jobID)) result = self.tqDB.deleteJob(jobID) if not result['OK']: raise RuntimeError(result['Message']) raise RuntimeError("Job %s is not in Waiting state" % str(jobID)) self._reportStatus(resourceDict, jobID) result = self.jobDB.getJobJDL(jobID) if not result['OK']: raise RuntimeError("Failed to get the job JDL") resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime self.log.info("Match time: [%s]" % str(matchTime)) gMonitor.addMark("matchTime", matchTime) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters(jobID) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup']) if not resAtt['OK']: raise RuntimeError('Could not retrieve job attributes') if not resAtt['Value']: raise RuntimeError('No attributes returned for job') if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True): self.limiter.updateDelayCounters(resourceDict['Site'], jobID) pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag', False) if not pilotInfoReportedFlag: self._updatePilotInfo(resourceDict) self._updatePilotJobMapping(resourceDict, jobID) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = True return resultDict def _getResourceDict(self, resourceDescription, credDict): """ from resourceDescription to resourceDict (just various mods) """ resourceDict = self._processResourceDescription(resourceDescription) resourceDict = self._checkCredentials(resourceDict, credDict) self._checkPilotVersion(resourceDict) if not self._checkMask(resourceDict): # Banned destinations can only take Test jobs resourceDict['JobType'] = 'Test' self.log.verbose("Resource description:") for key in resourceDict: self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key])) return resourceDict def _processResourceDescription(self, resourceDescription): """ Check and form the resource description dictionary resourceDescription is a ceDict coming from a JobAgent, for example. """ resourceDict = {} for name in singleValueDefFields: if resourceDescription.has_key(name): resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] for name in tagMatchFields: if name in resourceDescription and resourceDescription[name]: resourceDict[name] = resourceDescription[name] rname = 'Required%s' % name if rname in resourceDescription: resourceDict[rname] = resourceDescription[rname] if 'JobID' in resourceDescription: resourceDict['JobID'] = resourceDescription['JobID'] # Convert MaxRAM and NumberOfProcessors parameters into a list of tags maxRAM = resourceDescription.get('MaxRAM') if maxRAM: try: maxRAM = int(maxRAM) / 1000 except ValueError: maxRAM = None nProcessors = resourceDescription.get('NumberOfProcessors') if nProcessors: try: nProcessors = int(nProcessors) except ValueError: nProcessors = None for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]: if param and param <= 128: paramList = range(2, param + 1) paramTags = ['%d%s' % (par, key) for par in paramList] if paramTags: resourceDict.setdefault("Tag", []).extend(paramTags) if "WholeNode" in resourceDescription: resourceDict.setdefault("Tag", []).append("WholeNode") if 'Tag' in resourceDict: resourceDict['Tag'] = list(set(resourceDict['Tag'])) for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization', 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag'): if k in resourceDescription: resourceDict[k] = resourceDescription[k] return resourceDict def _reportStatus(self, resourceDict, jobID): """ Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'] attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']] result = self.jobDB.setJobAttributes(jobID, attNames, attValues) if not result['OK']: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % (jobID, result['Message'])) else: self.log.verbose("Set job attributes for jobID %s" % jobID) result = self.jlDB.addLoggingRecord(jobID, status='Matched', minor='Assigned', source='Matcher') if not result['OK']: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message'])) else: self.log.verbose("Added logging record for jobID %s" % jobID) def _checkMask(self, resourceDict): """ Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? """ if not 'Site' in resourceDict: self.log.error("Missing Site Name in Resource JDL") raise RuntimeError("Missing Site Name in Resource JDL") # Get common site mask and check the agent site result = self.siteClient.getSites(siteState='Active') if not result['OK']: self.log.error("Internal error", "getSiteMask: %s" % result['Message']) raise RuntimeError("Internal error") maskList = result['Value'] if resourceDict['Site'] not in maskList: return False return True def _updatePilotInfo(self, resourceDict): """ Update pilot information - do not fail if we don't manage to do it """ pilotReference = resourceDict.get('PilotReference', '') if pilotReference: gridCE = resourceDict.get('GridCE', 'Unknown') site = resourceDict.get('Site', 'Unknown') benchmark = resourceDict.get('PilotBenchmark', 0.0) self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % (pilotReference, gridCE, site, benchmark)) result = self.pilotAgentsDB.setPilotStatus(pilotReference, status='Running', gridSite=site, destination=gridCE, benchmark=benchmark) if not result['OK']: self.log.warn( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message'])) def _updatePilotJobMapping(self, resourceDict, jobID): """ Update pilot to job mapping information """ pilotReference = resourceDict.get('PilotReference', '') if pilotReference: result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID) if not result['OK']: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message'])) result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False) if not result['OK']: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message'])) def _checkCredentials(self, resourceDict, credDict): """ Check if we can get a job given the passed credentials """ if Properties.GENERIC_PILOT in credDict['properties']: # You can only match groups in the same VO if credDict['group'] == "hosts": # for the host case the VirtualOrganization parameter # is mandatory in resourceDict vo = resourceDict.get('VirtualOrganization', '') else: vo = Registry.getVOForGroup(credDict['group']) result = Registry.getGroupsForVO(vo) if result['OK']: resourceDict['OwnerGroup'] = result['Value'] else: raise RuntimeError(result['Message']) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict['properties']: self.log.notice( "Setting the resource DN to the credentials DN") resourceDict['OwnerDN'] = credDict['DN'] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict['properties']: resourceDict['OwnerGroup'] = credDict['group'] self.log.notice( "Setting the resource group to the credentials group") if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN'] != credDict['DN']: ownerDN = resourceDict['OwnerDN'] result = Registry.getGroupsForDN(resourceDict['OwnerDN']) if not result['OK']: raise RuntimeError(result['Message']) if credDict['group'] not in result['Value']: # DN is not in the same group! bad boy. self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN) resourceDict['OwnerDN'] = credDict['DN'] # Nothing special, group and DN have to be the same else: resourceDict['OwnerDN'] = credDict['DN'] resourceDict['OwnerGroup'] = credDict['group'] return resourceDict def _checkPilotVersion(self, resourceDict): """ Check the pilot DIRAC version """ if self.opsHelper.getValue("Pilot/CheckVersion", True): if 'ReleaseVersion' not in resourceDict: if not 'DIRACVersion' in resourceDict: raise RuntimeError( 'Version check requested and not provided by Pilot') else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.opsHelper.getValue("Pilot/Version", []) if validVersions and pilotVersion not in validVersions: raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \ ( pilotVersion, ",".join( validVersions ) ) ) # Check project if requested validProject = self.opsHelper.getValue("Pilot/Project", "") if validProject: if 'ReleaseProject' not in resourceDict: raise RuntimeError( "Version check requested but expected project %s not received" % validProject) if resourceDict['ReleaseProject'] != validProject: raise RuntimeError( "Version check requested \ but expected project %s != received %s" % (validProject, resourceDict['ReleaseProject']))