def setUp(self):
    """
    Prepare logging, the Proxy configuration and a Proxy instance for the tests.

    Log records are sent to ``proxy_unittests.log``; the configuration
    dictionary is kept in ``self.dict`` so individual tests can reuse it.
    """
    log_settings = {
        'level': logging.DEBUG,
        'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        'datefmt': '%m-%d %H:%M',
        'filename': 'proxy_unittests.log',
        'filemode': 'w',
    }
    logging.basicConfig(**log_settings)
    self.logger = logging.getLogger('ProxyTest')

    # group, role, myProxySvr and uiPath are module-level test settings
    proxy_config = {}
    proxy_config['logger'] = self.logger
    proxy_config['vo'] = 'cms'
    proxy_config['group'] = group
    proxy_config['role'] = role
    proxy_config['myProxySvr'] = myProxySvr
    proxy_config['proxyValidity'] = '192:00'
    proxy_config['min_time_left'] = 36000
    proxy_config['uisource'] = uiPath
    self.dict = proxy_config

    self.proxyPath = None
    self.proxy = Proxy(self.dict)
def __init__(self, config):
    """
    Initialize properties specified from config.

    :param config: agent configuration; the AnalyticsDataCollector and
        AgentStatusWatcher sections are read here.
    """
    BaseWorkerThread.__init__(self)
    # set the workqueue service for REST call
    self.config = config
    # need to get campaign, user, owner info
    self.agentInfo = initAgentInfo(self.config)
    self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

    self.proxy = Proxy({'logger': logging.getLogger(), 'cleanEnvironment': True})
    self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
    self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
    # credential lifetime warning/error thresholds, in days
    self.credThresholds = {
        'proxy': {'error': 3, 'warning': 5},
        'certificate': {'error': 10, 'warning': 20},
    }

    # Monitoring setup
    watcherCfg = config.AgentStatusWatcher
    self.userAMQ = getattr(watcherCfg, "userAMQ", None)
    self.passAMQ = getattr(watcherCfg, "passAMQ", None)
    self.postToAMQ = getattr(watcherCfg, "enableAMQ", False)
    self.topicAMQ = getattr(watcherCfg, "topicAMQ", None)
    self.hostPortAMQ = getattr(watcherCfg, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

    # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
    self.isT0agent = hasattr(self.config, "Tier0Feeder")
    if self.isT0agent:
        self.producer = "tier0wmagent"
    else:
        self.producer = "wmagent"
        self.workqueueDS = WorkQueueDS(config.AnalyticsDataCollector.localQueueURL)
def __init__(self, config):
    """
    Set up the plugin from the agent configuration.

    Reads the Agent, JobSubmitter and BossAir sections, locates Unpacker.py,
    builds the default condor requirements string and collects the x509
    proxy file name, subject and attributes.
    """
    BasePlugin.__init__(self, config)

    self.locationDict = {}

    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    self.locationAction = daoFactory(classname="Locations.GetSiteInfo")

    self.packageDir = None

    # Prefer the source-tree layout, otherwise use the installed layout
    srcUnpacker = os.path.join(getWMBASE(), 'src/python/WMCore/WMRuntime/Unpacker.py')
    if os.path.exists(srcUnpacker):
        self.unpacker = srcUnpacker
    else:
        self.unpacker = os.path.join(getWMBASE(), 'WMCore/WMRuntime/Unpacker.py')

    self.agent = getattr(config.Agent, 'agentName', 'WMAgent')
    self.sandbox = None

    submitCfg = config.JobSubmitter
    bossAirCfg = config.BossAir
    self.scriptFile = submitCfg.submitScript
    self.defaultTaskPriority = getattr(bossAirCfg, 'defaultTaskPriority', 0)
    self.maxTaskPriority = getattr(bossAirCfg, 'maxTaskPriority', 1e7)
    self.jobsPerSubmit = getattr(submitCfg, 'jobsPerSubmit', 200)
    self.extraMem = getattr(submitCfg, 'extraMemoryPerCore', 500)

    # Required for global pool accounting
    self.acctGroup = getattr(bossAirCfg, 'acctGroup', "production")
    self.acctGroupUser = getattr(bossAirCfg, 'acctGroupUser', "cmsdataops")

    # Build a requirement string.  All CMS resources match DESIRED_Sites on the START
    # expression side; however, there are currently some resources (T2_CH_CERN_HLT)
    # that are missing the REQUIRED_OS logic.  Hence, we duplicate it here.
    # TODO(bbockelm): Remove reqStr once HLT has upgraded.
    defaultReqStr = ('((REQUIRED_OS=?="any") || '
                     '(GLIDEIN_REQUIRED_OS =?= "any") || '
                     'stringListMember(GLIDEIN_REQUIRED_OS, REQUIRED_OS)) && '
                     '(AuthenticatedIdentity =!= "*****@*****.**")')
    # A requirements string given in the configuration replaces the default
    self.reqStr = getattr(bossAirCfg, 'condorRequirementsString', defaultReqStr)

    # x509 proxy handling
    credential = Proxy({'logger': myThread.logger})
    self.x509userproxy = credential.getProxyFilename()
    self.x509userproxysubject = credential.getSubject()
    self.x509userproxyfqan = credential.getAttributeFromProxy(self.x509userproxy)
    # Remove the x509 ads if the job is matching a volunteer resource
    self.x509Expr = 'ifThenElse("$$(GLIDEIN_CMSSite)" =?= "T3_CH_Volunteer",undefined,"%s")'
def get_proxy(self, ad):
    """
    Retrieve a user proxy through MyProxy for the job described by ``ad``.

    :param ad: job ad; CRAB_UserDN is required, CRAB_UserVO, CRAB_UserGroup
        and CRAB_UserRole are optional.
    :return: the proxy file name.
    :raises Exception: when the proxy has no time left after the renewal.
    """
    result = None
    vo, group, role = 'cms', '', ''

    def isDefined(attr):
        # the attribute must be present, non-empty and not a classad Undefined
        return attr in ad and ad[attr] and ad[attr] != classad.Value.Undefined

    if 'CRAB_UserVO' in ad and ad['CRAB_UserVO']:
        vo = ad['CRAB_UserVO']
    if isDefined('CRAB_UserGroup'):
        group = ad['CRAB_UserGroup']
    if isDefined('CRAB_UserRole'):
        role = ad['CRAB_UserRole']

    myproxyCfg = self.config.MyProxy
    proxycfg = {
        'vo': vo,
        'logger': self.logger,
        'myProxySvr': self.config.Services.MyProxy,
        'myproxyAccount': self.config.TaskWorker.resturl,
        'proxyValidity': '144:0',
        'min_time_left': MINPROXYLENGTH,  ## do we need this ? or should we use self.myproxylen?
        'userDN': ad['CRAB_UserDN'],
        'group': group,
        'role': role,
        'server_key': myproxyCfg.serverhostkey,
        'server_cert': myproxyCfg.serverhostcert,
        'serverDN': myproxyCfg.serverdn,
        'uisource': getattr(myproxyCfg, 'uisource', ''),
        'credServerPath': myproxyCfg.credpath,
        'cleanEnvironment': getattr(myproxyCfg, 'cleanEnvironment', False),
    }

    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()
    timeleft = proxy.getTimeLeft(userproxy)
    if timeleft is not None and timeleft > 0:
        return userproxy

    self.logger.error("Impossible to retrieve proxy from %s for %s." %
                      (proxycfg['myProxySvr'], proxycfg['userDN']))
    raise Exception("Failed to retrieve proxy.")
def __init__(self, config):
    """
    Set up the plugin from the agent configuration.

    Reads the Agent, General, JobSubmitter and BossAir sections, locates
    Unpacker.py (host-visible copy when running in a container) and records
    the x509 proxy file name.
    """
    BasePlugin.__init__(self, config)

    self.locationDict = {}

    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    self.locationAction = daoFactory(classname="Locations.GetSiteInfo")

    self.packageDir = None

    # if agent is running in a container, Unpacker.py must come from a directory
    # on the host so the condor schedd can see it
    # config.General.workDir should always be bind mounted to the container
    if getattr(config.Agent, "isDocker", False):
        # join real path components instead of a single pre-concatenated string
        unpackerPath = os.path.join(config.General.workDir, "Docker/WMRuntime/Unpacker.py")
    else:
        unpackerPath = os.path.join(getWMBASE(), 'src/python/WMCore/WMRuntime/Unpacker.py')

    if os.path.exists(unpackerPath):
        self.unpacker = unpackerPath
    else:
        # fall back to the installed layout when the preferred path is missing
        self.unpacker = os.path.join(getWMBASE(), 'WMCore/WMRuntime/Unpacker.py')

    self.agent = getattr(config.Agent, 'agentName', 'WMAgent')
    self.sandbox = None

    self.scriptFile = config.JobSubmitter.submitScript

    self.defaultTaskPriority = getattr(config.BossAir, 'defaultTaskPriority', 0)
    self.maxTaskPriority = getattr(config.BossAir, 'maxTaskPriority', 1e7)
    self.jobsPerSubmit = getattr(config.JobSubmitter, 'jobsPerSubmit', 200)
    self.extraMem = getattr(config.JobSubmitter, 'extraMemoryPerCore', 500)

    # Required for global pool accounting
    self.acctGroup = getattr(config.BossAir, 'acctGroup', "production")
    self.acctGroupUser = getattr(config.BossAir, 'acctGroupUser', "cmsdataops")

    # No requirements string unless one is configured explicitly
    self.reqStr = getattr(config.BossAir, 'condorRequirementsString', None)

    # x509 proxy handling
    proxy = Proxy({'logger': myThread.logger})
    self.x509userproxy = proxy.getProxyFilename()
    # The proxy subject and FQAN ads are added now by the condor client
def testMyProxyEnvironment(self):
    """
    Test the myProxyEnvironment context manager

    A new Proxy is created and delegated to MyProxy, then the test verifies
    that X509_USER_PROXY is set only while the context manager is active.
    """
    myProxy = Proxy(self.dict)

    # Create the proxy
    myProxy.create()
    proxyPath = myProxy.getProxyFilename()
    userDN = myProxy.getSubject()
    self.assertTrue(os.path.exists(proxyPath))

    # Delegate and check the proxy
    myProxy.delegate(credential=proxyPath, serverRenewer=True)
    self.assertTrue(myProxy.checkMyProxy())

    # Make sure X509_USER_PROXY exists only in the context manager and corresponds to a file
    envVar = 'X509_USER_PROXY'
    os.environ.pop(envVar, None)
    self.assertNotIn(envVar, os.environ)
    with myProxyEnvironment(userDN=userDN, serverCert=serverCert, serverKey=serverKey,
                            myproxySrv='myproxy.cern.ch', proxyDir='/tmp/', logger=self.logger):
        self.assertIn(envVar, os.environ)
        self.assertTrue(os.path.exists(os.environ[envVar]))
    self.assertNotIn(envVar, os.environ)
def query_database(self):
    """
    Query DBS for the dataset and build a DatasetInfo from the answer.

    For lumi-based datasets a cached result is returned when available and
    a freshly built result is stored in the cache.

    :return: DatasetInfo with per-file events, sizes and lumis filled in.
    :raises ValueError: when DBS returns no summary for the dataset.
    """
    cred = Proxy({'logger': logging.getLogger("WMCore")})
    dbs = DASWrapper(self.dbs_instance, ca_info=cred.getProxyFilename())

    baseinfo = dbs.listFileSummaries(dataset=self.dataset)
    if baseinfo is None or (len(baseinfo) == 1 and baseinfo[0] is None):
        raise ValueError('unable to retrive information for dataset {}'.format(self.dataset))

    if not self.file_based:
        result = self.__cache.cached(self.dataset, self.lumi_mask, baseinfo)
        if result:
            return result

    total_lumis = sum(summary['num_lumi'] for summary in baseinfo)

    result = DatasetInfo()
    result.total_events = sum(summary['num_event'] for summary in baseinfo)

    for entry in dbs.listFiles(dataset=self.dataset, detail=True):
        lfn = entry['logical_file_name']
        result.files[lfn].events = entry['event_count']
        result.files[lfn].size = entry['file_size']

    if self.file_based:
        # file-based datasets get one placeholder lumi per file
        for entry in dbs.listFiles(dataset=self.dataset):
            result.files[entry['logical_file_name']].lumis = [(-2, -2)]
    else:
        blocks = dbs.listBlocks(dataset=self.dataset)
        if self.lumi_mask:
            unmasked_lumis = LumiList(filename=self.lumi_mask)
        for block in blocks:
            for run in dbs.listFileLumis(block_name=block['block_name']):
                lfn = run['logical_file_name']
                for lumi in run['lumi_section_num']:
                    unit = (run['run_num'], lumi)
                    if not self.lumi_mask or (unit in unmasked_lumis):
                        result.files[lfn].lumis.append(unit)
                    elif self.lumi_mask and (unit not in unmasked_lumis):
                        result.masked_units += 1

    result.unmasked_units = sum(len(f.lumis) for f in result.files.values())
    result.total_units = result.unmasked_units + result.masked_units

    if not self.file_based:
        self.__cache.cache(self.dataset, self.lumi_mask, baseinfo, result)

    # More (run, lumi, file) units than unique lumis means lumis are split across files
    result.stop_on_file_boundary = (result.total_units != total_lumis) and not self.file_based
    if result.stop_on_file_boundary:
        logger.debug("split lumis detected in {} - "
                     "{} unique (run, lumi) but "
                     "{} unique (run, lumi, file) - "
                     "enforcing a limit of one file per task".format(self.dataset,
                                                                     total_lumis,
                                                                     result.total_units))

    return result
def __init__(self, config):
    """
    Set up the plugin from the agent configuration.

    Reads the Agent, JobSubmitter and BossAir sections, locates Unpacker.py,
    builds the default condor requirements string and collects the x509
    proxy file name, subject and attributes.
    """
    BasePlugin.__init__(self, config)

    self.locationDict = {}

    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    self.locationAction = daoFactory(classname="Locations.GetSiteInfo")

    self.packageDir = None

    # Prefer the source-tree layout, otherwise use the installed layout
    srcUnpacker = os.path.join(getWMBASE(), 'src/python/WMCore/WMRuntime/Unpacker.py')
    if os.path.exists(srcUnpacker):
        self.unpacker = srcUnpacker
    else:
        self.unpacker = os.path.join(getWMBASE(), 'WMCore/WMRuntime/Unpacker.py')

    self.agent = getattr(config.Agent, 'agentName', 'WMAgent')
    self.sandbox = None

    submitCfg = config.JobSubmitter
    bossAirCfg = config.BossAir
    self.scriptFile = submitCfg.submitScript
    self.defaultTaskPriority = getattr(bossAirCfg, 'defaultTaskPriority', 0)
    self.maxTaskPriority = getattr(bossAirCfg, 'maxTaskPriority', 1e7)
    self.jobsPerSubmit = getattr(submitCfg, 'jobsPerSubmit', 200)
    self.extraMem = getattr(submitCfg, 'extraMemoryPerCore', 500)

    # Required for global pool accounting
    self.acctGroup = getattr(bossAirCfg, 'acctGroup', "production")
    self.acctGroupUser = getattr(bossAirCfg, 'acctGroupUser', "cmsdataops")

    # Build a requirement string.  All CMS resources match DESIRED_Sites on the START
    # expression side; however, there are currently some resources (T2_CH_CERN_HLT)
    # that are missing the REQUIRED_OS logic.  Hence, we duplicate it here.
    # TODO(bbockelm): Remove reqStr once HLT has upgraded.
    defaultReqStr = ('((REQUIRED_OS=?="any") || '
                     '(GLIDEIN_REQUIRED_OS =?= "any") || '
                     'stringListMember(GLIDEIN_REQUIRED_OS, REQUIRED_OS)) && '
                     '(AuthenticatedIdentity =!= "*****@*****.**")')
    # A requirements string given in the configuration replaces the default
    self.reqStr = getattr(bossAirCfg, 'condorRequirementsString', defaultReqStr)

    # x509 proxy handling
    credential = Proxy({'logger': myThread.logger})
    self.x509userproxy = credential.getProxyFilename()
    self.x509userproxysubject = credential.getSubject()
    self.x509userproxyfqan = credential.getAttributeFromProxy(self.x509userproxy)
    # Remove the x509 ads if the job is matching a volunteer resource
    self.x509Expr = 'ifThenElse("$$(GLIDEIN_CMSSite)" =?= "T3_CH_Volunteer",undefined,"%s")'
def validate(self):
    """
    Check that the dataset is known, either locally or to DBS.

    :return: True when the dataset is already registered or DBS returns a
        file summary for it, False otherwise.
    """
    if self.dataset in Dataset.__dsets:
        return True

    if self.lumi_mask:
        self.lumi_mask = self.__get_mask(self.lumi_mask)

    credential = Proxy({'logger': logging.getLogger("WMCore")})
    dbs = DASWrapper(self.dbs_instance, ca_info=credential.getProxyFilename())

    summaries = dbs.listFileSummaries(dataset=self.dataset)
    # DBS signals an unknown dataset with None or with a single None entry
    unknown = summaries is None or (len(summaries) == 1 and summaries[0] is None)
    return not unknown
def __call__(self):
    """
    Query the server for the detailed status of the cached task and log it.

    :raises RESTCommunicationException: when the server does not answer
        with HTTP status 200.
    """
    taskname = self.cachedinfo["RequestName"]
    server = HTTPRequests(self.serverurl, self.proxyfilename)

    self.logger.debug("Looking up detailed status of task %s" % taskname)
    dictresult, status, reason = server.get(self.uri, data={"workflow": taskname})

    # Check the HTTP status before indexing into the payload: a failed
    # request is not guaranteed to carry a "result" entry.
    if status != 200:
        msg = "Problem retrieving status:\ninput:%s\noutput:%s\nreason:%s" % (
            str(taskname),
            str(dictresult),
            str(reason),
        )
        raise RESTCommunicationException(msg)

    dictresult = dictresult["result"][0]  # take just the significant part

    self.logger.debug(dictresult)  # should be something like {u'result': [[123, u'ciao'], [456, u'ciao']]}

    self.logger.info("Task name:\t\t\t%s" % taskname)
    self.logger.info("Task status:\t\t\t%s" % dictresult["status"])

    # Print the url of the panda monitor
    if dictresult["taskFailureMsg"]:
        self.logger.error(
            "%sError during task injection:%s\t%s" % (colors.RED, colors.NORMAL, dictresult["taskFailureMsg"])
        )
    elif dictresult["jobSetID"]:
        p = Proxy({"logger": self.logger})
        username = urllib.quote(p.getUserName())
        self.logger.info(
            "Panda url:\t\t\thttp://panda.cern.ch/server/pandamon/query?job=*&jobsetID=%s&user=%s"
            % (dictresult["jobSetID"], username)
        )

    if dictresult["jobdefErrors"]:
        self.logger.error(
            "%sSubmission partially failed:%s\t%s jobgroup not submittet out of %s:"
            % (colors.RED, colors.NORMAL, dictresult["failedJobdefs"], dictresult["totalJobdefs"])
        )
        for error in dictresult["jobdefErrors"]:
            self.logger.info("\t%s" % error)

    # Print information about jobs
    states = dictresult["jobsPerStatus"]
    total = sum(states[st] for st in states)
    # use a dedicated loop name so the HTTP status above is not shadowed
    frmt = "".join(
        state + " %s\t" % self._percentageString(states[state], total) for state in states
    )
    if frmt:
        self.logger.info("Details:\t\t\t%s" % frmt)
class Proxy(Configurable):
    """
    Wrapper around CMS credentials.

    On construction (and on unpickling) the proxy lifetime is checked; a
    proxy with less than four hours left is renewed when `renew` is set,
    and `X509_USER_PROXY` is pointed at the proxy file.

    Parameters
    ----------
        renew : bool
            Whether to renew proxy.  Defaults to `True`.
    """

    _mutable = {}

    def __init__(self, renew=True):
        self.renew = renew
        self.__proxy = WMProxy({'logger': logging.getLogger("WMCore"), 'proxyValidity': '192:00'})
        self.__setup()

    def __setup(self):
        """
        Export a usable proxy through X509_USER_PROXY, renewing if allowed.

        Raises AttributeError when the proxy is too short-lived and cannot
        (or may not) be renewed.
        """
        if self.check() and self.__proxy.getTimeLeft() > 4 * 3600:
            # keep an X509_USER_PROXY that the environment already provides
            if 'X509_USER_PROXY' not in os.environ:
                os.environ['X509_USER_PROXY'] = self.__proxy.getProxyFilename()
        elif self.renew:
            self.__proxy.renew()
            if self.__proxy.getTimeLeft() < 4 * 3600:
                raise AttributeError("could not renew proxy")
            os.environ['X509_USER_PROXY'] = self.__proxy.getProxyFilename()
        else:
            raise AttributeError("please renew or disable your proxy")

    def __getstate__(self):
        # the wrapped WMCore proxy is dropped here and rebuilt in __setstate__
        state = dict(self.__dict__)
        del state['_Proxy__proxy']
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        with PartiallyMutable.unlock():
            self.__proxy = WMProxy({'logger': logging.getLogger("WMCore"), 'proxyValidity': '192:00'})
            self.__setup()

    def check(self):
        """
        Return True when the proxy has time left, warning below four hours.

        A lifetime of 0 (or None) is reported as False.
        """
        left = self.__proxy.getTimeLeft()
        if not left:
            return False
        if left < 4 * 3600:
            # split into whole hours and the remaining minutes of the last hour
            hours, seconds = divmod(int(left), 3600)
            logger.warning("only {0}:{1:02} left in proxy lifetime!".format(hours, seconds // 60))
        return True

    def expires(self):
        """Return the expiration time as seconds since the epoch."""
        return int(time.time()) + self.__proxy.getTimeLeft()

    def time_left(self):
        """Return the remaining proxy lifetime as reported by the wrapped proxy."""
        return self.__proxy.getTimeLeft()
def __call__(self):
    """
    Query the server for the detailed status of the cached task and log it.

    :raises RESTCommunicationException: when the server does not answer
        with HTTP status 200.
    """
    taskname = self.cachedinfo['RequestName']
    server = HTTPRequests(self.serverurl, self.proxyfilename)

    self.logger.debug('Looking up detailed status of task %s' % taskname)
    dictresult, status, reason = server.get(self.uri, data={'workflow': taskname})

    # Check the HTTP status before indexing into the payload: a failed
    # request is not guaranteed to carry a 'result' entry.
    if status != 200:
        msg = "Problem retrieving status:\ninput:%s\noutput:%s\nreason:%s" % (
            str(taskname), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    dictresult = dictresult['result'][0]  # take just the significant part

    self.logger.debug(dictresult)  # should be something like {u'result': [[123, u'ciao'], [456, u'ciao']]}

    self.logger.info("Task name:\t\t\t%s" % taskname)
    self.logger.info("Task status:\t\t\t%s" % dictresult['status'])

    # Print the url of the panda monitor
    if dictresult['taskFailureMsg']:
        self.logger.error("%sError during task injection:%s\t%s" %
                          (colors.RED, colors.NORMAL, dictresult['taskFailureMsg']))
    elif dictresult['jobSetID']:
        p = Proxy({'logger': self.logger})
        username = urllib.quote(p.getUserName())
        self.logger.info(
            "Panda url:\t\t\thttp://panda.cern.ch/server/pandamon/query?job=*&jobsetID=%s&user=%s"
            % (dictresult['jobSetID'], username))

    if dictresult['jobdefErrors']:
        self.logger.error("%sSubmission partially failed:%s\t%s jobgroup not submittet out of %s:" %
                          (colors.RED, colors.NORMAL,
                           dictresult['failedJobdefs'], dictresult['totalJobdefs']))
        for error in dictresult['jobdefErrors']:
            self.logger.info("\t%s" % error)

    # Print information about jobs
    states = dictresult['jobsPerStatus']
    total = sum(states[st] for st in states)
    # use a dedicated loop name so the HTTP status above is not shadowed
    frmt = ''.join(state + ' %s\t' % self._percentageString(states[state], total)
                   for state in states)
    if frmt:
        self.logger.info('Details:\t\t\t%s' % frmt)
def getProxy(userdn, group, role, defaultDelegation, logger):
    """
    _getProxy_

    Return ``(True, proxyPath)`` when a usable proxy is available, either
    already on disk with more than an hour left or after a MyProxy renewal,
    and ``(False, None)`` otherwise.

    :param userdn: user DN, used here only for the log message.
    :param group: not used by this implementation.
    :param role: not used by this implementation.
    :param defaultDelegation: configuration dictionary handed to Proxy.
    :param logger: logger receiving the debug message.
    """
    # log through the logger that was passed in
    logger.debug("Retrieving proxy for %s" % userdn)

    proxy = Proxy(defaultDelegation)
    proxyPath = proxy.getProxyFilename(True)
    timeleft = proxy.getTimeLeft(proxyPath)
    if timeleft is not None and timeleft > 3600:
        return (True, proxyPath)

    # the local proxy is missing or about to expire: renew it from MyProxy
    proxyPath = proxy.logonRenewMyProxy()
    timeleft = proxy.getTimeLeft(proxyPath)
    if timeleft is not None and timeleft > 0:
        return (True, proxyPath)
    return (False, None)
def createNewMyProxy(self, timeleftthreshold=0, nokey=False):
    """
    Handles the MyProxy creation

    Let the following variables be

    timeleftthreshold: the proxy in myproxy should be delegated for at least this time (14 days)
    myproxytimeleft: current validity of your proxy in myproxy
    usercertDaysLeft: the number of days left before your user certificate expires
    myproxyDesiredValidity: delegate the proxy in myproxy for that time (30 days)

    If we need to renew the proxy in myproxy because its attributes have changed or because
    it is valid for less time than timeleftthreshold then we do it.

    Before doing that, we check when the user certificate is expiring. If it's within the
    timeleftthreshold (myproxytimeleft < timeleftthreshold) we delegate the proxy just for
    the time we need (checking first if we did not already do it since at some point
    usercertDaysLeft ~= myproxytimeleft and we don't need to delegate it at every command
    even though myproxytimeleft < timeleftthreshold).

    Note that a warning message is printed at every command if usercertDaysLeft < timeleftthreshold

    :raises ProxyCreationException: when the user certificate has expired or
        the delegation fails.
    """
    myproxy = Proxy(self.defaultDelegation)
    myproxy.userDN = myproxy.getSubjectFromCert(self.certLocation)

    self.logger.debug("Getting myproxy life time left for %s" % self.defaultDelegation["myProxySvr"])
    # return an integer that indicates the number of seconds to the expiration of the proxy in myproxy
    myproxytimeleft = myproxy.getMyProxyTimeLeft(serverRenewer=True, nokey=nokey)
    self.logger.debug("Myproxy is valid: %i" % myproxytimeleft)

    # list on the REST and on myproxy are different
    trustRetrListChanged = myproxy.trustedRetrievers != self.defaultDelegation['serverDN']
    if myproxytimeleft < timeleftthreshold or self.proxyChanged or trustRetrListChanged:
        # checking the enddate of the user certificate
        usercertDaysLeft = myproxy.getUserCertEnddate()
        if usercertDaysLeft == 0:
            msg = "%sYOUR USER CERTIFICATE IS EXPIRED (OR WILL EXPIRE TODAY). CANNOT SUBMIT%s" \
                  % (colors.RED, colors.NORMAL)
            raise ProxyCreationException(msg)

        # if the certificate is going to expire print a warning. This is going to be printed at
        # every command if the myproxytimeleft is inferior to the timeleftthreshold
        if usercertDaysLeft < self.myproxyDesiredValidity:
            self.logger.info("%sYour user certificate is going to expire in %s days. Please renew it! %s"
                             % (colors.RED, usercertDaysLeft, colors.NORMAL))
            # check if usercertDaysLeft ~= myproxytimeleft which means we already delegated the
            # proxy for as long as we could (less than one day between the two)
            if abs(usercertDaysLeft * 60 * 60 * 24 - myproxytimeleft) < 60 * 60 * 24 \
                    and not trustRetrListChanged:
                return
            # adjust the myproxy delegation time accordingly to the user cert validity
            self.logger.info("%sDelegating your proxy for %s days instead of %s %s"
                             % (colors.RED, usercertDaysLeft, self.myproxyDesiredValidity, colors.NORMAL))
            myproxy.myproxyValidity = "%i:00" % (usercertDaysLeft * 24)

        # creating the proxy
        self.logger.debug("Delegating a myproxy for %s hours" % self.defaultDelegation['myproxyValidity'])
        try:
            myproxy.delegate(serverRenewer=True, nokey=nokey)
            self.logger.debug("My-proxy delegated.")
        except Exception as ex:
            # not every exception carries a _message attribute; fall back to str()
            # so the original failure is reported instead of an AttributeError
            raise ProxyCreationException("Problems delegating My-proxy. %s"
                                         % getattr(ex, '_message', str(ex)))
def __init__(self, config):
    """
    Set up the plugin from the agent configuration.

    Reads the Agent, JobSubmitter and BossAir sections, locates Unpacker.py,
    builds the default condor requirements string and collects the x509
    proxy file name and subject.
    """
    BasePlugin.__init__(self, config)

    self.locationDict = {}

    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    self.locationAction = daoFactory(classname="Locations.GetSiteInfo")

    self.packageDir = None

    # Prefer the source-tree layout, otherwise use the installed layout
    srcUnpacker = os.path.join(getWMBASE(), 'src/python/WMCore/WMRuntime/Unpacker.py')
    if os.path.exists(srcUnpacker):
        self.unpacker = srcUnpacker
    else:
        self.unpacker = os.path.join(getWMBASE(), 'WMCore/WMRuntime/Unpacker.py')

    self.agent = getattr(config.Agent, 'agentName', 'WMAgent')
    self.sandbox = None

    submitCfg = config.JobSubmitter
    bossAirCfg = config.BossAir
    self.scriptFile = submitCfg.submitScript
    self.defaultTaskPriority = getattr(bossAirCfg, 'defaultTaskPriority', 0)
    self.maxTaskPriority = getattr(bossAirCfg, 'maxTaskPriority', 1e7)
    self.jobsPerSubmit = getattr(submitCfg, 'jobsPerSubmit', 200)

    # Required for global pool accounting
    self.acctGroup = getattr(bossAirCfg, 'acctGroup', "production")
    self.acctGroupUser = getattr(bossAirCfg, 'acctGroupUser', "cmsdataops")

    # Build a requirement string; a configured one replaces the default
    defaultReqStr = "stringListMember(GLIDEIN_CMSSite, DESIRED_Sites) && ((REQUIRED_OS=?=\"any\") || (GLIDEIN_REQUIRED_OS=?=REQUIRED_OS)) && (TARGET.Cpus >= RequestCpus)"
    self.reqStr = getattr(bossAirCfg, 'condorRequirementsString', defaultReqStr)

    # x509 proxy handling
    credential = Proxy({'logger': myThread.logger})
    self.x509userproxy = credential.getProxyFilename()
    self.x509userproxysubject = credential.getSubject()
def setUp(self):
    """
    Prepare logging, the Proxy configuration and a Proxy instance for the tests.

    Log records are sent to ``proxy_unittests.log``; the configuration
    dictionary is kept in ``self.dict`` so individual tests can reuse it.
    """
    log_settings = {
        "level": logging.DEBUG,
        "format": "%(asctime)s %(name)-12s %(levelname)-8s %(message)s",
        "datefmt": "%m-%d %H:%M",
        "filename": "proxy_unittests.log",
        "filemode": "w",
    }
    logging.basicConfig(**log_settings)
    self.logger = logging.getLogger("ProxyTest")

    # group, role, myProxySvr and uiPath are module-level test settings
    proxy_config = {}
    proxy_config["logger"] = self.logger
    proxy_config["vo"] = "cms"
    proxy_config["group"] = group
    proxy_config["role"] = role
    proxy_config["myProxySvr"] = myProxySvr
    proxy_config["proxyValidity"] = "192:00"
    proxy_config["min_time_left"] = 36000
    proxy_config["uisource"] = uiPath
    self.dict = proxy_config

    self.proxyPath = None
    self.proxy = Proxy(self.dict)
def proxy(self):
    """
    Build a Proxy from the default delegation settings and return it.

    :raises EnvironmentException: when the Proxy cannot be created because
        of a CredentialException.
    """
    try:
        proxy = Proxy(self.defaultDelegation)
    except CredentialException as ex:
        self.logger.debug(ex)
        # fall back to str(ex) if the exception carries no _message attribute
        raise EnvironmentException('Problem with Grid environment: %s ' %
                                   getattr(ex, '_message', str(ex)))
    # hand the object back to the caller instead of discarding it
    return proxy
def __init__(self, config):
    """
    Initialize properties specified from config.

    :param config: agent configuration; the AnalyticsDataCollector and
        AgentStatusWatcher sections are read here.
    """
    BaseWorkerThread.__init__(self)
    # set the workqueue service for REST call
    self.config = config
    # need to get campaign, user, owner info
    self.agentInfo = initAgentInfo(self.config)
    self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

    self.proxy = Proxy({'logger': logging.getLogger()})
    self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
    self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
    # credential lifetime warning/error thresholds, in days
    self.credThresholds = {
        'proxy': {'error': 3, 'warning': 5},
        'certificate': {'error': 10, 'warning': 20},
    }

    # Monitoring setup
    watcherCfg = config.AgentStatusWatcher
    self.userAMQ = getattr(watcherCfg, "userAMQ", None)
    self.passAMQ = getattr(watcherCfg, "passAMQ", None)
    self.postToAMQ = getattr(watcherCfg, "enableAMQ", False)
    self.topicAMQ = getattr(watcherCfg, "topicAMQ", None)
    self.hostPortAMQ = getattr(watcherCfg, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

    # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
    self.isT0agent = hasattr(self.config, "Tier0Feeder")
    if self.isT0agent:
        self.producer = "tier0wmagent"
    else:
        self.producer = "wmagent"
        self.workqueueDS = WorkQueueDS(config.AnalyticsDataCollector.localQueueURL)
def __init__(self, config):
    """
    Set up the plugin from the agent configuration.

    Reads the Agent, JobSubmitter and BossAir sections, locates Unpacker.py,
    builds the default condor requirements string and collects the x509
    proxy file name and subject.
    """
    BasePlugin.__init__(self, config)

    self.locationDict = {}

    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    self.locationAction = daoFactory(classname="Locations.GetSiteInfo")

    self.packageDir = None

    # Prefer the source-tree layout, otherwise use the installed layout
    srcUnpacker = os.path.join(getWMBASE(), 'src/python/WMCore/WMRuntime/Unpacker.py')
    if os.path.exists(srcUnpacker):
        self.unpacker = srcUnpacker
    else:
        self.unpacker = os.path.join(getWMBASE(), 'WMCore/WMRuntime/Unpacker.py')

    self.agent = getattr(config.Agent, 'agentName', 'WMAgent')
    self.sandbox = None

    submitCfg = config.JobSubmitter
    bossAirCfg = config.BossAir
    self.scriptFile = submitCfg.submitScript
    self.defaultTaskPriority = getattr(bossAirCfg, 'defaultTaskPriority', 0)
    self.maxTaskPriority = getattr(bossAirCfg, 'maxTaskPriority', 1e7)
    self.jobsPerSubmit = getattr(submitCfg, 'jobsPerSubmit', 200)
    self.extraMem = getattr(submitCfg, 'extraMemoryPerCore', 500)

    # Required for global pool accounting
    self.acctGroup = getattr(bossAirCfg, 'acctGroup', "production")
    self.acctGroupUser = getattr(bossAirCfg, 'acctGroupUser', "cmsdataops")

    # Build a requirement string; a configured one replaces the default
    defaultReqStr = ('stringListMember(GLIDEIN_CMSSite, DESIRED_Sites) '
                     '&& ((REQUIRED_OS=?="any") || stringListMember(GLIDEIN_REQUIRED_OS, REQUIRED_OS))'
                     '&& (TARGET.Cpus >= RequestCpus)')
    self.reqStr = getattr(bossAirCfg, 'condorRequirementsString', defaultReqStr)

    # x509 proxy handling
    credential = Proxy({'logger': myThread.logger})
    self.x509userproxy = credential.getProxyFilename()
    self.x509userproxysubject = credential.getSubject()
def proxy(self):
    """
    Build a Proxy from the default delegation settings and return it.

    :raises EnvironmentException: when the Proxy cannot be created because
        of a CredentialException.
    """
    try:
        return Proxy(self.defaultDelegation)
    except CredentialException as ex:
        self.logger.debug(ex)
        raise EnvironmentException('Problem with Grid environment: %s ' % str(ex))
class Proxy(object):
    """
    CMS uses proxies constantly. This class is a thin wrapper around the
    WMCore proxy handling, to allow the user to update/check/delete their
    proxy in myproxy and update/check the local proxy.
    """

    def __init__(self):
        """
        Create the wrapped WMCore proxy helper, logging through the
        ``logging`` module itself.
        """
        self.helper = WMCoreProxy({'logger': logging})

    def getProxyFilename(self):
        """Return the proxy file name reported by the helper."""
        filename = self.helper.getProxyFilename()
        return filename

    def initProxy(self):
        """Create the proxy through the helper."""
        self.helper.create()

    def deleteProxy(self):
        """Destroy the proxy through the helper."""
        self.helper.destroy()

    def uploadToMyproxy(self, allowedDN):
        """
        Delegate the credential, after setting ``allowedDN`` as the helper's
        server DN.
        """
        self.helper.serverDN = allowedDN
        self.helper.delegate(None, True)
def __init__(self, config):
    """
    Initialize properties specified from config.

    :param config: agent configuration; the AnalyticsDataCollector and
        AgentStatusWatcher sections are read here.
    """
    BaseWorkerThread.__init__(self)
    # set the workqueue service for REST call
    self.config = config
    # need to get campaign, user, owner info
    self.agentInfo = initAgentInfo(self.config)

    collectorCfg = config.AnalyticsDataCollector
    self.summaryLevel = collectorCfg.summaryLevel
    self.jsonFile = config.AgentStatusWatcher.jsonFile

    self.proxy = Proxy({'logger': logging.getLogger()})
    self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

    self.workqueueDS = WorkQueueDS(collectorCfg.localQueueURL)
def wrapped_func(*args, **kwargs):
    """
    Resolve the user proxy, store it in ``kwargs['userproxy']`` and call
    the wrapped function.

    With a server DN the proxy is fetched from MyProxy under the SHA1 hash
    of the user DN; otherwise the local proxy file name is used.

    :raises InvalidParameter: when MyProxy retrieval fails or the retrieved
        proxy does not match RX_CERT.
    """
    logger = logging.getLogger("CRABLogger.Utils")
    myproxyserver = "myproxy.cern.ch"
    userdn = kwargs['userdn']
    defaultDelegation = {
        'logger': logger,
        'proxyValidity': '192:00',
        'min_time_left': 36000,
        'server_key': serverKey,
        'server_cert': serverCert,
    }
    mypclient = SimpleMyProxy(defaultDelegation)
    userhash = sha1(userdn).hexdigest()

    if not serverDN:
        # no server DN: use the locally available proxy file
        kwargs['userproxy'] = Proxy(defaultDelegation).getProxyFilename()
        return func(*args, **kwargs)

    try:
        userproxy = mypclient.logonRenewMyProxy(username=userhash,
                                                myproxyserver=myproxyserver,
                                                myproxyport=7512)
    except MyProxyException as me:
        # Unsure if this works in standalone mode...
        for item in (me, serverKey, serverCert):
            cherrypy.log(str(item))
        invalidp = InvalidParameter(
            "Impossible to retrieve proxy from %s for %s and hash %s" %
            (myproxyserver, userdn, userhash))
        setattr(invalidp, 'trace', str(me))
        raise invalidp

    if not re.match(RX_CERT, userproxy):
        raise InvalidParameter(
            "Retrieved malformed proxy from %s for %s and hash %s" %
            (myproxyserver, userdn, userhash))

    kwargs['userproxy'] = userproxy
    return func(*args, **kwargs)
def getProxy(config, userdn, group, role):
    """
    _getProxy_

    Return ``(True, proxyPath)`` when a usable proxy is available, either
    already on disk with more than an hour left or after a MyProxy renewal,
    and ``(False, None)`` otherwise.
    """
    defaultDelegation = getDefaultDelegation(config, "cms", "myproxy.cern.ch",
                                             threading.currentThread().logger)
    defaultDelegation.update({'userDN': userdn, 'group': group, 'role': role})

    logging.debug("Retrieving proxy for %s" % userdn)
    proxy = Proxy(defaultDelegation)

    # first look at the proxy already on disk
    proxyPath = proxy.getProxyFilename(True)
    remaining = proxy.getTimeLeft(proxyPath)
    if remaining is not None and remaining > 3600:
        return (True, proxyPath)

    # otherwise renew it from MyProxy and check again
    proxyPath = proxy.logonRenewMyProxy()
    remaining = proxy.getTimeLeft(proxyPath)
    renewed = remaining is not None and remaining > 0
    return (True, proxyPath) if renewed else (False, None)
def getProxy(userdn, group, role, defaultDelegation, logger):
    """
    _getProxy_

    Return ``(True, proxyPath)`` when a usable proxy is available, either
    already on disk with more than an hour left or after a MyProxy renewal,
    and ``(False, None)`` otherwise.

    Note: ``defaultDelegation`` is updated in place with the user DN,
    group and role.
    """
    logger.debug("Retrieving proxy for %s" % userdn)

    # fill the user identity into the caller's delegation dictionary
    for key, value in (('userDN', userdn), ('group', group), ('role', role)):
        defaultDelegation[key] = value

    proxy = Proxy(defaultDelegation)

    # first look at the proxy already on disk
    proxyPath = proxy.getProxyFilename(True)
    remaining = proxy.getTimeLeft(proxyPath)
    if remaining is not None and remaining > 3600:
        return True, proxyPath

    # otherwise renew it from MyProxy and check again
    proxyPath = proxy.logonRenewMyProxy()
    remaining = proxy.getTimeLeft(proxyPath)
    if remaining is None or remaining <= 0:
        return False, None
    return True, proxyPath
def tryProxyLogon(self, proxycfg=None):
    """
    Utility function to allow trying with different myproxy configurations.
    It tries to retrieve a valid proxy from myproxy using the configuration
    passed as argument. See WMCore.Credential.Proxy for configuration details.
    If successful returns the proxy filename and list of VOMS groups
    for later addition via voms-proxy-init. If not raises a TW exception.
    Note that logonRenewMyProxy() does not raise exceptions.

    :param proxycfg: configuration dictionary handed to Proxy; the
        'myProxySvr' and 'userDN' entries are used in the error messages.
    :return: tuple (proxy file name, set of VOMS groups).
    :raises TaskWorkerException: when no proxy, or only a proxy valid for
        less than five days, could be retrieved.
    """
    # WMCore proxy methods are awfully verbose, reduce logging level when using them
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        proxy = Proxy(proxycfg)
        userproxy = proxy.getProxyFilename(serverRenewer=True)  # this only returns a filename
        proxy.logonRenewMyProxy()  # this tries to create the proxy, but if it fails it does not raise
        usergroups = set(proxy.getAllUserGroups(userproxy))  # get VOMS groups from created proxy (if any)
        timeleft = proxy.getTimeLeft(userproxy)  # this is the way to tell if proxy creation succeeded

    errmsg = ''
    if timeleft is None or timeleft <= 0:
        errmsg = "Impossible to retrieve proxy from %s for %s." % (
            proxycfg['myProxySvr'], proxycfg['userDN'])
    elif timeleft < (5 * 24 * 3600):
        # elif: keep the more specific message above and never compare None with a number
        errmsg = "Could not get a proxy valid for at least 5-days from %s for %s." % (
            proxycfg['myProxySvr'], proxycfg['userDN'])

    if errmsg:
        self.logger.error(errmsg)
        self.logger.error("Will try again in verbose mode")
        self.logger.error("===========PROXY ERROR START ==========================")
        # repeat the logon with full logging so the failure details end up in the log
        with tempSetLogLevel(logger=self.logger, level=logging.DEBUG):
            proxy.logonRenewMyProxy()
        self.logger.error("===========PROXY ERROR END ==========================")
        raise TaskWorkerException(errmsg)

    hoursleft = timeleft / 3600
    minutesleft = (timeleft % 3600) / 60
    self.logger.info('retrieved proxy lifetime in h:m: %d:%d', hoursleft, minutesleft)
    return (userproxy, usergroups)
def execute(self, *args, **kwargs):
    """
    Retrieve the proxy of the task owner and attach its filename to the task.

    Reads the task from ``kwargs['task']``, builds the ``Proxy``
    configuration from the task fields and ``self.config``, and calls
    ``logonRenewMyProxy()``.

    :return: ``Result`` wrapping the task, whose 'user_proxy' entry has been
        set to the proxy filename
    :raises TaskWorkerException: when the proxy reports no time left
    """
    task = kwargs['task']

    # Task-derived part of the configuration; empty group/role become ''.
    proxycfg = {
        'vo': task['tm_user_vo'],
        'logger': self.logger,
        'myProxySvr': self.config.Services.MyProxy,
        'proxyValidity': '144:0',
        'min_time_left': 36000,  ## do we need this ? or should we use self.myproxylen?
        'userDN': task['tm_user_dn'],
        'group': task['tm_user_group'] or '',
        'role': task['tm_user_role'] or '',
    }

    # Server-side part, taken from the MyProxy section of the configuration.
    myproxyCfg = self.config.MyProxy
    proxycfg.update({
        'server_key': myproxyCfg.serverhostkey,
        'server_cert': myproxyCfg.serverhostcert,
        'serverDN': myproxyCfg.serverdn,
        'uisource': getattr(myproxyCfg, 'uisource', ''),
        'credServerPath': myproxyCfg.credpath,
        'myproxyAccount': self.server['host'],
        'cleanEnvironment': getattr(myproxyCfg, 'cleanEnvironment', False),
    })

    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()

    # The time left is the only success indicator checked here.
    timeleft = proxy.getTimeLeft(userproxy)
    if timeleft is None or timeleft <= 0:
        raise TaskWorkerException(
            "Impossible to retrieve proxy from %s for %s." % (proxycfg['myProxySvr'], proxycfg['userDN']))

    task['user_proxy'] = userproxy
    return Result(task=task, result='OK')
def get_proxy(self, ad):
    """
    Retrieve a proxy for the user described by a job ad.

    VO, group and role are taken from the ad's CRAB_UserVO, CRAB_UserGroup
    and CRAB_UserRole entries when present and truthy (group and role must
    also differ from ``classad.Value.Undefined``); otherwise the defaults
    'cms', '' and '' are used.

    :param ad: mapping-like ad; 'CRAB_UserDN' must be present
    :return: the proxy filename
    :raises Exception: when the proxy reports no time left
    """
    result = None

    def _optional(attr):
        # Value of `attr` when present, truthy and not Undefined, else ''.
        if attr in ad and ad[attr] and ad[attr] != classad.Value.Undefined:
            return ad[attr]
        return ''

    vo = ad['CRAB_UserVO'] if ('CRAB_UserVO' in ad and ad['CRAB_UserVO']) else 'cms'
    group = _optional('CRAB_UserGroup')
    role = _optional('CRAB_UserRole')
    print(vo, group, role)

    proxycfg = {
        'vo': vo,
        'logger': self.logger,
        'myProxySvr': self.config.Services.MyProxy,
        'myproxyAccount': self.config.TaskWorker.resturl,
        'proxyValidity': '144:0',
        'min_time_left': MINPROXYLENGTH,  ## do we need this ? or should we use self.myproxylen?
        'userDN': ad['CRAB_UserDN'],
        'group': group,
        'role': role,
    }
    myproxyCfg = self.config.MyProxy
    proxycfg.update({
        'server_key': myproxyCfg.serverhostkey,
        'server_cert': myproxyCfg.serverhostcert,
        'serverDN': myproxyCfg.serverdn,
        'uisource': getattr(myproxyCfg, 'uisource', ''),
        'credServerPath': myproxyCfg.credpath,
        'cleanEnvironment': getattr(myproxyCfg, 'cleanEnvironment', False),
    })

    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()

    # A missing or non-positive time left means the logon did not succeed.
    timeleft = proxy.getTimeLeft(userproxy)
    if timeleft is not None and timeleft > 0:
        return userproxy

    self.logger.error("Impossible to retrieve proxy from %s for %s." %(proxycfg['myProxySvr'], proxycfg['userDN']))
    raise Exception("Failed to retrieve proxy.")
def get_proxy(self, ad):
    """
    Retrieve a proxy for the user described by a job ad.

    VO, group and role default to "cms", "" and "". They are overridden by
    the ad's CRAB_UserVO, CRAB_UserGroup and CRAB_UserRole entries when those
    are present and truthy (group and role must also differ from
    ``classad.Value.Undefined``).

    :param ad: mapping-like ad; "CRAB_UserDN" must be present
    :return: the proxy filename
    :raises Exception: when the proxy reports no time left
    """
    vo = "cms"
    group = ""
    role = ""
    if "CRAB_UserVO" in ad and ad["CRAB_UserVO"]:
        vo = ad["CRAB_UserVO"]
    if "CRAB_UserGroup" in ad and ad["CRAB_UserGroup"] and ad["CRAB_UserGroup"] != classad.Value.Undefined:
        group = ad["CRAB_UserGroup"]
    if "CRAB_UserRole" in ad and ad["CRAB_UserRole"] and ad["CRAB_UserRole"] != classad.Value.Undefined:
        role = ad["CRAB_UserRole"]

    # Same space-separated output as the old `print vo, group, role`
    # statement, but valid syntax on both Python 2 and Python 3.
    print("%s %s %s" % (vo, group, role))

    proxycfg = {
        "vo": vo,
        "logger": self.logger,
        "myProxySvr": self.config.Services.MyProxy,
        "myproxyAccount": self.config.TaskWorker.resturl,
        "proxyValidity": "144:0",
        "min_time_left": MINPROXYLENGTH,  ## do we need this ? or should we use self.myproxylen?
        "userDN": ad["CRAB_UserDN"],
        "group": group,
        "role": role,
        "server_key": self.config.MyProxy.serverhostkey,
        "server_cert": self.config.MyProxy.serverhostcert,
        "serverDN": self.config.MyProxy.serverdn,
        "uisource": getattr(self.config.MyProxy, "uisource", ""),
        "credServerPath": self.config.MyProxy.credpath,
        "cleanEnvironment": getattr(self.config.MyProxy, "cleanEnvironment", False),
    }
    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()

    # The time left is the only success indicator checked here.
    timeleft = proxy.getTimeLeft(userproxy)
    if timeleft is None or timeleft <= 0:
        self.logger.error(
            "Impossible to retrieve proxy from %s for %s." % (proxycfg["myProxySvr"], proxycfg["userDN"])
        )
        raise Exception("Failed to retrieve proxy.")
    return userproxy
def __init__(self, config):
    """
    Initialize the properties specified from config.

    Stores the configuration, the agent info, the summary level, a
    ``Proxy`` object with its proxy and user certificate filenames, the
    credential lifetime thresholds, the local workqueue data service and
    the AMQ monitoring settings.

    :param config: agent configuration; its AnalyticsDataCollector and
        AgentStatusWatcher sections are read here
    """
    BaseWorkerThread.__init__(self)

    # set the workqueue service for REST call
    self.config = config
    # need to get campaign, user, owner info
    self.agentInfo = initAgentInfo(self.config)

    analyticsCfg = config.AnalyticsDataCollector
    self.summaryLevel = analyticsCfg.summaryLevel

    self.proxy = Proxy({'logger': logging.getLogger()})
    self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
    self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT

    # credential lifetime warning/error thresholds, in days
    self.credThresholds = dict(proxy=dict(error=3, warning=5),
                               certificate=dict(error=10, warning=20))

    self.workqueueDS = WorkQueueDS(analyticsCfg.localQueueURL)

    # Monitoring setup: (attribute on self, option name, default when absent)
    watcherCfg = config.AgentStatusWatcher
    amqOptions = (
        ("userAMQ", "userAMQ", None),
        ("passAMQ", "passAMQ", None),
        ("postToAMQ", "enableAMQ", False),
        ("topicAMQ", "topicAMQ", None),
        ("hostPortAMQ", "hostPortAMQ", [('dashb-mb.cern.ch', 61113)]),
    )
    for attrName, optionName, fallback in amqOptions:
        setattr(self, attrName, getattr(watcherCfg, optionName, fallback))
def execute(self, *args, **kwargs):
    """
    Retrieve the proxy of the task owner and attach its filename to the task.

    When the proxy reports no time left, the failure is logged, a FAILED
    status is posted through ``self.server.post`` to ``self.resturl``, and
    the handler is stopped.

    :return: ``Result`` wrapping the task, whose "user_proxy" entry has been
        set to the proxy filename
    :raises StopHandler: when the proxy could not be retrieved
    """
    task = kwargs["task"]

    proxycfg = {
        "vo": task["tm_user_vo"],
        "logger": self.logger,
        "myProxySvr": self.config.Services.MyProxy,
        "proxyValidity": "24:0",
        "min_time_left": 36000,  ## do we need this ? or should we use self.myproxylen?
        "userDN": task["tm_user_dn"],
        "group": task["tm_user_group"] or "",
        "role": task["tm_user_role"] or "",
    }
    myproxyCfg = self.config.MyProxy
    proxycfg.update({
        "server_key": myproxyCfg.serverhostkey,
        "server_cert": myproxyCfg.serverhostcert,
        "serverDN": myproxyCfg.serverdn,
        "uisource": myproxyCfg.uisource,
        "credServerPath": myproxyCfg.credpath,
    })

    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()
    timeleft = proxy.getTimeLeft(userproxy)

    if timeleft is None or timeleft <= 0:
        msg = "Impossible to retrieve proxy from %s for %s." % (proxycfg["myProxySvr"], proxycfg["userDN"])
        self.logger.error("Setting %s as failed" % str(task["tm_taskname"]))
        # Report the failure to the REST interface before stopping.
        configreq = {
            "workflow": task["tm_taskname"],
            "status": "FAILED",
            "subresource": "failure",
            "failure": b64encode(msg),
        }
        self.logger.error(str(configreq))
        self.server.post(self.resturl, data=urllib.urlencode(configreq))
        raise StopHandler(msg)

    task["user_proxy"] = userproxy
    return Result(task=task, result="OK")
def execute(self, *args, **kwargs):
    """
    Retrieve the proxy of the task owner and attach its filename to the task.

    When the proxy reports no time left, the failure is logged, a FAILED
    status is posted through ``self.server.post`` to ``self.resturi``, and
    the handler is stopped.

    :return: ``Result`` wrapping the task, whose 'user_proxy' entry has been
        set to the proxy filename
    :raises StopHandler: when the proxy could not be retrieved
    """
    task = kwargs['task']

    # Task-derived settings; empty group/role become ''.
    proxycfg = {
        'vo': task['tm_user_vo'],
        'logger': self.logger,
        'myProxySvr': self.config.Services.MyProxy,
        'proxyValidity': '144:0',
        'min_time_left': 36000,  ## do we need this ? or should we use self.myproxylen?
        'userDN': task['tm_user_dn'],
        'group': task['tm_user_group'] or '',
        'role': task['tm_user_role'] or '',
    }
    # Server-side settings from the MyProxy configuration section.
    myproxyCfg = self.config.MyProxy
    proxycfg.update({
        'server_key': myproxyCfg.serverhostkey,
        'server_cert': myproxyCfg.serverhostcert,
        'serverDN': myproxyCfg.serverdn,
        'uisource': getattr(myproxyCfg, 'uisource', ''),
        'credServerPath': myproxyCfg.credpath,
        'myproxyAccount': self.server['host'],
        'cleanEnvironment': getattr(myproxyCfg, 'cleanEnvironment', False),
    })

    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()
    timeleft = proxy.getTimeLeft(userproxy)

    if timeleft is None or timeleft <= 0:
        msg = "Impossible to retrieve proxy from %s for %s." %(proxycfg['myProxySvr'], proxycfg['userDN'])
        self.logger.error("Setting %s as failed" % str(task['tm_taskname']))
        configreq = {
            'workflow': task['tm_taskname'],
            'status': "FAILED",
            'subresource': 'failure',
            'failure': b64encode(msg),
        }
        self.logger.error(str(configreq))
        self.server.post(self.resturi, data = urllib.urlencode(configreq))
        raise StopHandler(msg)

    task['user_proxy'] = userproxy
    return Result(task=task, result='OK')
def wrapped_func(*args, **kwargs):
    """
    Call the wrapped ``func`` with a ``userproxy`` keyword argument added.

    ``kwargs['userdn']`` must be present. When ``serverDN`` is truthy the
    proxy is fetched with ``SimpleMyProxy.logonRenewMyProxy`` under a user
    name equal to the SHA-1 hex digest of the DN, and the result must match
    ``RX_CERT``. Otherwise ``userproxy`` is the filename reported by a
    ``Proxy`` object.

    NOTE(review): ``func``, ``serverKey``, ``serverCert`` and ``serverDN``
    are not defined in this function; presumably they come from an
    enclosing decorator scope that is not visible here.

    :raises InvalidParameter: when myproxy retrieval fails (the original
        error text is attached as attribute ``trace``) or when the retrieved
        proxy does not match ``RX_CERT``
    :return: whatever ``func`` returns
    """
    logger = logging.getLogger("CRABLogger.Utils")
    myproxyserver = "myproxy.cern.ch"
    # not read again below; the DN is taken from kwargs directly
    userdn = kwargs['userdn']
    defaultDelegation = {'logger': logger,
                         'proxyValidity' : '192:00',
                         'min_time_left' : 36000,
                         'server_key': serverKey,
                         'server_cert': serverCert,}
    # not read again within this function
    timeleftthreshold = 60 * 60 * 24
    mypclient = SimpleMyProxy(defaultDelegation)
    userproxy = None
    # the myproxy user name is the SHA-1 hex digest of the user DN
    userhash = sha1(kwargs['userdn']).hexdigest()
    if serverDN:
        try:
            userproxy = mypclient.logonRenewMyProxy(username=userhash, myproxyserver=myproxyserver, myproxyport=7512)
        except MyProxyException as me:
            # Unsure if this works in standalone mode...
            cherrypy.log(str(me))
            cherrypy.log(str(serverKey))
            cherrypy.log(str(serverCert))
            invalidp = InvalidParameter("Impossible to retrieve proxy from %s for %s and hash %s" % (myproxyserver, kwargs['userdn'], userhash))
            # keep the underlying myproxy error text on the exception
            setattr(invalidp, 'trace', str(me))
            raise invalidp
        else:
            # retrieval did not raise: validate what came back
            if not re.match(RX_CERT, userproxy):
                raise InvalidParameter("Retrieved malformed proxy from %s for %s and hash %s" % (myproxyserver, kwargs['userdn'], userhash))
    else:
        # no serverDN: fall back to the filename reported by Proxy
        proxy = Proxy(defaultDelegation)
        userproxy = proxy.getProxyFilename()
    kwargs['userproxy'] = userproxy
    out = func(*args, **kwargs)
    return out
def execute(self, *args, **kwargs):
    """
    Retrieve the proxy of the task owner and attach its filename to the task.

    When the proxy reports no time left, the failure is logged, a FAILED
    status is posted through ``self.server.post`` to ``self.resturl``, and
    the handler is stopped.

    :return: ``Result`` wrapping the task, whose 'user_proxy' entry has been
        set to the proxy filename
    :raises StopHandler: when the proxy could not be retrieved
    """
    task = kwargs['task']

    proxycfg = {
        'vo': task['tm_user_vo'],
        'logger': self.logger,
        'myProxySvr': self.config.Services.MyProxy,
        'proxyValidity': '24:0',
        'min_time_left': 36000,  ## do we need this ? or should we use self.myproxylen?
        'userDN': task['tm_user_dn'],
        'group': task['tm_user_group'] or '',
        'role': task['tm_user_role'] or '',
    }
    myproxyCfg = self.config.MyProxy
    proxycfg.update({
        'server_key': myproxyCfg.serverhostkey,
        'server_cert': myproxyCfg.serverhostcert,
        'serverDN': myproxyCfg.serverdn,
        'uisource': myproxyCfg.uisource,
        'credServerPath': myproxyCfg.credpath,
    })

    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()
    timeleft = proxy.getTimeLeft(userproxy)

    if timeleft is None or timeleft <= 0:
        msg = "Impossible to retrieve proxy from %s for %s." % (
            proxycfg['myProxySvr'], proxycfg['userDN'])
        self.logger.error("Setting %s as failed" % str(task['tm_taskname']))
        # Report the failure to the REST interface before stopping.
        configreq = {
            'workflow': task['tm_taskname'],
            'status': "FAILED",
            'subresource': 'failure',
            'failure': b64encode(msg),
        }
        self.logger.error(str(configreq))
        self.server.post(self.resturl, data=urllib.urlencode(configreq))
        raise StopHandler(msg)

    task['user_proxy'] = userproxy
    return Result(task=task, result='OK')
def setupMyProxy(self):
    """
    _setupMyProxy_

    Build the WMCore.Credential.Proxy object used to retrieve proxies from
    myproxy with the server certificate.

    The 'uisource' argument is only passed when ``self.setupScript`` is
    truthy. The ``logging`` module itself is passed as the logger.

    :return: the ``Proxy`` instance
    """
    proxyArgs = {'uisource': self.setupScript} if self.setupScript else {}
    proxyArgs.update(
        server_cert=self.serverCert,
        server_key=self.serverKey,
        myProxySvr=self.myproxySrv,
        credServerPath=self.proxyDir,
        logger=logging,
    )
    return Proxy(args = proxyArgs)
def setUp(self):
    """
    Setup for unit tests.

    Configures logging to the file 'proxy_unittests.log' and builds the
    Proxy under test from module-level settings (group, role, myProxySvr,
    uiPath, serverDN).
    """
    logSetup = {
        'level': logging.DEBUG,
        'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        'datefmt': '%m-%d %H:%M',
        'filename': 'proxy_unittests.log',
        'filemode': 'w',
    }
    logging.basicConfig(**logSetup)
    self.logger = logging.getLogger('ProxyTest')

    self.dict = dict(
        logger=self.logger,
        vo='cms',
        group=group,
        role=role,
        myProxySvr=myProxySvr,
        proxyValidity='192:00',
        min_time_left=36000,
        uisource=uiPath,
        serverDN=serverDN,
    )
    self.proxyPath = None
    self.proxy = Proxy(self.dict)
    self.serverDN = self.dict['serverDN']
def getProxy(defaultDelegation, log):
    """
    _getProxy_

    Find a usable proxy for ``defaultDelegation['userDN']``, retrieving one
    from myproxy if the existing one is not good enough.

    :param defaultDelegation: configuration mapping handed to ``Proxy``
    :param log: object whose ``debug`` method receives a trace message
    :return: ``(True, proxyPath)`` on success, ``(False, None)`` otherwise
    """
    log.debug("Retrieving proxy for %s" % defaultDelegation['userDN'])
    credential = Proxy(defaultDelegation)

    # (callable giving a proxy path, time left that must be exceeded):
    # the existing proxy needs more than 3600 left, a renewed one only > 0.
    attempts = (
        (lambda: credential.getProxyFilename(True), 3600),
        (credential.logonRenewMyProxy, 0),
    )
    for locate, floor in attempts:
        candidate = locate()
        remaining = credential.getTimeLeft(candidate)
        if remaining is not None and remaining > floor:
            return (True, candidate)
    return (False, None)
def createNewMyProxy(self, timeleftthreshold=0, nokey=False):
    """
    Handles the MyProxy creation

    Builds a Proxy from ``self.defaultDelegation``, asks it for the time
    left of the credential on the myproxy server, and delegates a new
    credential when that time is below ``timeleftthreshold`` or when
    ``self.proxyChanged`` is set.

    :param timeleftthreshold: minimum acceptable time left, compared
        directly with the value returned by ``getMyProxyTimeLeft``
    :param nokey: forwarded unchanged to ``getMyProxyTimeLeft`` and
        ``delegate``
    :raises ProxyCreationException: when ``delegate`` raises
    """
    myproxy = Proxy(self.defaultDelegation)
    myproxy.userDN = myproxy.getSubject()

    self.logger.debug("Getting myproxy life time left for %s" % self.defaultDelegation["myProxySvr"])
    # does it return an integer that indicates?
    myproxytimeleft = myproxy.getMyProxyTimeLeft(serverRenewer=True, nokey=nokey)
    self.logger.debug("Myproxy is valid: %i" % myproxytimeleft)

    if myproxytimeleft < timeleftthreshold or self.proxyChanged:
        # creating the proxy
        self.logger.debug("Delegating a myproxy for %s hours" % self.defaultDelegation['myproxyValidity'])
        try:
            myproxy.delegate(serverRenewer=True, nokey=nokey)
        except Exception as ex:
            # `except X as e` is valid on Python 2.6+ and Python 3, unlike
            # the comma form it replaces.
            raise ProxyCreationException("Problems delegating My-proxy. Problem %s"%ex)
        else:
            self.logger.debug("My-proxy delegated.")
def setUp(self):
    """
    Setup for unit tests

    Configures logging to the file 'proxy_unittests.log' and builds the
    Proxy under test from a fixed configuration. ``self.serverDN`` is the
    configured 'serverDN', or None when the configuration has none.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d %H:%M',
                        filename='proxy_unittests.log',
                        filemode='w')
    logger_name = 'ProxyTest'
    self.logger = logging.getLogger(logger_name)

    # Named proxyConfig rather than `dict` so the builtin is not shadowed.
    proxyConfig = {'logger': self.logger,
                   'server_key': '/home/crab/.globus/hostkey.pem',
                   'server_cert': '/home/crab/.globus/hostcert.pem',
                   'vo': 'cms',
                   'group': 'integration',
                   'role': 'NULL',
                   'myProxySvr': 'myproxy.cern.ch',
                   'proxyValidity': '192:00',
                   'min_time_left': 36000}

    self.proxyPath = None
    self.proxy = Proxy(proxyConfig)
    self.serverKey = proxyConfig['server_key']
    # dict.has_key() no longer exists on Python 3; get() yields the same
    # None default when the key is absent.
    self.serverDN = proxyConfig.get('serverDN')
def setUp(self):
    """
    Setup for unit tests

    Configures logging to the file 'proxy_unittests.log' and builds the
    Proxy under test from a fixed configuration kept in ``self.dict``.
    ``self.serverDN`` is the configured 'serverDN', or None when the
    configuration has none.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d %H:%M',
                        filename='proxy_unittests.log',
                        filemode='w')
    logger_name = 'ProxyTest'
    self.logger = logging.getLogger(logger_name)

    self.dict = {'logger': self.logger,
                 'server_key': '/home/crab/.globus/hostkey.pem',
                 'server_cert': '/home/crab/.globus/hostcert.pem',
                 'vo': 'cms',
                 'group': 'integration',
                 'role': 'NULL',
                 'myProxySvr': 'myproxy.cern.ch',
                 'proxyValidity': '192:00',
                 'min_time_left': 36000,
                 'uisource': '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'}
    #, 'serverDN' : '/C=IT/O=INFN/OU=Host/L=Perugia/CN=crab.pg.infn.it'}

    self.proxyPath = None
    self.proxy = Proxy(self.dict)
    self.serverKey = self.dict['server_key']
    # dict.has_key() no longer exists on Python 3; get() yields the same
    # None default when the key is absent.
    self.serverDN = self.dict.get('serverDN')
def getProxy(userdn, group, role, defaultDelegation, logger):
    """
    _getProxy_

    Find a usable proxy for a user, retrieving one from myproxy if needed.

    The user DN, group and role are written into ``defaultDelegation``
    (the caller's object is updated in place) before it is handed to
    ``Proxy``.

    :param userdn: stored under the 'userDN' key
    :param group: stored under the 'group' key
    :param role: stored under the 'role' key
    :param defaultDelegation: configuration mapping passed to ``Proxy``
    :param logger: object whose ``debug`` method receives a trace message
    :return: ``(True, proxyPath)`` on success, ``(False, None)`` otherwise
    """
    logger.debug("Retrieving proxy for %s" % userdn)

    defaultDelegation['userDN'] = userdn
    defaultDelegation['group'] = group
    defaultDelegation['role'] = role
    credential = Proxy(defaultDelegation)

    def _usable(path, minimum):
        # True when the proxy at `path` reports a time left above `minimum`.
        remaining = credential.getTimeLeft(path)
        return remaining is not None and remaining > minimum

    # Reuse the existing proxy only when it has more than 3600 left.
    existing = credential.getProxyFilename(True)
    if _usable(existing, 3600):
        return (True, existing)

    # Otherwise renew; any positive lifetime is then accepted.
    renewed = credential.logonRenewMyProxy()
    return (True, renewed) if _usable(renewed, 0) else (False, None)
def getProxy(config, userdn, group, role):
    """
    _getProxy_

    Find a usable proxy for a user, retrieving one from myproxy if needed.

    The delegation settings come from ``getDefaultDelegation`` (called with
    "cms", "myproxy.cern.ch" and the current thread's ``logger`` attribute)
    and are completed with the user DN, group and role.

    :return: ``(True, proxyPath)`` on success, ``(False, None)`` otherwise
    """
    threadLogger = threading.currentThread().logger
    delegation = getDefaultDelegation(config, "cms", "myproxy.cern.ch", threadLogger)
    for key, value in (('userDN', userdn), ('group', group), ('role', role)):
        delegation[key] = value

    logging.debug("Retrieving proxy for %s" % userdn)
    credential = Proxy(delegation)

    # First try the existing proxy, which needs more than 3600 left.
    location = credential.getProxyFilename(True)
    remaining = credential.getTimeLeft(location)
    if remaining is not None and remaining > 3600:
        return (True, location)

    # Then renew; any positive lifetime is accepted.
    location = credential.logonRenewMyProxy()
    remaining = credential.getTimeLeft(location)
    if remaining is not None and remaining > 0:
        return (True, location)

    return (False, None)
class ProxyTest(unittest.TestCase):
    """
    Integration tests for the Proxy credential wrapper.

    Every test is tagged "integration". The tests use the Proxy object built
    in setUp and compare its answers with the output of the
    ``voms-proxy-info`` command line tool.
    """

    def setUp(self):
        """
        Setup for unit tests
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
            datefmt='%m-%d %H:%M',
            filename='proxy_unittests.log',
            filemode='w')
        logger_name = 'ProxyTest'
        self.logger = logging.getLogger(logger_name)
        self.dict = {
            'logger': self.logger,
            'vo': 'cms',
            'group': group,
            'role': role,
            'myProxySvr': myProxySvr,
            'proxyValidity': '192:00',
            'min_time_left': 36000,
            'uisource': uiPath
        }

        self.proxyPath = None
        self.proxy = Proxy(self.dict)

    def tearDown(self):
        """
        _tearDown_

        Tear down the proxy.
        """
        return

    def _vomsProxyInfo(self, option):
        """
        Run ``voms-proxy-info <option>``.

        :param option: single command line option, e.g. "-identity"
        :return: the command's stdout as text without its last character
            (presumably the trailing newline), or None when the command
            exits with a non-zero status
        """
        vomsProxyInfoCall = subprocess.Popen(["voms-proxy-info", option],
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE,
                                             universal_newlines=True)
        # communicate() drains both pipes while waiting. Calling wait()
        # first can block forever once the child fills a pipe buffer.
        stdout, _ = vomsProxyInfoCall.communicate()
        if vomsProxyInfoCall.returncode != 0:
            return None
        return stdout[0:-1]

    def getUserIdentity(self):
        """
        _getUserIdentity_

        Retrieve the user's subject from the voms-proxy-info call.
        """
        return self._vomsProxyInfo("-identity")

    def getUserAttributes(self):
        """
        _getUserAttributes_

        Retrieve the user's attributes from the voms-proxy-info call.
        """
        return self._vomsProxyInfo("-fqan")

    @attr("integration")
    def testAAACreateProxy(self):
        """
        Test if create method creates correctly the proxy.
        This is sort of bad form to require that this test run first, but the alternative is
        entering a password for every single invocation
        """
        self.proxy.create()
        time.sleep(5)
        proxyPath = self.proxy.getProxyFilename()
        self.assertTrue(os.path.exists(proxyPath))

    @attr("integration")
    def testCheckProxyTimeLeft(self):
        """
        Test if getTimeLeft method returns correctly the proxy time left.
        """
        timeLeft = self.proxy.getTimeLeft()
        # Floor division: on Python 3 `/` yields a float that would almost
        # never equal 191.
        self.assertEqual(int(timeLeft) // 3600, 191)

    @attr("integration")
    def testRenewProxy(self):
        """
        Test if the renew method renews correctly the user proxy.
        """
        time.sleep(70)
        self.proxy.renew()
        time.sleep(10)
        timeLeft = self.proxy.getTimeLeft()
        self.assertEqual(int(timeLeft) // 3600, 191)

    @attr("integration")
    def testDestroyProxy(self):
        """
        Test the proxy destroy method.
        """
        self.proxy.destroy()
        self.proxyPath = self.proxy.getProxyFilename()
        self.assertFalse(os.path.exists(self.proxyPath))
        # Create the proxy after the destroy
        self.proxy.create()

    @attr("integration")
    def testGetSubject(self):
        """
        _testGetSubject_

        Verify that the getSubject() method works correctly.
        """
        subject = self.proxy.getSubject()
        self.assertEqual(subject, self.getUserIdentity(),
                         "Error: Wrong subject.")
        return

    @attr("integration")
    def testGetUserName(self):
        """
        _testGetUserName_

        Verify that the getUserName() method correctly determines the user's
        name.
        """
        user = self.proxy.getUserName()
        # Last "/"-separated component of the subject, minus its first three
        # characters; voms-proxy-info is only run once for this.
        identity = self.getUserIdentity().split("/")[-1][3:]
        self.assertEqual(
            user, identity,
            "Error: User name is wrong: |%s|\n|%s|" % (user, identity))
        return

    @attr("integration")
    def testCheckAttribute(self):
        """
        Test if the checkAttribute method checks correctly the attributes validity.
        """
        valid = self.proxy.checkAttribute()
        self.assertTrue(valid)

    @attr("integration")
    def testCheckTimeLeft(self):
        """
        Test if the check method checks correctly the proxy validity.
        """
        valid = self.proxy.check(self.proxyPath)
        self.assertTrue(valid)

    @attr("integration")
    def testVomsRenewal(self):
        """
        Test if vomsExtensionRenewal method renews correctly the voms-proxy.
        """
        proxyPath = self.proxy.getProxyFilename()
        time.sleep(70)
        attribute = self.proxy.prepareAttForVomsRenewal(
            self.proxy.getAttributeFromProxy(proxyPath))
        self.proxy.vomsExtensionRenewal(proxyPath, attribute)
        vomsTimeLeft = self.proxy.getVomsLife(proxyPath)
        self.assertEqual(int(vomsTimeLeft) // 3600, 191)

    @attr("integration")
    def testElevateAttribute(self):
        """
        Test if the vomsExtensionRenewal method elevate last attributes given.
        """
        proxyPath = self.proxy.getProxyFilename()
        attribute = self.proxy.prepareAttForVomsRenewal(
            '/cms/Role=NULL/Capability=NULL')
        self.proxy.vomsExtensionRenewal(proxyPath, attribute)
        self.assertEqual(self.proxy.getAttributeFromProxy(proxyPath),
                         '/cms/Role=NULL/Capability=NULL')
        # Restore the original configuration of the proxy
        self.proxy.create()

    @attr("integration")
    def testUserGroupInProxy(self):
        """
        Test if getUserAttributes method returns correctly the user group.
        """
        self.assertTrue(self.proxy.group, 'No group set. Testing incomplete.')
        self.assertEqual(self.proxy.group,
                         self.getUserAttributes().split('\n')[0].split('/')[2])

    @attr("integration")
    def testUserRoleInProxy(self):
        """
        Test if getUserAttributes method returns correctly the user role.
        """
        self.assertEqual(
            self.proxy.role,
            self.getUserAttributes().split('\n')[0].split('/')[3].split('=')[1])

    @attr("integration")
    def testGetAttributes(self):
        """
        Test getAttributeFromProxy method.
        """
        self.assertTrue(self.proxy.group, 'No group set. Testing incomplete.')
        # An empty configured role is compared against 'NULL'.
        role = self.dict['role'] or 'NULL'
        proxyPath = self.proxy.getProxyFilename()
        self.assertEqual(
            self.proxy.getAttributeFromProxy(proxyPath).split('/')[2],
            self.dict['group'])
        self.assertEqual(
            self.proxy.getAttributeFromProxy(proxyPath).split('/')[3].split('=')[1],
            role)

    @attr("integration")
    def testGetUserGroupAndRole(self):
        """
        Test GetUserGroupAndRoleFromProxy method.
        """
        role = self.dict['role'] or 'NULL'
        proxyPath = self.proxy.getProxyFilename()
        # Only checked when both group and role are configured.
        if self.dict['group'] and self.dict['role']:
            self.assertEqual(
                self.proxy.getUserGroupAndRoleFromProxy(proxyPath)[0],
                self.dict['group'])
            self.assertEqual(
                self.proxy.getUserGroupAndRoleFromProxy(proxyPath)[1], role)
def createNewVomsProxy(self, timeleftthreshold=0):
    """
    Handles the proxy creation:
       - checks if a valid proxy still exists
       - performs the creation if it is expired

    A proxy is also recreated when ``self.proxyChanged`` is set. This method
    sets that flag itself when the group or role found in a still-valid
    proxy differ from the ones in ``self.defaultDelegation``.

    :param timeleftthreshold: minimum acceptable time left, compared
        directly with the value returned by ``getTimeLeft``
    :return: tuple ``(subject, proxy filename)``
    :raises ProxyCreationException: when the freshly created proxy has no
        time left or carries a different group or role than requested
    """
    ## TODO add the change to have user-cert/key defined in the config.
    userproxy = Proxy(self.defaultDelegation)
    userproxy.userDN = userproxy.getSubject()

    wantedGroup = self.defaultDelegation['group']
    wantedRole = self.defaultDelegation['role']

    self.logger.debug("Getting proxy life time left")
    # does it return an integer that indicates?
    proxytimeleft = userproxy.getTimeLeft()
    self.logger.debug("Proxy is valid: %i" % proxytimeleft)

    #if it is not expired I check if role and/or group are changed
    if not proxytimeleft < timeleftthreshold and wantedRole is not None and wantedGroup is not None:
        group, role = userproxy.getUserGroupAndRoleFromProxy(userproxy.getProxyFilename())
        if group != wantedGroup or role != wantedRole:
            self.proxyChanged = True

    #if the proxy is expired, or we changed role and/or group, we need to create a new one
    if proxytimeleft < timeleftthreshold or self.proxyChanged:
        # creating the proxy
        self.logger.debug("Creating a proxy for %s hours" % self.defaultDelegation['proxyValidity'])
        userproxy.create()
        # Verify the new proxy: it must be alive and carry the requested
        # group and role.
        proxytimeleft = userproxy.getTimeLeft()
        group, role = userproxy.getUserGroupAndRoleFromProxy(userproxy.getProxyFilename())

        if proxytimeleft > 0 and group == wantedGroup and role == wantedRole:
            self.logger.debug("Proxy created.")
        else:
            raise ProxyCreationException("Problems creating proxy.")

    return userproxy.getSubject(), userproxy.getProxyFilename()
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config

        :param config: agent configuration; the AnalyticsDataCollector and
            AgentStatusWatcher sections are read here
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        # path of the local JSON dump written at the end of each algorithm() cycle
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        localWQUrl = config.AnalyticsDataCollector.localQueueURL
        self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        """
        Build the list of replication documents and start the replications.

        Always replicates the job summary database to the central WMStats
        URL. When the configuration has a Tier0Feeder section the Tier0
        request database is replicated as well; otherwise two workqueue
        replications (parent queue to local inbox and back) are added.
        Existing replicator documents are deleted before the new continuous
        replications are started.
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL

        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(
                rp['source'], rp['target'], filter=rp['filter'],
                query_params=rp.get('query_params', False),
                continuous=True)
        # First cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information

        :param parameters: not used by this method
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        # needs self.localCouchMonitor, so it has to run after the line above
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch

        Collects the agent, WMBS and (unless this is a Tier0 agent) local
        workqueue information, uploads it to central WMStats and dumps it to
        ``self.jsonFile``. Any exception is logged and swallowed so that the
        next cycle can retry.

        :param parameters: not used by this method
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            # the third element of the tuple is not used here
            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save locally json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)

        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dict with the keys 'workByStatus', 'workByStatusAndPriority',
            'uniqueJobsPerSite' and 'possibleJobsPerSite' (callers unpack a
            3-tuple, so the timeFunction decorator presumably wraps this
            value -- confirm against its definition)
        """
        results = {}

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(['Available', 'Acquired'])
        uniSites, posSites = getGlobalSiteStatusSummary(elements, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        """
        Check the status of every configured replication.

        The first call after setUpCouchDBReplication() skips the check and
        reports 'ok'.

        :return: dict with the keys 'name', 'status' and 'error_message';
            'status' is 'error' when any replication is not 'ok', and
            'error_message' is the message of the last failing one
        """
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skip the check this round; reset the flag to False so it is checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'],
                                                                  rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        # Drain mode or a disk warning only downgrades 'ok' to 'warning' ...
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        # ... while a data error or a couch process warning escalates to 'error'.
        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Convert the agent info to couch documents and upload them to central WMStats.

        :param agentInfo: dict produced by collectAgentInfo() and extended in algorithm()
        :param uploadTime: integer timestamp passed on to convertToAgentCouchDoc
        """
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])

        logging.debug("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s",
                      results['completeRunJobByStatus'])

        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s",
                      results['sitePendCountByPrio'])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the lifetime of the agent proxy file (self.proxyFile, i.e.
        X509_USER_PROXY) and flag either a warning or an error in the agent
        status if the proxy validity is about to expire.

        With 3 days or less left the status becomes "error"; with 5 days or
        less left an "ok" status becomes "warning". In both cases a
        'proxy_warning' message is added.

        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :return: None; agInfo is updated in place.
        """
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile, secsLeft)

        if secsLeft <= 86400 * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= 86400 * 5:  # 5 days
            proxyWarning = True
            # never downgrade an existing error to a warning
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg

        return
class ProxyTest(unittest.TestCase):
    """
    Integration tests for the Proxy credential wrapper.

    Every test is tagged "integration". Most of them only do any work when
    the server host key configured in setUp is absent from the local
    filesystem; testLogonRenewMyProxy and testRenewMyProxyByServer are the
    opposite and only do work when that key is present. Tests whose guard
    is not satisfied pass without asserting anything.
    """

    def setUp(self):
        """
        Setup for unit tests
        """
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                            datefmt='%m-%d %H:%M',
                            filename='proxy_unittests.log',
                            filemode='w')
        logger_name = 'ProxyTest'
        self.logger = logging.getLogger(logger_name)
        self.dict = {'logger': self.logger,
                     'server_key': '/home/crab/.globus/hostkey.pem',
                     'server_cert': '/home/crab/.globus/hostcert.pem',
                     'vo': 'cms',
                     'group': 'integration',
                     'role': 'NULL',
                     'myProxySvr': 'myproxy.cern.ch',
                     'proxyValidity': '192:00',
                     'min_time_left': 36000,
                     'uisource': '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'}
        # , 'serverDN' : '/C=IT/O=INFN/OU=Host/L=Perugia/CN=crab.pg.infn.it'}

        self.proxyPath = None
        self.proxy = Proxy(self.dict)
        self.serverKey = self.dict['server_key']
        # None unless a 'serverDN' entry is added to the configuration above;
        # the server-renewer tests are skipped while it is unset.
        self.serverDN = self.dict.get('serverDN')

    def tearDown(self):
        """
        _tearDown_

        Tear down the proxy.
        """
        self.proxy.destroy()
        return

    def _vomsProxyInfo(self, option):
        """
        Run `voms-proxy-info <option>` and return its standard output with
        the last character (the trailing newline) removed, or None when the
        command exits with a non-zero status.
        """
        vomsProxyInfoCall = subprocess.Popen(["voms-proxy-info", option],
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE,
                                             universal_newlines=True)
        # communicate() drains both pipes while waiting; calling wait() first
        # can block forever if the child fills a pipe buffer.
        stdout, _ = vomsProxyInfoCall.communicate()
        if vomsProxyInfoCall.returncode != 0:
            return None
        return stdout[0:-1]

    def getUserIdentity(self):
        """
        _getUserIdentity_

        Retrieve the user's subject from the voms-proxy-info call.
        """
        return self._vomsProxyInfo("-identity")

    def getUserAttributes(self):
        """
        _getUserAttributes_

        Retrieve the user's attributes from the voms-proxy-info call.
        """
        return self._vomsProxyInfo("-fqan")

    @attr("integration")
    def testDestroyBeforeCreation(self):
        """
        Destroying the proxy leaves no proxy file behind.
        """
        if not os.path.exists(self.serverKey):
            self.proxy.destroy()
            self.proxyPath = self.proxy.getProxyFilename()
            self.assertFalse(os.path.exists(self.proxyPath))

    @attr("integration")
    def testCreateProxy(self):
        """
        create() produces the proxy file reported by getProxyFilename().
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            time.sleep(5)
            proxyPath = self.proxy.getProxyFilename()
            self.assertTrue(os.path.exists(proxyPath))

    @attr("integration")
    def testCheckProxyTimeLeft(self):
        """
        A freshly created proxy reports 192 whole hours of lifetime.
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            timeLeft = self.proxy.getTimeLeft()
            print(timeLeft)
            # floor division: compare whole hours on Python 2 and 3 alike
            self.assertEqual(int(timeLeft) // 3600, 192)

    @attr("integration")
    def testRenewProxy(self):
        """
        After renew() the proxy reports 191 whole hours of lifetime.
        """
        if not os.path.exists(self.serverKey):
            time.sleep(70)
            self.proxy.renew()
            time.sleep(10)
            timeLeft = self.proxy.getTimeLeft()
            self.assertEqual(int(timeLeft) // 3600, 191)

    @attr("integration")
    def testDestroyProxy(self):
        """
        destroy() removes the proxy file.
        """
        if not os.path.exists(self.serverKey):
            self.proxy.destroy()
            self.proxyPath = self.proxy.getProxyFilename()
            self.assertFalse(os.path.exists(self.proxyPath))

    @attr("integration")
    def testGetSubject(self):
        """
        _testGetSubject_

        Verify that the getSubject() method works correctly.
        """
        if os.path.exists(self.serverKey):
            return

        self.testCreateProxy()
        subject = self.proxy.getSubject()

        self.assertEqual(subject, self.getUserIdentity(),
                         "Error: Wrong subject.")
        return

    @attr("integration")
    def testGetUserName(self):
        """
        _testGetUserName_

        Verify that the getUserName() method correctly determines the user's
        name.
        """
        if os.path.exists(self.serverKey):
            return

        self.testCreateProxy()
        user = self.proxy.getUserName()
        # last '/'-separated component of the subject, minus its 3-char prefix
        identity = self.getUserIdentity().split("/")[-1][3:]

        self.assertEqual(user, identity,
                         "Error: User name is wrong: |%s|\n|%s|" % (user, identity))
        return

    @attr("integration")
    def testCheckAttribute(self):
        """
        checkAttribute() reports the proxy attributes as valid.

        Previously named checkAttribute, which test runners do not collect.
        """
        if not os.path.exists(self.serverKey):
            valid = self.proxy.checkAttribute()
            self.assertTrue(valid)

    @attr("integration")
    def testCheckTimeLeft(self):
        """
        check() reports the proxy as valid.
        """
        if not os.path.exists(self.serverKey):
            valid = self.proxy.check(self.proxyPath)
            self.assertTrue(valid)

    @attr("integration")
    def testDelegateMyProxy(self):
        """
        A delegated credential is found by checkMyProxy().
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            self.proxy.delegate(credential=self.proxyPath)
            valid = self.proxy.checkMyProxy()
            self.assertTrue(valid)

    @attr("integration")
    def testDelegateServerAndMyProxy(self):
        """
        A credential delegated with serverRenewer=True passes
        checkMyProxy(checkRenewer=True).
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            self.proxy.delegate(credential=self.proxyPath, serverRenewer=True)
            valid = self.proxy.checkMyProxy(checkRenewer=True)
            self.assertTrue(valid)

    @attr("integration")
    def testCheckMyProxy(self):
        """
        checkMyProxy() succeeds after a default delegate(); needs serverDN.
        """
        if not os.path.exists(self.serverKey) and self.serverDN:
            self.proxy.create()
            self.proxy.delegate()
            valid = self.proxy.checkMyProxy()
            self.assertTrue(valid)

    @attr("integration")
    def testCheckMyProxyServer(self):
        """
        checkMyProxy(checkRenewer=True) succeeds after a server-renewer
        delegation; needs serverDN.
        """
        if not os.path.exists(self.serverKey) and self.serverDN:
            self.proxy.create()
            self.proxy.delegate(serverRenewer=True)
            valid = self.proxy.checkMyProxy(checkRenewer=True)
            self.assertTrue(valid)

    @attr("integration")
    def testLogonRenewMyProxy(self):
        """
        logonRenewMyProxy() returns the path of an existing file.
        Only exercised when the server key is present.
        """
        if os.path.exists(self.serverKey):
            proxyFile = self.proxy.logonRenewMyProxy()
            self.assertTrue(os.path.exists(proxyFile))

    @attr("integration")
    def testRenewMyProxy(self):
        """
        After renewMyProxy() the MyProxy credential reports 167 whole hours.
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            time.sleep(70)
            self.proxy.renewMyProxy(proxy=self.proxyPath)
            time.sleep(5)
            timeLeft = self.proxy.getMyProxyTimeLeft(proxy=self.proxyPath)
            self.assertEqual(int(timeLeft) // 3600, 167)

    @attr("integration")
    def testRenewMyProxyForServer(self):
        """
        Same as testRenewMyProxy with serverRenewer=True; needs serverDN.
        """
        if not os.path.exists(self.serverKey) and self.serverDN:
            self.proxy.create()
            time.sleep(70)
            self.proxy.renewMyProxy(proxy=self.proxyPath, serverRenewer=True)
            time.sleep(5)
            timeLeft = self.proxy.getMyProxyTimeLeft(proxy=self.proxyPath,
                                                     serverRenewer=True)
            self.assertEqual(int(timeLeft) // 3600, 167)

    @attr("integration")
    def testRenewMyProxyByServer(self):
        """
        A proxy retrieved through logonRenewMyProxy() has more than 120
        whole hours left. Only exercised when the server key is present.
        """
        if os.path.exists(self.serverKey):
            proxyPath = self.proxy.getProxyFilename(serverRenewer=True)
            self.proxy.logonRenewMyProxy(proxyPath)
            timeLeft = self.proxy.getTimeLeft(proxyPath)
            self.assertTrue(int(timeLeft) // 3600 > 120)

    @attr("integration")
    def testVomsRenewal(self):
        """
        After vomsExtensionRenewal() the VOMS extension reports 191 whole
        hours.
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            proxyPath = self.proxy.getProxyFilename()
            time.sleep(70)
            attribute = self.proxy.prepareAttForVomsRenewal(
                self.proxy.getAttributeFromProxy(proxyPath))
            self.proxy.vomsExtensionRenewal(proxyPath, attribute)
            vomsTimeLeft = self.proxy.getVomsLife(proxyPath)
            self.assertEqual(int(vomsTimeLeft) // 3600, 191)

    @attr("integration")
    def testElevateAttribute(self):
        """
        Renewing the VOMS extension with an explicit attribute makes that
        attribute the one reported by getAttributeFromProxy().
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            proxyPath = self.proxy.getProxyFilename()

            # getProxyDetails allows to build the proxy attribute from the parameters given
            attribute = self.proxy.prepareAttForVomsRenewal('/cms/Role=NULL/Capability=NULL')
            self.proxy.vomsExtensionRenewal(proxyPath, attribute)

            self.assertEqual(self.proxy.getAttributeFromProxy(proxyPath),
                             '/cms/Role=NULL/Capability=NULL')

    @attr("integration")
    def testUserGroupInProxy(self):
        """
        The proxy object's group matches the first FQAN from voms-proxy-info.
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            self.assertEqual(self.proxy.group,
                             self.getUserAttributes().split('\n')[0].split('/')[2])

    @attr("integration")
    def testUserRoleInProxy(self):
        """
        The proxy object's role matches the first FQAN from voms-proxy-info.
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            self.assertEqual(self.proxy.role,
                             self.getUserAttributes().split('\n')[0].split('/')[3].split('=')[1])

    @attr("integration")
    def testGetAttributes(self):
        """
        getAttributeFromProxy() carries the configured group and role.
        """
        if not os.path.exists(self.serverKey):
            # always bind role: fall back to 'NULL' only when none is configured
            role = self.dict['role'] or 'NULL'
            self.proxy.create()
            attribute = self.proxy.getAttributeFromProxy()
            self.assertEqual(attribute.split('/')[2], self.dict['group'])
            self.assertEqual(attribute.split('/')[3].split('=')[1], role)

    @attr("integration")
    def testGetUserGroupAndRole(self):
        """
        getUserGroupAndRoleFromProxy() returns the configured group and role.

        Previously a second method named testGetAttributes, which replaced
        the test above so that only one of the two ever ran.
        """
        if not os.path.exists(self.serverKey):
            self.proxy.create()
            proxyPath = self.proxy.getProxyFilename()
            if self.dict['group'] and self.dict['role']:
                groupAndRole = self.proxy.getUserGroupAndRoleFromProxy(proxyPath)
                self.assertEqual(groupAndRole[0], self.dict['group'])
                self.assertEqual(groupAndRole[1], self.dict['role'])
'server_key': serverKey, 'server_cert': serverCert,} timeleftthreshold = 60 * 60 * 24 mypclient = SimpleMyProxy(defaultDelegation) userproxy = None userhash = sha1(kwargs['userdn']).hexdigest() if serverDN: try: userproxy = mypclient.logonRenewMyProxy(username=userhash, myproxyserver=myproxyserver, myproxyport=7512) except MyProxyException, me: # Unsure if this works in standalone mode... cherrypy.log(str(me)) cherrypy.log(str(serverKey)) cherrypy.log(str(serverCert)) invalidp = InvalidParameter("Impossible to retrieve proxy from %s for %s and hash %s" % (myproxyserver, kwargs['userdn'], userhash)) setattr(invalidp, 'trace', str(me)) raise invalidp else: if not re.match(RX_CERT, userproxy): raise InvalidParameter("Retrieved malformed proxy from %s for %s and hash %s" % (myproxyserver, kwargs['userdn'], userhash)) else: proxy = Proxy(defaultDelegation) userproxy = proxy.getProxyFilename() kwargs['userproxy'] = userproxy out = func(*args, **kwargs) return out return wrapped_func
class ProxyTest(unittest.TestCase):
    """
    Integration tests for the Proxy credential wrapper.

    The tests share one proxy: testAAACreateProxy creates it and relies on
    its name to run first, and tests that destroy or alter the proxy
    re-create it before returning.
    """

    def setUp(self):
        """
        Setup for unit tests
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s",
            datefmt="%m-%d %H:%M",
            filename="proxy_unittests.log",
            filemode="w",
        )
        logger_name = "ProxyTest"
        self.logger = logging.getLogger(logger_name)
        self.dict = {
            "logger": self.logger,
            "vo": "cms",
            "group": group,
            "role": role,
            "myProxySvr": myProxySvr,
            "proxyValidity": "192:00",
            "min_time_left": 36000,
            "uisource": uiPath,
        }

        self.proxyPath = None
        self.proxy = Proxy(self.dict)

    def tearDown(self):
        """
        _tearDown_

        Nothing to clean up: the proxy is deliberately kept between tests.
        """
        return

    def _vomsProxyInfo(self, option):
        """
        Run `voms-proxy-info <option>` and return its standard output with
        the last character (the trailing newline) removed, or None when the
        command exits with a non-zero status.
        """
        vomsProxyInfoCall = subprocess.Popen(
            ["voms-proxy-info", option],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        # communicate() drains both pipes while waiting; calling wait() first
        # can block forever if the child fills a pipe buffer.
        stdout, _ = vomsProxyInfoCall.communicate()
        if vomsProxyInfoCall.returncode != 0:
            return None
        return stdout[0:-1]

    def getUserIdentity(self):
        """
        _getUserIdentity_

        Retrieve the user's subject from the voms-proxy-info call.
        """
        return self._vomsProxyInfo("-identity")

    def getUserAttributes(self):
        """
        _getUserAttributes_

        Retrieve the user's attributes from the voms-proxy-info call.
        """
        return self._vomsProxyInfo("-fqan")

    @attr("integration")
    def testGetUserCertEnddate(self):
        """
        Test if getUserCertEnddate method returns the expected number of days.
        """
        daysleft = self.proxy.getUserCertEnddate()
        self.assertEqual(daysleft, 29)  # set this as the number of days left in .globus/usercert.pem

    @attr("integration")
    def testAAACreateProxy(self):
        """
        Test if create method creates correctly the proxy.
        This is sort of bad form to require that this test run first, but the alternative is
        entering a password for every single invocation
        """
        self.proxy.create()
        time.sleep(5)
        proxyPath = self.proxy.getProxyFilename()
        self.assertTrue(os.path.exists(proxyPath))

    @attr("integration")
    def testCheckProxyTimeLeft(self):
        """
        Test if getTimeLeft method returns correctly the proxy time left.
        """
        timeLeft = self.proxy.getTimeLeft()
        # floor division: true division would yield a float that never
        # equals the whole-hour count
        self.assertEqual(int(timeLeft) // 3600, 191)

    @attr("integration")
    def testRenewProxy(self):
        """
        Test if the renew method renews correctly the user proxy.
        """
        time.sleep(70)
        self.proxy.renew()
        time.sleep(10)
        timeLeft = self.proxy.getTimeLeft()
        self.assertEqual(int(timeLeft) // 3600, 191)

    @attr("integration")
    def testDestroyProxy(self):
        """
        Test the proxy destroy method.
        """
        self.proxy.destroy()
        self.proxyPath = self.proxy.getProxyFilename()
        self.assertFalse(os.path.exists(self.proxyPath))
        # Create the proxy after the destroy
        self.proxy.create()

    @attr("integration")
    def testGetSubject(self):
        """
        _testGetSubject_

        Verify that the getSubject() method works correctly.
        """
        subject = self.proxy.getSubject()

        self.assertEqual(subject, self.getUserIdentity(), "Error: Wrong subject.")
        return

    @attr("integration")
    def testGetUserName(self):
        """
        _testGetUserName_

        Verify that the getUserName() method correctly determines the user's
        name.
        """
        user = self.proxy.getUserName()
        # last '/'-separated component of the subject, minus its 3-char prefix
        identity = self.getUserIdentity().split("/")[-1][3:]

        self.assertEqual(user, identity, "Error: User name is wrong: |%s|\n|%s|" % (user, identity))
        return

    @attr("integration")
    def testCheckAttribute(self):
        """
        Test if the checkAttribute method checks correctly the attributes validity.
        """
        valid = self.proxy.checkAttribute()
        self.assertTrue(valid)

    @attr("integration")
    def testCheckTimeLeft(self):
        """
        Test if the check method checks correctly the proxy validity.
        """
        valid = self.proxy.check(self.proxyPath)
        self.assertTrue(valid)

    @attr("integration")
    def testVomsRenewal(self):
        """
        Test if vomsExtensionRenewal method renews correctly the voms-proxy.
        """
        proxyPath = self.proxy.getProxyFilename()
        time.sleep(70)
        attribute = self.proxy.prepareAttForVomsRenewal(self.proxy.getAttributeFromProxy(proxyPath))
        self.proxy.vomsExtensionRenewal(proxyPath, attribute)
        vomsTimeLeft = self.proxy.getVomsLife(proxyPath)
        self.assertEqual(int(vomsTimeLeft) // 3600, 191)

    @attr("integration")
    def testElevateAttribute(self):
        """
        Test if the vomsExtensionRenewal method elevate last attributes given.
        """
        proxyPath = self.proxy.getProxyFilename()
        attribute = self.proxy.prepareAttForVomsRenewal("/cms/Role=NULL/Capability=NULL")
        self.proxy.vomsExtensionRenewal(proxyPath, attribute)
        self.assertEqual(self.proxy.getAttributeFromProxy(proxyPath), "/cms/Role=NULL/Capability=NULL")
        # Restore the original configuration of the proxy
        self.proxy.create()

    @attr("integration")
    def testUserGroupInProxy(self):
        """
        Test if getUserAttributes method returns correctly the user group.
        """
        self.assertTrue(self.proxy.group, "No group set. Testing incomplete.")
        self.assertEqual(self.proxy.group, self.getUserAttributes().split("\n")[0].split("/")[2])

    @attr("integration")
    def testUserRoleInProxy(self):
        """
        Test if getUserAttributes method returns correctly the user role.
        """
        self.assertEqual(self.proxy.role, self.getUserAttributes().split("\n")[0].split("/")[3].split("=")[1])

    @attr("integration")
    def testGetAttributes(self):
        """
        Test getAttributeFromProxy method.
        """
        self.assertTrue(self.proxy.group, "No group set. Testing incomplete.")
        # an unset role shows up as "NULL" in the proxy attribute
        role = self.dict["role"] or "NULL"
        proxyPath = self.proxy.getProxyFilename()
        self.assertEqual(self.proxy.getAttributeFromProxy(proxyPath).split("/")[2], self.dict["group"])
        self.assertEqual(self.proxy.getAttributeFromProxy(proxyPath).split("/")[3].split("=")[1], role)

    @attr("integration")
    def testGetUserGroupAndRole(self):
        """
        Test GetUserGroupAndRoleFromProxy method.
        """
        role = self.dict["role"] or "NULL"
        proxyPath = self.proxy.getProxyFilename()
        # only meaningful when both a group and a role are configured
        if self.dict["group"] and self.dict["role"]:
            self.assertEqual(self.proxy.getUserGroupAndRoleFromProxy(proxyPath)[0], self.dict["group"])
            self.assertEqual(self.proxy.getUserGroupAndRoleFromProxy(proxyPath)[1], role)
def createNewMyProxy(self, timeleftthreshold=0, nokey=False):
    """
    Handles the MyProxy creation

    Let the following variables be

    timeleftthreshold: the proxy in myproxy should be delegated for at least this time (14 days)
    myproxytimeleft: current validity of your proxy in myproxy
    usercertDaysLeft: the number of days left before your user certificate expire
    myproxyDesiredValidity: delegate the proxy in myproxy for that time (30 days)

    If we need to renew the proxy in myproxy because its attributes have changed or because it is valid for
    less time than timeleftthreshold then we do it.

    Before doing that, we check when the user certificate is expiring. If it's within the timeleftthreshold
    (myproxytimeleft < timeleftthreshold) we delegate the proxy just for the time we need (checking first if
    we did not already do it since at some point usercertDaysLeft ~= myproxytimeleft and we don't need to
    delegate it at every command even though myproxytimeleft < timeleftthreshold).

    Note that a warning message is printed at every command if usercertDaysLeft < timeleftthreshold

    :param timeleftthreshold: minimum number of seconds the credential in myproxy must still be valid for
    :param nokey: forwarded unchanged to Proxy.getMyProxyTimeLeft and Proxy.delegate
    :return: None
    :raises ProxyCreationException: if the user certificate has expired (or expires today), or if the
        delegation fails or leaves no valid credential in myproxy
    """
    secondsPerDay = 60 * 60 * 24

    myproxy = Proxy(self.defaultDelegation)
    myproxy.userDN = myproxy.getSubjectFromCert(self.certLocation)

    self.logger.debug("Getting myproxy life time left for %s", self.defaultDelegation["myProxySvr"])
    # return an integer that indicates the number of seconds to the expiration of the proxy in myproxy
    myproxytimeleft = myproxy.getMyProxyTimeLeft(serverRenewer=True, nokey=nokey)
    self.logger.debug("Myproxy is valid: %i", myproxytimeleft)

    # list on the REST and on myproxy are different
    trustRetrListChanged = myproxy.trustedRetrievers != self.defaultDelegation['serverDN']
    if myproxytimeleft < timeleftthreshold or self.proxyChanged or trustRetrListChanged:
        # checking the enddate of the user certificate
        usercertDaysLeft = myproxy.getUserCertEnddate()
        # "<= 0" rather than "== 0": a negative value must not fall through and be turned into a
        # negative delegation validity further down
        if usercertDaysLeft <= 0:
            msg = "%sYOUR USER CERTIFICATE IS EXPIRED (OR WILL EXPIRE TODAY). YOU CANNOT USE THE CRAB3 CLIENT. PLEASE REQUEST A NEW CERTIFICATE HERE https://gridca.cern.ch/gridca/ AND SEE https://ca.cern.ch/ca/Help/?kbid=024010%s"\
                  % (colors.RED, colors.NORMAL)
            raise ProxyCreationException(msg)

        # if the certificate is going to expire print a warning. This is going to be printed at every
        # command if the myproxytimeleft is inferior to the timeleftthreshold
        if usercertDaysLeft < self.myproxyDesiredValidity:
            self.logger.info("%sYour user certificate is going to expire in %s days. https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookStartingGrid#ObtainingCert %s",
                             colors.RED, usercertDaysLeft, colors.NORMAL)
            # check if usercertDaysLeft ~= myproxytimeleft which means we already delegated the proxy
            # for as long as we could
            if abs(usercertDaysLeft * secondsPerDay - myproxytimeleft) < secondsPerDay and not trustRetrListChanged:
                # less than one day between usercertDaysLeft and myproxytimeleft
                return
            # adjust the myproxy delegation time accordingly to the user cert validity
            self.logger.info("%sDelegating your proxy for %s days instead of %s %s",
                             colors.RED, usercertDaysLeft, self.myproxyDesiredValidity, colors.NORMAL)
            myproxy.myproxyValidity = "%i:00" % (usercertDaysLeft * 24)

        # creating the proxy
        self.logger.debug("Delegating a myproxy for %s hours", myproxy.myproxyValidity)
        try:
            myproxy.delegate(serverRenewer=True, nokey=nokey)
            myproxytimeleft = myproxy.getMyProxyTimeLeft(serverRenewer=True, nokey=nokey)
            if myproxytimeleft <= 0:
                raise ProxyCreationException("It seems your proxy has not been delegated to myproxy. Please check the logfile for the exact error "
                                             "(it might simply you typed a wrong password)")
            else:
                self.logger.debug("My-proxy delegated.")
        except Exception as ex:
            # Deliberately broad: every failure, including the exception raised just above, is
            # reported to the caller as a ProxyCreationException with a uniform prefix.
            msg = ex._message if hasattr(ex, '_message') else str(ex)
            raise ProxyCreationException("Problems delegating My-proxy. %s" % msg)
class AgentStatusPoller(BaseWorkerThread): """ Gether the summary data for request (workflow) from local queue, local job couchdb, wmbs/boss air and populate summary db for monitoring """ def __init__(self, config): """ initialize properties specified from config """ BaseWorkerThread.__init__(self) # set the workqueue service for REST call self.config = config # need to get campaign, user, owner info self.agentInfo = initAgentInfo(self.config) self.summaryLevel = config.AnalyticsDataCollector.summaryLevel proxyArgs = {'logger': logging.getLogger(), 'cleanEnvironment': True} self.proxy = Proxy(proxyArgs) self.proxyFile = self.proxy.getProxyFilename() # X509_USER_PROXY self.userCertFile = self.proxy.getUserCertFilename() # X509_USER_CERT # credential lifetime warning/error thresholds, in days self.credThresholds = { 'proxy': { 'error': 3, 'warning': 5 }, 'certificate': { 'error': 10, 'warning': 20 } } # Monitoring setup self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None) self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None) self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False) self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None) self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ", [('cms-mb.cern.ch', 61313)]) # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here if hasattr(self.config, "Tier0Feeder"): self.isT0agent = True self.producer = "tier0wmagent" else: self.isT0agent = False self.producer = "wmagent" localWQUrl = config.AnalyticsDataCollector.localQueueURL self.workqueueDS = WorkQueueDS(localWQUrl) def setUpCouchDBReplication(self): self.replicatorDocs = [] # set up common replication code wmstatsSource = self.config.JobStateMachine.jobSummaryDBName wmstatsTarget = self.config.General.centralWMStatsURL self.replicatorDocs.append({ 'source': wmstatsSource, 'target': wmstatsTarget, 'filter': "WMStatsAgent/repfilter" }) if self.isT0agent: t0Source = 
self.config.Tier0Feeder.requestDBName t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL self.replicatorDocs.append({ 'source': t0Source, 'target': t0Target, 'filter': "T0Request/repfilter" }) else: # set up workqueue replication wqfilter = 'WorkQueue/queueFilter' parentQURL = self.config.WorkQueueManager.queueParams[ "ParentQueueCouchUrl"] childURL = self.config.WorkQueueManager.queueParams["QueueURL"] query_params = { 'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url'] } localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL self.replicatorDocs.append({ 'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL, 'filter': wqfilter, 'query_params': query_params }) self.replicatorDocs.append({ 'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL, 'filter': wqfilter, 'query_params': query_params }) # delete old replicator docs before setting up self.localCouchMonitor.deleteReplicatorDocs() for rp in self.replicatorDocs: self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'], filter=rp['filter'], query_params=rp.get( 'query_params', False), continuous=True) # First cicle need to be skipped since document is not updated that fast self.skipReplicationCheck = True def setup(self, parameters): """ set db connection(couchdb, wmbs) to prepare to gather information """ # interface to WMBS/BossAir db myThread = threading.currentThread() # set wmagent db data self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger) self.centralWMStatsCouchDB = WMStatsWriter( self.config.General.centralWMStatsURL) self.localCouchMonitor = CouchMonitor( self.config.JobStateMachine.couchurl) self.setUpCouchDBReplication() @timeFunction def algorithm(self, parameters): """ get information from wmbs, workqueue and local couch """ try: agentInfo = self.collectAgentInfo() self.checkCredLifetime(agentInfo, "proxy") self.checkCredLifetime(agentInfo, "certificate") timeSpent, wmbsInfo, _ 
= self.collectWMBSInfo() wmbsInfo['total_query_time'] = int(timeSpent) agentInfo["WMBS_INFO"] = wmbsInfo logging.info("WMBS data collected in: %d secs", timeSpent) if not self.isT0agent: timeSpent, localWQInfo, _ = self.collectWorkQueueInfo() localWQInfo['total_query_time'] = int(timeSpent) agentInfo["LocalWQ_INFO"] = localWQInfo logging.info("Local WorkQueue data collected in: %d secs", timeSpent) self.uploadAgentInfoToCentralWMStats(agentInfo) self.buildMonITDocs(agentInfo) except Exception as ex: logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex)) @timeFunction def collectWorkQueueInfo(self): """ Collect information from local workqueue database :return: """ results = {} wqStates = ['Available', 'Acquired'] results['workByStatus'] = self.workqueueDS.getJobsByStatus() results[ 'workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority( ) elements = self.workqueueDS.getElementsByStatus(wqStates) uniSites, posSites = getGlobalSiteStatusSummary(elements, status=wqStates, dataLocality=True) results['uniqueJobsPerSite'] = uniSites results['possibleJobsPerSite'] = posSites return results def collectCouchDBInfo(self): couchInfo = { 'name': 'CouchServer', 'status': 'ok', 'error_message': "" } if self.skipReplicationCheck: # skipping the check this round set if False so it can be checked next round. self.skipReplicationCheck = False return couchInfo for rp in self.replicatorDocs: cInfo = self.localCouchMonitor.checkCouchServerStatus( rp['source'], rp['target'], checkUpdateSeq=False) if cInfo['status'] != 'ok': couchInfo['status'] = 'error' couchInfo['error_message'] = cInfo['error_message'] return couchInfo def collectAgentInfo(self): """ Monitors the general health of the agent, as: 1. status of the agent processes 2. status of the agent threads based on the database info 3. couchdb active tasks and its replications 4. check the disk usage 5. 
check the number of couch processes :return: a dict with all the info collected """ logging.info("Getting agent info ...") agentInfo = self.wmagentDB.getComponentStatus(self.config) agentInfo.update(self.agentInfo) agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True) if isDrainMode(self.config): logging.info("Agent is in DrainMode") agentInfo['drain_mode'] = True agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo() else: agentInfo['drain_mode'] = False couchInfo = self.collectCouchDBInfo() if couchInfo['status'] != 'ok': agentInfo['down_components'].append(couchInfo['name']) agentInfo['status'] = couchInfo['status'] agentInfo['down_component_detail'].append(couchInfo) # Couch process warning couchProc = numberCouchProcess() logging.info("CouchDB is running with %d processes", couchProc) couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold if couchProc >= couchProcessThreshold: agentInfo['couch_process_warning'] = couchProc else: agentInfo['couch_process_warning'] = 0 # Change status if there is data_error, couch process maxed out or disk full problems. if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']): agentInfo['status'] = "warning" if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning': if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get( 'couch_process_warning', 0): agentInfo['status'] = "error" logging.info("List of agent components down: %s", agentInfo['down_components']) return agentInfo def uploadAgentInfoToCentralWMStats(self, agentInfo): """ Add some required fields to the document before it can get uploaded to WMStats. 
:param agentInfo: dict with agent stats to be posted to couchdb """ agentInfo['_id'] = agentInfo["agent_url"] agentInfo['timestamp'] = int(time.time()) agentInfo['type'] = "agent_info" # directly upload to the remote to prevent data conflict when agent is cleaned up and redeployed try: self.centralWMStatsCouchDB.updateAgentInfo( agentInfo, propertiesToKeep=["data_last_update", "data_error"]) except Exception as e: logging.error( "Failed to upload agent statistics to WMStats. Error: %s", str(e)) @timeFunction def collectWMBSInfo(self): """ Fetches WMBS job information. In addition to WMBS, also collects RunJob info from BossAir :return: dict with the number of jobs in each status """ logging.info("Getting wmbs job info ...") results = {} # first retrieve the site thresholds results['thresholds'] = self.wmagentDB.getJobSlotInfo() logging.debug("Running and pending site thresholds: %s", results['thresholds']) # now fetch the amount of jobs in each state and the amount of created # jobs grouped by task results.update(self.wmagentDB.getAgentMonitoring()) logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState']) logging.debug( "Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount']) logging.debug( "Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount']) logging.debug( "Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus']) logging.debug( "Total number of complete jobs in BossAir sorted by status: %s", results['completeRunJobByStatus']) logging.debug( "Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ']) logging.debug( "List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio']) return results def checkCredLifetime(self, agInfo, credType): """ Check the credential lifetime. 
Usually X509_USER_PROXY or X509_USER_CERT and raise either a warning or an error if the proxy validity is about to expire. :param agInfo: dictionary with plenty of agent monitoring information in place. :param credType: credential type, can be: "proxy" or "certificate" :return: same dictionary object plus additional keys/values if needed. """ if credType == "proxy": credFile = self.proxyFile secsLeft = self.proxy.getTimeLeft(proxy=credFile) elif credType == "certificate": credFile = self.userCertFile secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True) else: logging.error( "Unknown credential type. Available options are: [proxy, certificate]" ) return logging.debug("%s '%s' lifetime is %d seconds", credType, credFile, secsLeft) daysLeft = secsLeft / (60 * 60 * 24) if daysLeft <= self.credThresholds[credType]['error']: credWarning = True agInfo['status'] = "error" elif daysLeft <= self.credThresholds[credType]['warning']: credWarning = True if agInfo['status'] == "ok": agInfo['status'] = "warning" else: credWarning = False if credWarning: warnMsg = "Agent %s '%s' must be renewed ASAP. " % (credType, credFile) warnMsg += "Its time left is: %.2f hours;" % (secsLeft / 3600.) agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg logging.warning(warnMsg) return def buildMonITDocs(self, dataStats): """ Convert agent statistics into MonIT-friendly documents to be posted to AMQ/ES. 
It creates documents of the following kinds: * priority information * site information * work information * WMBS information * agent information * agent health information * agent summary information Note that the internal methods are popping some metrics out of dataStats """ if not self.postToAMQ: return logging.info("Preparing documents to be posted to AMQ/MonIT..") allDocs = self._buildMonITPrioDocs(dataStats) allDocs.extend(self._buildMonITSitesDocs(dataStats)) allDocs.extend(self._buildMonITWorkDocs(dataStats)) allDocs.extend(self._buildMonITWMBSDocs(dataStats)) allDocs.extend(self._buildMonITAgentDocs(dataStats)) allDocs.extend(self._buildMonITHealthDocs(dataStats)) allDocs.extend(self._buildMonITSummaryDocs(dataStats)) # and finally post them all to AMQ logging.info("Found %d documents to post to AMQ", len(allDocs)) self.uploadToAMQ(allDocs, dataStats['agent_url'], dataStats['timestamp']) def _buildMonITPrioDocs(self, dataStats): """ Uses the `sitePendCountByPrio` metric in order to build documents reporting the site name, job priority and amount of jobs within that priority. Sites whose only priority key is 0 are skipped. NOTE(review): the fallback used when the metric is missing is a list, while the loop iterates key/value pairs - confirm whether it should be a dict. :param dataStats: dictionary with metrics previously posted to WMStats :return: list of dictionaries with the wma_prio_info MonIT docs """ docType = "wma_prio_info" prioDocs = [] sitePendCountByPrio = dataStats['WMBS_INFO'].pop( 'sitePendCountByPrio', []) for site, item in viewitems(sitePendCountByPrio): # it seems sites with no jobs are also always here as "Sitename": {0: 0} if list(item) == [0]: continue for prio, jobs in viewitems(item): prioDoc = {} prioDoc['site_name'] = site prioDoc['type'] = docType prioDoc['priority'] = prio prioDoc['job_count'] = jobs prioDocs.append(prioDoc) return prioDocs def _buildMonITSitesDocs(self, dataStats): """ Uses the site thresholds and job information for each site in order to build a `site_info` document type for MonIT. The `state` key is popped out of each site's thresholds and reported as a separate field. 
:param dataStats: dictionary with metrics previously posted to WMStats :return: list of dictionaries with the wma_site_info MonIT docs """ docType = "wma_site_info" siteDocs = [] thresholds = dataStats['WMBS_INFO'].pop('thresholds', {}) thresholdsGQ2LQ = dataStats['WMBS_INFO'].pop('thresholdsGQ2LQ', {}) if self.isT0agent: possibleJobsPerSite = {} uniqueJobsPerSite = {} else: possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop( 'possibleJobsPerSite', {}) uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop( 'uniqueJobsPerSite', {}) for site in sorted(thresholds): siteDoc = {} siteDoc['site_name'] = site siteDoc['type'] = docType siteDoc['thresholds'] = thresholds[site] siteDoc['state'] = siteDoc['thresholds'].pop('state', 'Unknown') siteDoc['thresholdsGQ2LQ'] = thresholdsGQ2LQ.get(site, 0) for status in possibleJobsPerSite: # make sure these keys are always present in the documents jobKey = "possible_%s_jobs" % status.lower() elemKey = "num_%s_elem" % status.lower() uniJobKey = "unique_%s_jobs" % status.lower() siteDoc[jobKey], siteDoc[elemKey], siteDoc[uniJobKey] = 0, 0, 0 if site in possibleJobsPerSite[status]: siteDoc[jobKey] = possibleJobsPerSite[status][site][ 'sum_jobs'] siteDoc[elemKey] = possibleJobsPerSite[status][site][ 'num_elem'] if site in uniqueJobsPerSite[status]: siteDoc[uniJobKey] = uniqueJobsPerSite[status][site][ 'sum_jobs'] siteDocs.append(siteDoc) return siteDocs def _buildMonITWorkDocs(self, dataStats): """ Uses the local workqueue information ordered by WQE status and builds statistics for the workload in terms of workqueue elements and top level jobs. An empty list is returned for T0 agents. 
:param dataStats: dictionary with metrics previously posted to WMStats :return: list of dictionaries with the wma_work_info MonIT docs """ workDocs = [] if self.isT0agent: return workDocs docType = "wma_work_info" workByStatus = dataStats['LocalWQ_INFO'].pop('workByStatus', {}) for status, info in viewitems(workByStatus): workDoc = {} workDoc['type'] = docType workDoc['status'] = status workDoc['num_elem'] = info.get('num_elem', 0) workDoc['sum_jobs'] = info.get('sum_jobs', 0) workDocs.append(workDoc) return workDocs def _buildMonITWMBSDocs(self, dataStats): """ Using the WMBS data, builds documents to show the amount of work in 'created' and 'executing' WMBS status. It also builds a document for every single wmbs_status in the database. NOTE(review): every job type found in `wmbsCreatedTypeCount` is looked up in `wmbsExecutingTypeCount` without a default; presumably both always carry the same keys - confirm, otherwise this raises KeyError. :param dataStats: dictionary with metrics previously posted to WMStats :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs """ docType = "wma_wmbs_info" wmbsDocs = [] wmbsCreatedTypeCount = dataStats['WMBS_INFO'].pop( 'wmbsCreatedTypeCount', {}) wmbsExecutingTypeCount = dataStats['WMBS_INFO'].pop( 'wmbsExecutingTypeCount', {}) for jobType in wmbsCreatedTypeCount: wmbsDoc = {} wmbsDoc['type'] = docType wmbsDoc['job_type'] = jobType wmbsDoc['created_jobs'] = wmbsCreatedTypeCount[jobType] wmbsDoc['executing_jobs'] = wmbsExecutingTypeCount[jobType] wmbsDocs.append(wmbsDoc) docType = "wma_wmbs_state_info" wmbsCountByState = dataStats['WMBS_INFO'].pop('wmbsCountByState', {}) for wmbsStatus in wmbsCountByState: wmbsDoc = {} wmbsDoc['type'] = docType wmbsDoc['wmbs_status'] = wmbsStatus wmbsDoc['num_jobs'] = wmbsCountByState[wmbsStatus] wmbsDocs.append(wmbsDoc) return wmbsDocs def _buildMonITAgentDocs(self, dataStats): """ Uses the BossAir and WMBS table information in order to build a view of amount of jobs in different statuses. 
NOTE(review): every status found in `activeRunJobByStatus` is looked up in `completeRunJobByStatus` without a default; presumably both always carry the same keys - confirm, otherwise this raises KeyError. :param dataStats: dictionary with metrics previously posted to WMStats :return: list of dictionaries with the wma_agent_info MonIT docs """ docType = "wma_agent_info" agentDocs = [] activeRunJobByStatus = dataStats['WMBS_INFO'].pop( 'activeRunJobByStatus', {}) completeRunJobByStatus = dataStats['WMBS_INFO'].pop( 'completeRunJobByStatus', {}) for schedStatus in activeRunJobByStatus: agentDoc = {} agentDoc['type'] = docType agentDoc['schedd_status'] = schedStatus agentDoc['active_jobs'] = activeRunJobByStatus[schedStatus] agentDoc['completed_jobs'] = completeRunJobByStatus[schedStatus] agentDocs.append(agentDoc) return agentDocs def _buildMonITHealthDocs(self, dataStats): """ Creates documents with specific agent information, status of each component and worker thread (similar to what is shown in wmstats) and also some very basic performance numbers. :param dataStats: dictionary with metrics previously posted to WMStats :return: list of dictionaries with the wma_health_info MonIT docs """ docType = "wma_health_info" healthDocs = [] workersStatus = dataStats.pop('workers', {}) for worker in workersStatus: healthDoc = {} healthDoc['type'] = docType healthDoc['worker_name'] = worker['name'] healthDoc['worker_state'] = worker['state'] healthDoc['worker_poll'] = worker['poll_interval'] healthDoc['worker_last_hb'] = worker['last_updated'] healthDoc['worker_cycle_time'] = worker['cycle_time'] healthDocs.append(healthDoc) return healthDocs def _buildMonITSummaryDocs(self, dataStats): """ Creates a document with the very basic agent info used in the wmstats monitoring tab. 
:param dataStats: dictionary with metrics previously posted to WMStats :return: list of dictionaries with the wma_summary_info MonIT docs """ docType = "wma_summary_info" summaryDocs = [] summaryDoc = {} summaryDoc['type'] = docType summaryDoc['agent_team'] = dataStats['agent_team'] summaryDoc['agent_version'] = dataStats['agent_version'] summaryDoc['agent_status'] = dataStats['status'] if not self.isT0agent: summaryDoc['wq_query_time'] = dataStats['LocalWQ_INFO'][ 'total_query_time'] summaryDoc['wmbs_query_time'] = dataStats['WMBS_INFO'][ 'total_query_time'] summaryDoc['drain_mode'] = dataStats['drain_mode'] summaryDoc['down_components'] = dataStats['down_components'] summaryDocs.append(summaryDoc) return summaryDocs def uploadToAMQ(self, docs, agentUrl, timeS): """ _uploadToAMQ_ Sends data to AMQ, which ends up in the MonIT infrastructure. Failures while talking to StompAMQ are logged and not re-raised. :param docs: list of documents/dicts to be posted :param agentUrl: value stored under the `agent_url` key of every document :param timeS: timestamp passed as `ts` when building each notification """ if not docs: logging.info("There are no documents to send to AMQ") return # add mandatory information for every single document for doc in docs: doc['agent_url'] = agentUrl docType = "cms_%s_info" % self.producer notifications = [] logging.debug("Sending the following data to AMQ %s", pformat(docs)) try: stompSvc = StompAMQ(username=self.userAMQ, password=self.passAMQ, producer=self.producer, topic=self.topicAMQ, validation_schema=None, host_and_ports=self.hostPortAMQ, logger=logging) for doc in docs: singleNotif, _, _ = stompSvc.make_notification( payload=doc, docType=docType, ts=timeS, dataSubfield="payload") notifications.append(singleNotif) failures = stompSvc.send(notifications) msg = "%i out of %i documents successfully sent to AMQ" % ( len(notifications) - len(failures), len(notifications)) logging.info(msg) except Exception as ex: logging.exception("Failed to send data to StompAMQ. Error %s", str(ex)) return
def __init__(self, config):
    """
    Set up the plugin from the agent configuration.

    Creates the site-info DAO from the current thread's logger and database
    interface, locates the Unpacker.py script, reads the submission settings
    from `config.Agent`, `config.JobSubmitter` and `config.BossAir`, builds the
    condor requirements string and resolves the x509 proxy file name.

    :param config: agent configuration object; `config.General.workDir` is
        only read when `config.Agent.isDocker` is set
    """
    BasePlugin.__init__(self, config)

    self.locationDict = {}

    # the logger and database interface are attributes of the current thread
    myThread = threading.current_thread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    self.locationAction = daoFactory(classname="Locations.GetSiteInfo")

    self.packageDir = None

    # if agent is running in a container, Unpacker.py must come from a directory
    # on the host so the condor schedd can see it
    # config.General.workDir should always be bind mounted to the container
    if getattr(config.Agent, "isDocker", False):
        unpackerPath = os.path.join(config.General.workDir,
                                    "Docker/WMRuntime/Unpacker.py")
    else:
        unpackerPath = os.path.join(getWMBASE(),
                                    'src/python/WMCore/WMRuntime/Unpacker.py')

    if os.path.exists(unpackerPath):
        self.unpacker = unpackerPath
    else:
        # fall back to the layout without the src/python prefix
        self.unpacker = os.path.join(getWMBASE(), 'WMCore/WMRuntime/Unpacker.py')

    self.agent = getattr(config.Agent, 'agentName', 'WMAgent')
    self.sandbox = None

    self.scriptFile = config.JobSubmitter.submitScript

    self.defaultTaskPriority = getattr(config.BossAir, 'defaultTaskPriority', 0)
    self.maxTaskPriority = getattr(config.BossAir, 'maxTaskPriority', 1e7)
    self.jobsPerSubmit = getattr(config.JobSubmitter, 'jobsPerSubmit', 200)
    self.extraMem = getattr(config.JobSubmitter, 'extraMemoryPerCore', 500)

    # Required for global pool accounting
    self.acctGroup = getattr(config.BossAir, 'acctGroup', "production")
    self.acctGroupUser = getattr(config.BossAir, 'acctGroupUser', "cmsdataops")

    # Build a requirement string.  All CMS resources match DESIRED_Sites on the START
    # expression side; however, there are currently some resources (T2_CH_CERN_HLT)
    # that are missing the REQUIRED_OS logic.  Hence, we duplicate it here.
    # TODO(bbockelm): Remove reqStr once HLT has upgraded.
    defaultReqStr = ('((REQUIRED_OS=?="any") || '
                     '(GLIDEIN_REQUIRED_OS =?= "any") || '
                     'stringListMember(GLIDEIN_REQUIRED_OS, REQUIRED_OS)) && '
                     '(AuthenticatedIdentity =!= "*****@*****.**")')
    # an explicit requirements string in the configuration overrides the default
    self.reqStr = getattr(config.BossAir, 'condorRequirementsString', defaultReqStr)

    # x509 proxy handling; the proxy subject and FQAN are added by the condor client
    proxy = Proxy({'logger': myThread.logger})
    self.x509userproxy = proxy.getProxyFilename()
def createNewMyProxy(self, timeleftthreshold=0, nokey=False):
    """
    Check the credential stored in myproxy and delegate a new one when required.

    A new delegation is attempted when the time left of the credential in myproxy
    is below `timeleftthreshold`, when `self.proxyChanged` is set, or when the
    trusted retrievers known to myproxy differ from
    `self.defaultDelegation['serverDN']`.

    When the user certificate expires in fewer days than
    `self.myproxyDesiredValidity`, a warning is logged and the delegation is
    shortened to the certificate lifetime. In that case nothing is delegated if
    the credential in myproxy already lasts as long as the certificate (less than
    one day of difference) and the trusted retrievers did not change.

    :param timeleftthreshold: minimum validity wanted for the credential in
        myproxy; compared against the myproxy time left, which is in seconds
    :param nokey: forwarded to the Proxy time-left and delegate calls
    :raises ProxyCreationException: if the myproxy time left cannot be computed,
        if the user certificate has zero days left, or if the delegation fails
    """
    myproxy = Proxy(self.defaultDelegation)
    myproxy.userDN = myproxy.getSubjectFromCert(self.certLocation)

    secondsLeft = 0
    self.logger.debug("Getting myproxy life time left for %s" % self.defaultDelegation["myProxySvr"])
    # number of seconds to the expiration of the proxy in myproxy; WMCore may
    # fail on problems with the proxy itself (one such case was #4532)
    try:
        secondsLeft = myproxy.getMyProxyTimeLeft(serverRenewer=True, nokey=nokey)
    except Exception as ex:
        logging.exception("Problems calculating proxy lifetime, logging stack trace and raising ProxyCreationException")
        # WMException may contain the _message attribute. Otherwise, take the exception as a string.
        if hasattr(ex, "_message"):
            msg = ex._message
        else:
            msg = str(ex)
        raise ProxyCreationException(
            ("Problems calculating the time left until the expiration of the proxy."
             " Please reset your environment or contact [email protected] if the problem persists.\n%s") % msg)
    self.logger.debug("Myproxy is valid: %i" % secondsLeft)

    # list on the REST and on myproxy are different
    retrieversChanged = myproxy.trustedRetrievers != self.defaultDelegation['serverDN']

    needsRenewal = secondsLeft < timeleftthreshold or self.proxyChanged or retrieversChanged
    if not needsRenewal:
        return

    # checking the enddate of the user certificate
    certDaysLeft = myproxy.getUserCertEnddate()
    if certDaysLeft == 0:
        expiredMsg = ("%sYOUR USER CERTIFICATE IS EXPIRED (OR WILL EXPIRE TODAY)."
                      " YOU CANNOT USE THE CRAB3 CLIENT."
                      " PLEASE REQUEST A NEW CERTIFICATE HERE https://gridca.cern.ch/gridca/"
                      " AND SEE https://ca.cern.ch/ca/Help/?kbid=024010%s") % (colors.RED, colors.NORMAL)
        raise ProxyCreationException(expiredMsg)

    # the certificate is about to expire: warn (this happens at every command while
    # the myproxy time left is below the threshold) and shorten the delegation
    if certDaysLeft < self.myproxyDesiredValidity:
        warning = ("%sYour user certificate is going to expire in %s days."
                   " https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookStartingGrid#ObtainingCert %s")
        self.logger.info(warning % (colors.RED, certDaysLeft, colors.NORMAL))
        # certDaysLeft ~= secondsLeft (less than one day apart) means the proxy
        # was already delegated for as long as it could be
        alreadyMaxed = abs(certDaysLeft*60*60*24 - secondsLeft) < 60*60*24
        if alreadyMaxed and not retrieversChanged:
            return
        # adjust the myproxy delegation time according to the user cert validity
        self.logger.info("%sDelegating your proxy for %s days instead of %s %s"
                         % (colors.RED, certDaysLeft, self.myproxyDesiredValidity, colors.NORMAL))
        myproxy.myproxyValidity = "%i:00" % (certDaysLeft*24)

    # creating the proxy
    self.logger.debug("Delegating a myproxy for %s hours" % myproxy.myproxyValidity)
    try:
        myproxy.delegate(serverRenewer=True, nokey=nokey)
        secondsLeft = myproxy.getMyProxyTimeLeft(serverRenewer=True, nokey=nokey)
        if secondsLeft <= 0:
            raise ProxyCreationException(
                "It seems your proxy has not been delegated to myproxy. Please check the logfile for the exact error "
                + "(it might simply you typed a wrong password)")
        self.logger.debug("My-proxy delegated.")
    except Exception as ex:
        if hasattr(ex, '_message'):
            msg = ex._message
        else:
            msg = str(ex)
        raise ProxyCreationException("Problems delegating My-proxy. %s" % msg)
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config

        :param config: agent configuration object
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        localWQUrl = config.AnalyticsDataCollector.localQueueURL
        self.workqueueDS = WorkQueueDS(localWQUrl)

        # database/couch handles are created in setup(); declared here so that
        # every instance attribute exists right after construction
        self.wmagentDB = None
        self.centralWMStatsCouchDB = None
        self.localCouchMonitor = None
        # replication bookkeeping, filled in by setUpCouchDBReplication()
        self.replicatorDocs = []
        self.skipReplicationCheck = True

    def setUpCouchDBReplication(self):
        """
        Build the list of replication definitions, delete the old replicator
        documents and start a continuous replication for each definition.
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL

        self.replicatorDocs.append({'source': wmstatsSource,
                                    'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source,
                                        'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            # sanitized once and reused for both the filter parameters and the source
            parentQSanitized = sanitizeURL(parentQURL)['url']
            query_params = {'childUrl': childURL, 'parentUrl': parentQSanitized}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': parentQSanitized,
                                        'target': localQInboxURL,
                                        'filter': wqfilter,
                                        'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                        'target': parentQURL,
                                        'filter': wqfilter,
                                        'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True)
        # First cycle needs to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information

        :param parameters: not used by this method
        """
        # interface to WMBS/BossAir db
        myThread = threading.current_thread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch, upload it to the
        central WMStats and dump it to the local json file.
        Any exception is logged and swallowed so that the next cycle can retry.

        :param parameters: not used by this method
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save locally json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)

        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dictionary with the work grouped by status, by status and
            priority, and the unique/possible jobs per site
        """
        results = {}

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(['Available', 'Acquired'])
        uniSites, posSites = getGlobalSiteStatusSummary(elements, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        """
        Check the status of every replication that was set up.

        The check is skipped (and reported as 'ok') on the first call after the
        replications have been set up.

        :return: dictionary with the `name`, `status` and `error_message` keys
        """
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round; set to False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'],
                                                                  rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Convert the agent information into couch documents and update them in
        the central WMStats database.

        :param agentInfo: dictionary with the agent monitoring information
        :param uploadTime: timestamp passed on to the document conversion
        """
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s",
                      results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s",
                      results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s",
                      results['wmbsExecutingTypeCount'])

        logging.debug("Total number of active jobs in BossAir sorted by status: %s",
                      results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s",
                      results['completeRunJobByStatus'])

        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s",
                      results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s",
                      results['sitePendCountByPrio'])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the lifetime of the agent proxy file and flag the agent status
        when the proxy is about to expire:
          * 3 days or less left: status is set to "error"
          * 5 days or less left: status "ok" is downgraded to "warning"
        In both cases a `proxy_warning` message is added as well.

        :param agInfo: dictionary with plenty of agent monitoring information in place;
            it is updated in place and nothing is returned.
        """
        secsPerDay = 86400
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile, secsLeft)

        if secsLeft <= secsPerDay * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= secsPerDay * 5:  # 5 days
            proxyWarning = True
            # never upgrade an existing error status to a warning
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg
def createNewMyProxy2(self, timeleftthreshold=0, nokey=False):
    """
    Check the credential stored in myproxy and delegate a new one when required.
    In this version the credential name is simply <username>_CRAB, e.g.
    belforte_CRAB, where the username is obtained through getUsername().

    A new delegation is attempted when the time left of the credential in myproxy
    is below `timeleftthreshold`, when `self.proxyChanged` is set, or when the
    trusted retrievers known to myproxy differ from
    `self.defaultDelegation['serverDN']`.

    When the user certificate expires in fewer days than
    `self.myproxyDesiredValidity`, a warning is logged and the delegation is
    shortened to the certificate lifetime. In that case nothing is delegated if
    the credential in myproxy already lasts as long as the certificate (less than
    one day of difference) and the trusted retrievers did not change.

    :param timeleftthreshold: minimum validity wanted for the credential in
        myproxy; compared against the myproxy time left, which is in seconds
    :param nokey: forwarded to the Proxy time-left and delegate calls
    :returns: a tuple with info on the credential in myproxy:
        (credentialName, myproxytimeleft)
        credentialName : username to use in myproxy -l username
        myproxytimeleft: validity of the credential in seconds
    :raises ProxyCreationException: if the myproxy time left cannot be computed,
        if the user certificate has zero days left, or if the delegation fails
    """
    # NOTE: this is the same dictionary object as self.defaultDelegation, so the
    # keys set below are also visible through the instance attribute
    delegation = self.defaultDelegation
    delegation['myproxyAccount'] = None

    from CRABClient.UserUtilities import getUsername
    username = getUsername(proxyFile=self.proxyInfo['filename'], logger=self.logger)
    credentialName = username + '_CRAB'
    delegation['userName'] = credentialName
    myproxy = Proxy(delegation)

    secondsLeft = 0
    self.logger.debug("Getting myproxy life time left for %s" % self.defaultDelegation["myProxySvr"])
    # number of seconds to the expiration of the proxy in myproxy; WMCore may
    # fail on problems with the proxy itself (one such case was #4532)
    try:
        secondsLeft = myproxy.getMyProxyTimeLeft(serverRenewer=True, nokey=nokey)
    except CredentialException as ex:
        msg = ("WMCore could not computer valid time for credential %s .\n Error detail: " % credentialName
               + "%s" % str(ex._message)  # pylint: disable=protected-access
               + "\nTry to remove old myproxy credentials as per https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ#crab_command_fails_with_Impossib")
        self.logger.error(msg)
        raise ProxyCreationException("no valid credential for %s" % credentialName)
    except Exception as ex:
        logging.exception("Problems calculating proxy lifetime, logging stack trace and raising ProxyCreationException")
        # WMException may contain the _message attribute. Otherwise, take the exception as a string.
        if hasattr(ex, "_message"):
            msg = ex._message  # pylint: disable=protected-access, no-member
        else:
            msg = str(ex)
        raise ProxyCreationException(
            "Problems calculating the time left until the expiration of the proxy."
            + " Please reset your environment or contact [email protected] if the problem persists.\n%s" % msg)
    self.logger.debug("Myproxy is valid: %i", secondsLeft)

    # list on the REST and on myproxy are different
    retrieversChanged = myproxy.trustedRetrievers != self.defaultDelegation['serverDN']

    needsRenewal = secondsLeft < timeleftthreshold or self.proxyChanged or retrieversChanged
    if not needsRenewal:
        return (credentialName, secondsLeft)

    # checking the enddate of the user certificate
    certDaysLeft = myproxy.getUserCertEnddate()
    if certDaysLeft == 0:
        expiredParts = [
            "%sYOUR USER CERTIFICATE IS EXPIRED (OR WILL EXPIRE TODAY)." % colors.RED,
            " YOU CANNOT USE THE CRAB3 CLIENT.",
            " PLEASE REQUEST A NEW CERTIFICATE HERE https://gridca.cern.ch/gridca/",
            " AND SEE https://ca.cern.ch/ca/Help/?kbid=024010%s" % colors.NORMAL,
        ]
        raise ProxyCreationException("".join(expiredParts))

    # the certificate is about to expire: warn (this happens at every command while
    # the myproxy time left is below the threshold) and shorten the delegation
    if certDaysLeft < self.myproxyDesiredValidity:
        warning = "%sYour user certificate is going to expire in %s days." % (colors.RED, certDaysLeft)
        warning += " See: https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookStartingGrid#ObtainingCert %s" % colors.NORMAL
        self.logger.info(warning)
        # certDaysLeft ~= secondsLeft (less than one day apart) means the proxy
        # was already delegated for as long as it could be
        alreadyMaxed = abs(certDaysLeft * 60 * 60 * 24 - secondsLeft) < 60 * 60 * 24
        if alreadyMaxed and not retrieversChanged:
            return (credentialName, secondsLeft)
        # adjust the myproxy delegation time according to the user cert validity
        self.logger.info("%sDelegating your proxy for %s days instead of %s %s",
                         colors.RED, certDaysLeft, self.myproxyDesiredValidity, colors.NORMAL)
        myproxy.myproxyValidity = "%i:00" % (certDaysLeft * 24)

    # creating the proxy
    self.logger.debug("Delegating a myproxy for %s hours", myproxy.myproxyValidity)
    try:
        myproxy.delegate(serverRenewer=True, nokey=nokey)
        secondsLeft = myproxy.getMyProxyTimeLeft(serverRenewer=True, nokey=nokey)
        if secondsLeft <= 0:
            raise ProxyCreationException(
                "It seems your proxy has not been delegated to myproxy. Please check the logfile for the exact error "
                + "(it might simply you typed a wrong password)")
        self.logger.debug("My-proxy delegated.")
    except Exception as ex:
        if hasattr(ex, '_message'):
            msg = ex._message  # pylint: disable=protected-access, no-member
        else:
            msg = str(ex)
        raise ProxyCreationException("Problems delegating My-proxy. %s" % msg)

    return (credentialName, secondsLeft)