def testAlertsMessagingBasic(self):
    """
    Send a single alert through the Alerts messaging API and check that a
    temporary receiver gets it with the expected content.

    The wait for the alert is bounded, so a lost message makes the test
    fail instead of hanging the whole test run. The sender and the
    receiver are always torn down, even when an assertion fails.

    """
    config = getConfig("/tmp")
    self.assertTrue(hasattr(config, "Alert"))
    # initialization
    # sender: instance of Alert messages Sender
    # preAlert: pre-defined values for Alert instances generated from this class
    self.config = config  # needed in setUpAlertsMessaging
    preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName="testBasic")
    receiver = None
    try:
        sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)

        # set up a temporary alert message receiver
        handler, receiver = utils.setUpReceiver(config.Alert.address,
                                                config.Alert.controlAddr)

        # test sending alert
        msg = "this is my message Basic"
        sendAlert(100, msg=msg)

        # wait for the alert to arrive, but never longer than maxWait seconds;
        # on timeout the queue-length assertion below reports the failure
        maxWait = 30
        deadline = time.time() + maxWait
        while len(handler.queue) == 0 and time.time() < deadline:
            time.sleep(0.3)
            print("%s waiting for alert to arrive ..." % inspect.stack()[0][3])

        self.assertEqual(len(handler.queue), 1)
        alert = handler.queue[0]
        self.assertEqual(alert["Component"], "testBasic")
        self.assertEqual(alert["Level"], 100)
        self.assertEqual(alert["Source"], self.__class__.__name__)
        self.assertEqual(alert["Details"]["msg"], msg)
    finally:
        # tear down both ends regardless of the test outcome
        try:
            sender.unregister()
        finally:
            if receiver is not None:
                receiver.shutdown()
def testAlertsMessagingBasic(self):
    """
    Send a single alert through the Alerts messaging API and check that a
    temporary receiver gets it with the expected content.

    The wait for the alert is bounded, so a lost message makes the test
    fail instead of hanging the whole test run. The sender and the
    receiver are always torn down, even when an assertion fails.

    """
    config = getConfig("/tmp")
    self.assertTrue(hasattr(config, "Alert"))
    # initialization
    # sender: instance of Alert messages Sender
    # preAlert: pre-defined values for Alert instances generated from this class
    self.config = config  # needed in setUpAlertsMessaging
    preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName="testBasic")
    receiver = None
    try:
        sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)

        # set up a temporary alert message receiver
        handler, receiver = utils.setUpReceiver(config.Alert.address,
                                                config.Alert.controlAddr)

        # test sending alert
        msg = "this is my message Basic"
        sendAlert(100, msg=msg)

        # wait for the alert to arrive, but never longer than maxWait seconds;
        # on timeout the queue-length assertion below reports the failure
        maxWait = 30
        deadline = time.time() + maxWait
        while len(handler.queue) == 0 and time.time() < deadline:
            time.sleep(0.3)
            print("%s waiting for alert to arrive ..." % inspect.stack()[0][3])

        self.assertEqual(len(handler.queue), 1)
        alert = handler.queue[0]
        self.assertEqual(alert["Component"], "testBasic")
        self.assertEqual(alert["Level"], 100)
        self.assertEqual(alert["Source"], self.__class__.__name__)
        self.assertEqual(alert["Details"]["msg"], msg)
    finally:
        # tear down both ends regardless of the test outcome
        try:
            sender.unregister()
        finally:
            if receiver is not None:
                receiver.shutdown()
def testAgentConfigurationRetrieving(self):
    """
    Check that agent details (values from the config.Agent section)
    end up in Alert instances created by the API.getPredefinedAlert
    factory, while a directly constructed Alert leaves them unset.

    """
    extra = {"Additional": "detail"}
    agentKeys = ("HostName", "Contact", "TeamName", "AgentName")

    # a plain Alert: no configuration is consulted, agent fields stay None
    plainAlert = Alert(**extra)
    for key in agentKeys:
        self.assertEqual(plainAlert[key], None)
    self.assertEqual(plainAlert["Additional"], "detail")

    # populate the Agent section of a Configuration instance
    # NOTE(review): the instance is not passed to the factory explicitly;
    # presumably the factory looks it up itself - confirm
    config = Configuration()
    config.section_("Agent")
    config.Agent.hostName = "some1"
    config.Agent.contact = "some2"
    config.Agent.teamName = "some3"
    config.Agent.agentName = "some4"

    # an Alert from the factory carries the agent values
    factoryAlert = alertAPI.getPredefinedAlert(**extra)
    expectedValues = ("some1", "some2", "some3", "some4")
    for key, value in zip(agentKeys, expectedValues):
        self.assertEqual(factoryAlert[key], value)
    self.assertEqual(factoryAlert["Additional"], "detail")
def __init__(self, config, generator):
    """
    Set up the poller thread.

    config: configuration of this particular poller only; its
        'critical' and 'soft' attributes are the threshold values
    generator: reference to the AlertGenerator instance; its config
        provides the AlertProcessor critical / soft alert levels

    """
    threading.Thread.__init__(self)
    self.config = config
    self.generator = generator

    # Alert levels matching the thresholds below, element by element.
    # Critical has to come first: once that threshold is caught there is
    # no point testing the soft one.
    # The levels are defined in the AlertProcessor config, which makes the
    # AlertGenerator dependent on AlertProcessor - it is not clear whether
    # to tolerate such a dependency or to configure the two values
    # independently in AlertGenerator (a mismatch would cause some chaos).
    processorConfig = self.generator.config.AlertProcessor
    self.levels = [processorConfig.critical.level, processorConfig.soft.level]
    # critical, soft threshold values (same order as self.levels)
    self.thresholds = [self.config.critical, self.config.soft]

    # Template values for alerts: a new Alert instance is created before
    # every send and updated with these values.
    alertDefaults = {"Type": "WMAgent",
                     "Workload": "n/a",
                     "Component": self.generator.__class__.__name__,
                     "Source": "<to_overwrite>"}
    self.preAlert = alertAPI.getPredefinedAlert(**alertDefaults)

    # flag controlling run of the Thread
    self._stopFlag = False
    # thread own sleep time, in seconds
    self._threadSleepTime = 0.2
def __init__(self, config, generator):
    """
    Set up the poller thread.

    config: configuration of this particular poller only; its
        'critical' and 'soft' attributes are the threshold values
    generator: reference to the AlertGenerator instance; its config
        provides the AlertProcessor critical / soft alert levels

    """
    threading.Thread.__init__(self)
    self.config = config
    self.generator = generator

    # Alert levels matching the thresholds below, element by element.
    # Critical has to come first: once that threshold is caught there is
    # no point testing the soft one.
    # The levels are defined in the AlertProcessor config, which makes the
    # AlertGenerator dependent on AlertProcessor - it is not clear whether
    # to tolerate such a dependency or to configure the two values
    # independently in AlertGenerator (a mismatch would cause some chaos).
    processorConfig = self.generator.config.AlertProcessor
    self.levels = [processorConfig.critical.level, processorConfig.soft.level]
    # critical, soft threshold values (same order as self.levels)
    self.thresholds = [self.config.critical, self.config.soft]

    # Template values for alerts: a new Alert instance is created before
    # every send and updated with these values.
    alertDefaults = {"Type": "WMAgent",
                     "Workload": "n/a",
                     "Component": self.generator.__class__.__name__,
                     "Source": "<to_overwrite>"}
    self.preAlert = alertAPI.getPredefinedAlert(**alertDefaults)

    # flag controlling run of the Thread
    self._stopFlag = False
    # thread own sleep time, in seconds
    self._threadSleepTime = 0.2
def testAgentConfigurationRetrieving(self):
    """
    Check that agent details (values from the config.Agent section)
    end up in Alert instances created by the API.getPredefinedAlert
    factory, while a directly constructed Alert leaves them unset.

    """
    extra = {"Additional": "detail"}
    agentKeys = ("HostName", "Contact", "TeamName", "AgentName")

    # a plain Alert: no configuration is consulted, agent fields stay None
    plainAlert = Alert(**extra)
    for key in agentKeys:
        self.assertEqual(plainAlert[key], None)
    self.assertEqual(plainAlert["Additional"], "detail")

    # populate the Agent section of a Configuration instance
    # NOTE(review): the instance is not passed to the factory explicitly;
    # presumably the factory looks it up itself - confirm
    config = Configuration()
    config.section_("Agent")
    config.Agent.hostName = "some1"
    config.Agent.contact = "some2"
    config.Agent.teamName = "some3"
    config.Agent.agentName = "some4"

    # an Alert from the factory carries the agent values
    factoryAlert = alertAPI.getPredefinedAlert(**extra)
    expectedValues = ("some1", "some2", "some3", "some4")
    for key, value in zip(agentKeys, expectedValues):
        self.assertEqual(factoryAlert[key], value)
    self.assertEqual(factoryAlert["Additional"], "detail")
def initAlerts(self, compName=None):
    """
    _initAlerts_

    Set up alert sending for this component.

    compName: component name put into the alerts; when empty or None
        the name of this instance's class is used

    Afterwards the instance provides:
    self.sender: instance of the Alert messages Sender
    self.sendAlert: the callable which sends the actual Alerts
        (documented at getSendAlert in the WMCore Alerts API)

    note:
        Tests are done in the API_t belonging to Alerts fw.
        This particular method is called from a number of components
        and some have particular tests on alerts sending.

    """
    componentName = compName or self.__class__.__name__
    alertDefaults, alertSender = alertAPI.setUpAlertsMessaging(
        self, compName=componentName)
    sendFunction = alertAPI.getSendAlert(sender=alertSender,
                                         preAlert=alertDefaults)
    # attributes are only set once both API calls have succeeded
    self.sender, self.sendAlert = alertSender, sendFunction
def initAlerts(self, compName = None):
    """
    _initAlerts_

    Set up alert sending for this component.

    compName: component name put into the alerts; when empty or None
        the name of this instance's class is used

    Afterwards the instance provides:
    self.sender: instance of the Alert messages Sender
    self.sendAlert: the callable which sends the actual Alerts
        (documented at getSendAlert in the WMCore Alerts API)

    note:
        Tests are done in the API_t belonging to Alerts fw.
        This particular method is called from a number of components
        and some have particular tests on alerts sending.

    """
    componentName = compName or self.__class__.__name__
    alertDefaults, alertSender = alertAPI.setUpAlertsMessaging(
        self, compName = componentName)
    sendFunction = alertAPI.getSendAlert(sender = alertSender,
                                         preAlert = alertDefaults)
    # attributes are only set once both API calls have succeeded
    self.sender, self.sendAlert = alertSender, sendFunction
def __init__(self, config, generator):
    """
    Set up the poller.

    config: configuration of this particular poller only; its
        'critical' and 'soft' attributes are the threshold values
    generator: reference to the AlertGenerator instance; its config
        provides the AlertProcessor critical / soft alert levels

    """
    self.config = config
    self.generator = generator

    # Alert levels matching the thresholds below, element by element.
    # The levels are defined in the AlertProcessor config. Critical has to
    # come first: once that threshold is caught there is no point testing
    # the soft one.
    processorConfig = self.generator.config.AlertProcessor
    self.levels = [processorConfig.critical.level, processorConfig.soft.level]
    # critical, soft threshold values (same order as self.levels)
    self.thresholds = [self.config.critical, self.config.soft]

    # Template values for alerts: a new Alert instance is created before
    # every send and updated with these values.
    alertDefaults = {"Type": "WMAgent",
                     "Workload": "n/a",
                     "Component": self.generator.__class__.__name__,
                     "Source": "<to_overwrite>"}
    self.preAlert = alertAPI.getPredefinedAlert(**alertDefaults)
def __init__(self, logger=None, dbi=None, **params):
    """
    Create the WorkQueue: fill in default parameters, connect the couch
    backend(s), the PhEDEx / SiteDB services, the data location mapper
    and the alerts sending client.

    logger, dbi: passed on to WorkQueueBase.__init__
    params: queue settings; missing keys are filled with defaults in place.
        'CouchUrl' is mandatory (the COUCHURL environment variable is the
        fallback). 'CacheDir' is mandatory when 'PopulateFilesets' is true.
        'Config' should reference the Configuration instance (later checked
        for presence of "Alert").

    Raises RuntimeError for a missing mandatory value or an inconsistent
    location / splitting combination, AssertionError for an unknown
    'TrackLocationOrSubscription' value.

    """
    WorkQueueBase.__init__(self, logger, dbi)
    self.parent_queue = None
    self.params = params

    # config argument (within params) shall be reference to
    # Configuration instance (will later be checked for presence of "Alert")
    self.config = params.get("Config", None)
    self.params.setdefault('CouchUrl', os.environ.get('COUCHURL'))
    if not self.params.get('CouchUrl'):
        raise RuntimeError('CouchUrl config value mandatory')
    self.params.setdefault('DbName', 'workqueue')
    self.params.setdefault('InboxDbName', self.params['DbName'] + '_inbox')
    self.params.setdefault('ParentQueueCouchUrl', None)  # We get work from here

    self.backend = WorkQueueBackend(self.params['CouchUrl'],
                                    self.params['DbName'],
                                    self.params['InboxDbName'],
                                    self.params['ParentQueueCouchUrl'],
                                    self.params.get('QueueURL'),
                                    logger=self.logger)
    if self.params.get('ParentQueueCouchUrl'):
        # split "<couch url>/<db name>"; a value without '/' still fails
        # with IndexError on the second element, as before
        parentUrlParts = self.params['ParentQueueCouchUrl'].rsplit('/', 1)
        self.parent_queue = WorkQueueBackend(parentUrlParts[0],
                                             parentUrlParts[1])

    self.params.setdefault("GlobalDBS",
                           "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
    self.params.setdefault('QueueDepth', 2)  # when less than this locally
    self.params.setdefault('LocationRefreshInterval', 600)
    self.params.setdefault('FullLocationRefreshInterval', 7200)
    self.params.setdefault('TrackLocationOrSubscription', 'subscription')
    self.params.setdefault('ReleaseIncompleteBlocks', False)
    self.params.setdefault('ReleaseRequireSubscribed', True)
    self.params.setdefault('PhEDExEndpoint', None)
    self.params.setdefault('PopulateFilesets', True)
    self.params.setdefault('LocalQueueFlag', True)

    self.params.setdefault('JobDumpConfig', None)
    self.params.setdefault('BossAirConfig', None)

    # url this queue is visible on
    # backend took previous QueueURL and sanitized it
    self.params['QueueURL'] = self.backend.queueUrl
    self.params.setdefault('WMBSUrl', None)  # this will only be set on local Queue
    self.params.setdefault('Teams', [''])
    self.params.setdefault('DrainMode', False)
    if self.params.get('CacheDir'):
        try:
            os.makedirs(self.params['CacheDir'])
        except OSError:
            # deliberate best effort, typically the directory already exists
            pass
    elif self.params.get('PopulateFilesets'):
        raise RuntimeError('CacheDir mandatory for local queue')

    # default splitter for every start policy, unless the caller set one
    self.params.setdefault('SplittingMapping', {})
    defaultSplitters = (('DatasetBlock', 'Block'),
                        ('MonteCarlo', 'MonteCarlo'),
                        ('Dataset', 'Dataset'),
                        ('Block', 'Block'),
                        ('ResubmitBlock', 'ResubmitBlock'))
    for policy, splitter in defaultSplitters:
        self.params['SplittingMapping'].setdefault(policy,
                                                   {'name': splitter, 'args': {}})

    self.params.setdefault('EndPolicySettings', {})

    # explicit check instead of an assert statement, which is stripped
    # under "python -O"; the exception type is unchanged
    if self.params['TrackLocationOrSubscription'] not in ('subscription', 'location'):
        raise AssertionError("TrackLocationOrSubscription must be "
                             "'subscription' or 'location'")
    # Can only release blocks on location
    if self.params['TrackLocationOrSubscription'] == 'location':
        if self.params['SplittingMapping']['DatasetBlock']['name'] != 'Block':
            raise RuntimeError('Only blocks can be released on location')

    # services may be injected through params, otherwise they are created here
    if self.params.get('PhEDEx'):
        self.phedexService = self.params['PhEDEx']
    else:
        phedexArgs = {}
        if self.params.get('PhEDExEndpoint'):
            phedexArgs['endpoint'] = self.params['PhEDExEndpoint']
        self.phedexService = PhEDEx(phedexArgs)

    if self.params.get('SiteDB'):
        self.SiteDB = self.params['SiteDB']
    else:
        self.SiteDB = SiteDB()

    # Teams may be given as a comma separated string
    if isinstance(self.params['Teams'], types.StringTypes):
        self.params['Teams'] = [x.strip() for x in
                                self.params['Teams'].split(',')]

    # NOTE(review): requireBlocksSubscribed is derived from
    # 'ReleaseIncompleteBlocks' while 'ReleaseRequireSubscribed' is
    # defaulted above and not read here - confirm this is intended
    self.dataLocationMapper = WorkQueueDataLocationMapper(
        self.logger, self.backend,
        phedex=self.phedexService,
        sitedb=self.SiteDB,
        locationFrom=self.params['TrackLocationOrSubscription'],
        incompleteBlocks=self.params['ReleaseIncompleteBlocks'],
        requireBlocksSubscribed=not self.params['ReleaseIncompleteBlocks'],
        fullRefreshInterval=self.params['FullLocationRefreshInterval'],
        updateIntervalCoarseness=self.params['LocationRefreshInterval'])

    # initialize alerts sending client (self.sendAlert() method)
    # usage: self.sendAlert(levelNum, msg = msg) ; level - integer 1 .. 10
    #    1 - 4 - lower levels ; 5 - 10 higher levels
    preAlert, self.alertSender = \
        alertAPI.setUpAlertsMessaging(self, compName="WorkQueueManager")
    self.sendAlert = alertAPI.getSendAlert(sender=self.alertSender,
                                           preAlert=preAlert)

    self.logger.debug("WorkQueue created successfully")
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription

    jobs: sequence of dict-like job records; the 'sandbox' and 'id'
        entries are read here, the rest is used by self.makeSubmit
    info: not used by this method

    Jobs are grouped by sandbox and cut into chunks of
    config.JobSubmitter.jobsPerWorker. For each chunk a JDL file is written
    and a condor_submit command is handed to the worker processes through
    self.input; the outcome is then read back from self.result.

    Returns a tuple (successfulJobs, failedJobs). Failed jobs carry a
    Report with the condor error under the 'fwjr' key.
    NOTE(review): when makeSubmit produces no JDL the method returns the
    dict {'NoResult': [0]} instead - kept as is, callers may rely on it.

    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)
    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if len(self.pool) == 0:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up CondorPlugin worker pool")
        self.input = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for _ in range(self.nProcess):
            p = multiprocessing.Process(target=submitWorker,
                                        args=(self.input, self.result, timeout))
            p.start()
            self.pool.append(p)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Now assume that what we get is the following; a mostly
    # unordered list of jobs with random sandboxes.
    # We intend to sort them by sandbox.
    submitDict = {}
    nSubmits = 0
    for job in jobs:
        submitDict.setdefault(job['sandbox'], []).append(job)

    # Index of jobs by id (first occurrence wins), used to map the ids
    # reported by the workers back to job records without rescanning
    # the whole job list for every id
    jobsById = {}
    for job in jobs:
        jobsById.setdefault(job.get('id', None), job)

    # Now submit the jobs
    queueError = False
    jobsPerWorker = self.config.JobSubmitter.jobsPerWorker
    for jobList in submitDict.values():
        if queueError:
            # If the queue has failed, then we must not process
            # any more jobs this cycle.
            break

        while len(jobList) > 0:
            jobsReady = jobList[:jobsPerWorker]
            jobList = jobList[jobsPerWorker:]
            idList = [x['id'] for x in jobsReady]
            jdlList = self.makeSubmit(jobList=jobsReady)
            if not jdlList:
                # Then we got nothing
                logging.error("No JDL file made!")
                return {'NoResult': [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
            # the context manager closes the file even if writing fails
            with open(jdlFile, 'w') as handle:
                handle.writelines(jdlList)
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs" % (len(jobsReady)))
            if self.glexecPath:
                command = 'CS=`which condor_submit`; '
                if self.glexecWrapScript:
                    command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                if self.glexecUnwrapScript:
                    command += '%s %s -- $CS %s' % (self.glexecPath,
                                                    self.glexecUnwrapScript,
                                                    jdlFile)
                else:
                    command += '%s $CS %s' % (self.glexecPath, jdlFile)
            else:
                command = "condor_submit %s" % jdlFile

            try:
                self.input.put({'command': command, 'idList': idList})
            except AssertionError as ex:
                msg = "Critical error: input pipeline probably closed.\n"
                msg += str(ex)
                msg += "Error Procedure: Something critical has happened in the worker process\n"
                msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                msg += "Then refresh the worker pool\n"
                logging.error(msg)
                queueError = True
                break
            nSubmits += 1

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for _ in range(nSubmits):
        try:
            res = self.result.get(block=True, timeout=timeout)
        except Queue.Empty:
            # If the queue was empty go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds." % timeout)
            queueError = True
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest"
            msg += "Refreshing worker pool at end of loop"
            logging.error(msg)
            queueError = True
            continue

        try:
            # 'stdout' is not used below, but a missing key still marks
            # the worker result as broken
            output = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone critically wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except Exception:
                # the result cannot even be printed; log what we have
                pass
            msg += str(ex)
            logging.error(msg)
            queueError = True
            continue

        if exitCode != 0:
            logging.error("Condor returned non-zero.  Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error=error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                job = jobsById.get(jobID)
                if job is not None:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                job = jobsById.get(jobID)
                if job is not None:
                    successfulJobs.append(job)

        # If we get a lot of errors in a row it's probably time to
        # report this to the operators.
        if self.errorCount > self.errorThreshold:
            msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
            logging.error(msg)
            logging.error("Reporting to Alert system and continuing to process jobs")
            try:
                from WMCore.Alerts import API as alertAPI
                preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                 compName="BossAirCondorPlugin")
                sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)
                try:
                    sendAlert(6, msg=msg)
                finally:
                    # do not leave the sender registered if sending fails
                    sender.unregister()
                self.errorCount = 0
            except Exception as ex:
                # Alerting is best effort: there's nothing we can really do
                # here, but leave a trace and keep processing jobs
                logging.error("Could not report to the Alert system: %s" % str(ex))

    # Remove JDL files unless commanded otherwise
    if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in CondorPlugin")
    return successfulJobs, failedJobs
self.errorCount -= 1 for jobID in idList: for job in jobs: if job.get('id', None) == jobID: successfulJobs.append(job) break # If we get a lot of errors in a row it's probably time to # report this to the operators. if self.errorCount > self.errorThreshold: try: msg = "Exceeded errorThreshold while submitting to condor. Check condor status." logging.error(msg) logging.error("Reporting to Alert system and continuing to process jobs") from WMCore.Alerts import API as alertAPI preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName = "BossAirCondorPlugin") sendAlert = alertAPI.getSendAlert(sender = sender, preAlert = preAlert) sendAlert(6, msg = msg) sender.unregister() self.errorCount = 0 except: # There's nothing we can really do here pass # Remove JDL files unless commanded otherwise if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True): for f in jdlFiles: os.remove(f) # When we're finished, clean up the queue workers in order
class WorkQueue(WorkQueueBase):
    """
    _WorkQueue_

    WorkQueue object - interface to WorkQueue functionality.
    """
    def __init__(self, logger=None, dbi=None, **params):
        """
        Create the WorkQueue: fill in default parameters, connect the couch
        backend(s), the PhEDEx / SiteDB services, the data location mapper
        and the alerts sending client.

        logger, dbi: passed on to WorkQueueBase.__init__
        params: queue settings; missing keys are filled with defaults in
            place. 'CouchUrl' is mandatory (the COUCHURL environment
            variable is the fallback). 'CacheDir' is mandatory when
            'PopulateFilesets' is true. 'Config' should reference the
            Configuration instance (later checked for presence of "Alert").

        Raises RuntimeError for a missing mandatory value or an inconsistent
        location / splitting combination, WorkQueueError when
        'ParentQueueCouchUrl' cannot be parsed, AssertionError for an
        unknown 'TrackLocationOrSubscription' value.

        """
        WorkQueueBase.__init__(self, logger, dbi)
        self.parent_queue = None
        self.params = params

        # config argument (within params) shall be reference to
        # Configuration instance (will later be checked for presence of "Alert")
        self.config = params.get("Config", None)
        self.params.setdefault('CouchUrl', os.environ.get('COUCHURL'))
        if not self.params.get('CouchUrl'):
            raise RuntimeError('CouchUrl config value mandatory')
        self.params.setdefault('DbName', 'workqueue')
        self.params.setdefault('InboxDbName', self.params['DbName'] + '_inbox')
        self.params.setdefault('ParentQueueCouchUrl', None)  # We get work from here

        self.backend = WorkQueueBackend(self.params['CouchUrl'],
                                        self.params['DbName'],
                                        self.params['InboxDbName'],
                                        self.params['ParentQueueCouchUrl'],
                                        self.params.get('QueueURL'),
                                        logger=self.logger)
        if self.params.get('ParentQueueCouchUrl'):
            try:
                # split "<couch url>/<db name>"; a value without '/' fails
                # with IndexError on the second element
                parentUrlParts = self.params['ParentQueueCouchUrl'].rsplit('/', 1)
                self.parent_queue = WorkQueueBackend(parentUrlParts[0],
                                                     parentUrlParts[1])
            except IndexError as ex:
                # Probable cause: Someone didn't put the global WorkQueue name in
                # the ParentCouchUrl
                msg = "Parsing failure for ParentQueueCouchUrl - probably missing dbname in input\n"
                msg += "Exception: %s\n" % str(ex)
                msg += str("ParentQueueCouchUrl: %s\n" % self.params['ParentQueueCouchUrl'])
                self.logger.error(msg)
                raise WorkQueueError(msg)
            # keep the url as reported by the parent queue backend
            self.params['ParentQueueCouchUrl'] = self.parent_queue.queueUrl

        self.params.setdefault("GlobalDBS",
                               "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
        self.params.setdefault('QueueDepth', 0.5)  # when less than this locally
        self.params.setdefault('LocationRefreshInterval', 600)
        self.params.setdefault('FullLocationRefreshInterval', 7200)
        self.params.setdefault('TrackLocationOrSubscription', 'subscription')
        self.params.setdefault('ReleaseIncompleteBlocks', False)
        self.params.setdefault('ReleaseRequireSubscribed', True)
        self.params.setdefault('PhEDExEndpoint', None)
        self.params.setdefault('PopulateFilesets', True)
        self.params.setdefault('LocalQueueFlag', True)
        self.params.setdefault('QueueRetryTime', 86400)
        self.params.setdefault('stuckElementAlertTime', 86400)
        self.params.setdefault('reqmgrCompleteGraceTime', 604800)
        self.params.setdefault('cancelGraceTime', 604800)

        self.params.setdefault('JobDumpConfig', None)
        self.params.setdefault('BossAirConfig', None)

        # url this queue is visible on
        # backend took previous QueueURL and sanitized it
        self.params['QueueURL'] = self.backend.queueUrl
        self.params.setdefault('WMBSUrl', None)  # this will only be set on local Queue
        if self.params.get('WMBSUrl'):
            self.params['WMBSUrl'] = Lexicon.sanitizeURL(self.params['WMBSUrl'])['url']
        self.params.setdefault('Teams', [])
        self.params.setdefault('DrainMode', False)
        if self.params.get('CacheDir'):
            try:
                os.makedirs(self.params['CacheDir'])
            except OSError:
                # deliberate best effort, typically the directory already exists
                pass
        elif self.params.get('PopulateFilesets'):
            raise RuntimeError('CacheDir mandatory for local queue')

        # default splitter for every start policy, unless the caller set one
        self.params.setdefault('SplittingMapping', {})
        defaultSplitters = (('DatasetBlock', 'Block'),
                            ('MonteCarlo', 'MonteCarlo'),
                            ('Dataset', 'Dataset'),
                            ('Block', 'Block'),
                            ('ResubmitBlock', 'ResubmitBlock'))
        for policy, splitter in defaultSplitters:
            self.params['SplittingMapping'].setdefault(policy,
                                                       {'name': splitter, 'args': {}})

        self.params.setdefault('EndPolicySettings', {})

        # explicit check instead of an assert statement, which is stripped
        # under "python -O"; the exception type is unchanged
        if self.params['TrackLocationOrSubscription'] not in ('subscription', 'location'):
            raise AssertionError("TrackLocationOrSubscription must be "
                                 "'subscription' or 'location'")
        # Can only release blocks on location
        if self.params['TrackLocationOrSubscription'] == 'location':
            if self.params['SplittingMapping']['DatasetBlock']['name'] != 'Block':
                raise RuntimeError('Only blocks can be released on location')

        # services may be injected through params, otherwise they are created here
        if self.params.get('PhEDEx'):
            self.phedexService = self.params['PhEDEx']
        else:
            phedexArgs = {}
            if self.params.get('PhEDExEndpoint'):
                phedexArgs['endpoint'] = self.params['PhEDExEndpoint']
            self.phedexService = PhEDEx(phedexArgs)

        if self.params.get('SiteDB'):
            self.SiteDB = self.params['SiteDB']
        else:
            self.SiteDB = SiteDB()

        # Teams may be given as a comma separated string
        if isinstance(self.params['Teams'], types.StringTypes):
            self.params['Teams'] = [x.strip() for x in
                                    self.params['Teams'].split(',')]

        # NOTE(review): requireBlocksSubscribed is derived from
        # 'ReleaseIncompleteBlocks' while 'ReleaseRequireSubscribed' is
        # defaulted above and not read here - confirm this is intended
        self.dataLocationMapper = WorkQueueDataLocationMapper(
            self.logger, self.backend,
            phedex=self.phedexService,
            sitedb=self.SiteDB,
            locationFrom=self.params['TrackLocationOrSubscription'],
            incompleteBlocks=self.params['ReleaseIncompleteBlocks'],
            requireBlocksSubscribed=not self.params['ReleaseIncompleteBlocks'],
            fullRefreshInterval=self.params['FullLocationRefreshInterval'],
            updateIntervalCoarseness=self.params['LocationRefreshInterval'])

        # initialize alerts sending client (self.sendAlert() method)
        # usage: self.sendAlert(levelNum, msg = msg) ; level - integer 1 .. 10
        #    1 - 4 - lower levels ; 5 - 10 higher levels
        preAlert, self.alertSender = \
            alertAPI.setUpAlertsMessaging(self, compName="WorkQueueManager")
        self.sendAlert = alertAPI.getSendAlert(sender=self.alertSender,
                                               preAlert=preAlert)

        self.logger.debug("WorkQueue created successfully")
# Small manual test client: registers a Sender with an alerts receiver,
# sends one level 6 alert and unregisters again.
# Usage: <script> [target controllerTarget]
import sys
import time

from WMCore.Alerts import API as alertAPI
from WMCore.Alerts.Alert import Alert
from WMCore.Alerts.ZMQ.Sender import Sender

# default receiver addresses; both can be overridden from the command line
machine = "maxatest.cern.ch"
target = "tcp://%s:6557" % machine
targetController = "tcp://%s:6559" % machine
if len(sys.argv) > 2:
    target = sys.argv[1]
    targetController = sys.argv[2]

dictAlert = dict(Type="AlertTestClient", Workload="n/a",
                 Component=__name__, Source=__name__)
preAlert = alertAPI.getPredefinedAlert(**dictAlert)
sender = Sender(target, targetController, "AlertTestClient")
print("created Sender client for alerts target: %s controller: %s" %
      (target, targetController))

sender.register()
try:
    a = Alert(**preAlert)
    a["Timestamp"] = time.time()
    a["Level"] = 6
    print("sending alert:\n'%s'" % a)
    sender(a)
finally:
    # always unregister, also when building or sending the alert fails
    sender.unregister()
self.errorCount -= 1 for jobID in idList: for job in jobs: if job.get('id', None) == jobID: successfulJobs.append(job) break # If we get a lot of errors in a row it's probably time to # report this to the operators. if self.errorCount > self.errorThreshold: try: msg = "Exceeded errorThreshold while submitting to condor. Check condor status." logging.error(msg) logging.error("Reporting to Alert system and continuing to process jobs") from WMCore.Alerts import API as alertAPI preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName = "BossAirPyCondorPlugin") sendAlert = alertAPI.getSendAlert(sender = sender, preAlert = preAlert) sendAlert(6, msg = msg) sender.unregister() self.errorCount = 0 except: # There's nothing we can really do here pass # Remove JDL files unless commanded otherwise if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True): for f in jdlFiles: os.remove(f) # When we're finished, clean up the queue workers in order
# Small manual test client: registers a Sender with an alerts receiver,
# sends one level 6 alert and unregisters again.
# Usage: <script> [target controllerTarget]
import sys  # needed for sys.argv below
import time

from WMCore.Alerts import API as alertAPI
from WMCore.Alerts.Alert import Alert
from WMCore.Alerts.ZMQ.Sender import Sender

# default receiver addresses; both can be overridden from the command line
machine = "maxatest.cern.ch"
target = "tcp://%s:6557" % machine
targetController = "tcp://%s:6559" % machine
if len(sys.argv) > 2:
    target = sys.argv[1]
    targetController = sys.argv[2]

dictAlert = dict(Type="AlertTestClient", Workload="n/a",
                 Component=__name__, Source=__name__)
preAlert = alertAPI.getPredefinedAlert(**dictAlert)
sender = Sender(target, targetController, "AlertTestClient")
print("created Sender client for alerts target: %s controller: %s" %
      (target, targetController))

sender.register()
try:
    a = Alert(**preAlert)
    a["Timestamp"] = time.time()
    a["Level"] = 6
    print("sending alert:\n'%s'" % a)
    sender(a)
finally:
    # always unregister, also when building or sending the alert fails
    sender.unregister()
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription

    jobs: sequence of dict-like job records; the 'sandbox' and 'id'
        entries are read here, the rest is used by self.makeSubmit
    info: not used by this method

    Jobs are grouped by sandbox and cut into chunks of
    config.JobSubmitter.jobsPerWorker. For each chunk a JDL file is written
    and a condor_submit command is handed to the worker processes through
    self.input; the outcome is then read back from self.result.

    Returns a tuple (successfulJobs, failedJobs). Failed jobs carry a
    Report with the condor error under the 'fwjr' key.
    NOTE(review): when makeSubmit produces no JDL the method returns the
    dict {'NoResult': [0]} instead - kept as is, callers may rely on it.

    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)
    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if len(self.pool) == 0:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up CondorPlugin worker pool")
        self.input = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for _ in range(self.nProcess):
            p = multiprocessing.Process(target=submitWorker,
                                        args=(self.input, self.result, timeout))
            p.start()
            self.pool.append(p)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Now assume that what we get is the following; a mostly
    # unordered list of jobs with random sandboxes.
    # We intend to sort them by sandbox.
    submitDict = {}
    nSubmits = 0
    for job in jobs:
        submitDict.setdefault(job['sandbox'], []).append(job)

    # Index of jobs by id (first occurrence wins), used to map the ids
    # reported by the workers back to job records without rescanning
    # the whole job list for every id
    jobsById = {}
    for job in jobs:
        jobsById.setdefault(job.get('id', None), job)

    # Now submit the jobs
    queueError = False
    jobsPerWorker = self.config.JobSubmitter.jobsPerWorker
    for jobList in submitDict.values():
        if queueError:
            # If the queue has failed, then we must not process
            # any more jobs this cycle.
            break

        while len(jobList) > 0:
            jobsReady = jobList[:jobsPerWorker]
            jobList = jobList[jobsPerWorker:]
            idList = [x['id'] for x in jobsReady]
            jdlList = self.makeSubmit(jobList=jobsReady)
            if not jdlList:
                # Then we got nothing
                logging.error("No JDL file made!")
                return {'NoResult': [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
            # the context manager closes the file even if writing fails
            with open(jdlFile, 'w') as handle:
                handle.writelines(jdlList)
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs" % (len(jobsReady)))
            if self.glexecPath:
                command = 'CS=`which condor_submit`; '
                if self.glexecWrapScript:
                    command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                if self.glexecUnwrapScript:
                    command += '%s %s -- $CS %s' % (self.glexecPath,
                                                    self.glexecUnwrapScript,
                                                    jdlFile)
                else:
                    command += '%s $CS %s' % (self.glexecPath, jdlFile)
            else:
                command = "condor_submit %s" % jdlFile

            try:
                self.input.put({'command': command, 'idList': idList})
            except AssertionError as ex:
                msg = "Critical error: input pipeline probably closed.\n"
                msg += str(ex)
                msg += "Error Procedure: Something critical has happened in the worker process\n"
                msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                msg += "Then refresh the worker pool\n"
                logging.error(msg)
                queueError = True
                break
            nSubmits += 1

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for _ in range(nSubmits):
        try:
            res = self.result.get(block=True, timeout=timeout)
        except Queue.Empty:
            # If the queue was empty go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds." % timeout)
            queueError = True
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest"
            msg += "Refreshing worker pool at end of loop"
            logging.error(msg)
            queueError = True
            continue

        try:
            # 'stdout' is not used below, but a missing key still marks
            # the worker result as broken
            output = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone critically wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except Exception:
                # the result cannot even be printed; log what we have
                pass
            msg += str(ex)
            logging.error(msg)
            queueError = True
            continue

        if exitCode != 0:
            logging.error("Condor returned non-zero.  Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error=error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                job = jobsById.get(jobID)
                if job is not None:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                job = jobsById.get(jobID)
                if job is not None:
                    successfulJobs.append(job)

        # If we get a lot of errors in a row it's probably time to
        # report this to the operators.
        if self.errorCount > self.errorThreshold:
            msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
            logging.error(msg)
            logging.error("Reporting to Alert system and continuing to process jobs")
            try:
                from WMCore.Alerts import API as alertAPI
                preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                 compName="BossAirCondorPlugin")
                sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)
                try:
                    sendAlert(6, msg=msg)
                finally:
                    # do not leave the sender registered if sending fails
                    sender.unregister()
                self.errorCount = 0
            except Exception as ex:
                # Alerting is best effort: there's nothing we can really do
                # here, but leave a trace and keep processing jobs
                logging.error("Could not report to the Alert system: %s" % str(ex))

    # Remove JDL files unless commanded otherwise
    if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in CondorPlugin")
    return successfulJobs, failedJobs