def newQueueElement(self, **args):
    """Build a WorkQueueElement from spec/task defaults plus ``args`` and queue it.

    Keys already present in ``args`` take precedence over the defaults pulled
    from the spec and the initial task.

    :param args: field overrides for the new element
    :raises WorkQueueWMSpecError: if any input data has no location, or if the
        number of queued elements exceeds the configured maximum
    """
    # DBS Url may not be available in the initial task
    # but in the pileup data (MC pileup)
    dbsUrl = self.initialTask.dbsUrl()
    if dbsUrl is None and self.pileupData:
        # Get the first DBS found
        dbsUrl = next(iter(self.wmspec.listPileupDatasets()))
    args.setdefault('Status', 'Available')
    args.setdefault('WMSpec', self.wmspec)
    args.setdefault('Task', self.initialTask)
    args.setdefault('RequestName', self.wmspec.name())
    args.setdefault('TaskName', self.initialTask.name())
    args.setdefault('Dbs', dbsUrl)
    args.setdefault('SiteWhitelist', self.initialTask.siteWhitelist())
    args.setdefault('SiteBlacklist', self.initialTask.siteBlacklist())
    args.setdefault('StartPolicy', self.wmspec.startPolicy())
    args.setdefault('EndPolicy', self.wmspec.endPolicyParameters())
    args.setdefault('Priority', self.wmspec.priority())
    args.setdefault('PileupData', self.pileupData)
    if not args['Priority']:
        args['Priority'] = 0
    ele = WorkQueueElement(**args)
    for data, sites in viewitems(ele['Inputs']):
        if not sites:
            raise WorkQueueWMSpecError(
                self.wmspec, 'Input data has no locations "%s"' % data)
    # catch infinite splitting loops
    # FIX: the guard and the error message previously read different config
    # keys ('maxRequestSize' vs 'MaxRequestElements'); use one key for both
    # so the reported limit is the limit actually enforced.
    maxElements = self.args.get('MaxRequestElements', 1e8)
    if len(self.workQueueElements) > maxElements:
        raise WorkQueueWMSpecError(
            self.wmspec, 'Too many elements (%d)' % maxElements)
    self.workQueueElements.append(ele)
def queueWork(self, wmspecUrl, request=None, team=None):
    """
    Take and queue work from a WMSpec.

    If request name is provided but doesn't match WMSpec name
    an error is raised.

    If team is provided work will only be available to queue's
    belonging to that team.

    Duplicate specs will be ignored.
    """
    self.logger.info('queueWork() begin queueing "%s"' % wmspecUrl)
    wmspec = WMWorkloadHelper()
    wmspec.load(wmspecUrl)
    if request:  # validate request name
        try:
            Lexicon.requestName(request)
        # FIX: Python-3 compatible exception syntax (was "except Exception, ex")
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                wmspec, "Request name validation error: %s" % str(ex))
            raise error
        if request != wmspec.name():
            raise WorkQueueWMSpecError(
                wmspec, 'Request & workflow name mismatch %s vs %s' %
                (request, wmspec.name()))
def validate(self):
    """Check args and spec work with block splitting"""
    StartPolicyInterface.validateCommon(self)
    # the request must actually have events to split over
    if self.initialTask.totalEvents() < 1:
        raise WorkQueueNoWorkError(
            self.wmspec,
            'Invalid total events selection: %s' % str(self.initialTask.totalEvents()))
    # when a mask is supplied its event/lumi ranges must be non-inverted
    if self.mask:
        if self.mask['LastEvent'] < self.mask['FirstEvent']:
            raise WorkQueueWMSpecError(self.wmspec, "Invalid start & end events")
        if self.mask['LastLumi'] < self.mask['FirstLumi']:
            raise WorkQueueWMSpecError(self.wmspec, "Invalid start & end lumis")
def __call__(self, wmspec, task, data=None, mask=None, team=None,
             continuous=False, rucioObj=None):
    """Apply this start policy to ``task`` and return the produced elements."""
    self.wmspec = wmspec
    # bring in spec specific settings
    self.args.update(self.wmspec.startPolicyParameters())
    self.initialTask = task
    if data:
        self.data = data
    self.mask = mask
    self.validate()
    try:
        puDatasets = self.wmspec.listPileupDatasets()
        if puDatasets:
            self.pileupData = self.getDatasetLocations(puDatasets)
        self.split()
    # For known exceptions raise custom error that will fail the workflow.
    except dbsClientException as ex:
        # A dbs configuration error implies the spec is invalid
        raise WorkQueueWMSpecError(self.wmspec, "DBS config error: %s" % str(ex))
    except AssertionError as ex:
        # Assertion generally means validation of an input field failed
        raise WorkQueueWMSpecError(self.wmspec, "Assertion error: %s" % str(ex))
    except DBSReaderError as ex:
        # Hacky way of identifying non-existant data, DbsBadRequest chomped by DBSReader
        if 'Invalid parameters' not in str(ex):
            raise  # propagate other dbs errors
        taskInput = task.data.input.pythonise_() if task.data.input else 'None'
        msg = "data: %s, mask: %s, pileup: %s. %s" % (str(taskInput), str(mask),
                                                      str(puDatasets), str(ex))
        raise WorkQueueNoWorkError(self.wmspec, msg)
    # if we have no new elements and we are not adding work to request
    # already running, then raise exception
    if not self.workQueueElements and not continuous:
        taskInput = task.data.input.pythonise_() if task.data.input else 'None'
        msg = "Failed to add work. Input data: %s, mask: %s." % (str(taskInput), str(mask))
        raise WorkQueueNoWorkError(self.wmspec, msg)
    return self.workQueueElements, self.rejectedWork, self.badWork
def validBlocks(self, task):
    """Return blocks that pass the input data restriction according
    to the splitting algorithm.

    :param task: the WMTask whose ACDC section drives the selection
    :returns: list of dicts describing the selected (pseudo-)blocks
    :raises WorkQueueWMSpecError: if the task has no ACDC section or the
        splitting algorithm is unsupported
    """
    validBlocks = []
    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(
            self.wmspec, 'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
    if self.data:
        # FIX: dict.keys()[0] is Python-2 only (dict_keys is not
        # subscriptable on Python 3); next(iter(...)) works on both.
        acdcBlockSplit = ACDCBlock.splitBlockName(next(iter(self.data)))
    else:
        # if self.data is not passed, assume the data is the input dataset
        # from the spec
        acdcBlockSplit = False
    if acdcBlockSplit:
        dbsBlock = {}
        dbsBlock['Name'] = next(iter(self.data))
        block = acdc.getChunkInfo(
            acdcInfo['collection'],
            acdcBlockSplit['TaskName'],
            acdcBlockSplit['Offset'],
            acdcBlockSplit['NumOfFiles'],
            user=self.wmspec.getOwner().get("name"),
            group=self.wmspec.getOwner().get("group"))
        dbsBlock['NumberOfFiles'] = block['files']
        dbsBlock['NumberOfEvents'] = block['events']
        dbsBlock['NumberOfLumis'] = block['lumis']
        dbsBlock['ACDC'] = acdcInfo
        if task.getTrustSitelists():
            dbsBlock["Sites"] = self.sites
        else:
            # TODO remove this line when all DBS origin_site_name is converted to PNN
            block["locations"] = self.siteDB.checkAndConvertSENameToPNN(
                block["locations"])
            # upto this
            dbsBlock["Sites"] = self.siteDB.PNNstoPSNs(block["locations"])
        validBlocks.append(dbsBlock)
    else:
        if self.args['SplittingAlgo'] in self.unsupportedAlgos:
            raise WorkQueueWMSpecError(
                self.wmspec,
                'ACDC is not supported for %s' % self.args['SplittingAlgo'])
        splittingFunc = self.defaultAlgo
        if self.args['SplittingAlgo'] in self.algoMapping:
            splittingFunc = self.algoMapping[self.args['SplittingAlgo']]
        validBlocks = splittingFunc(acdc, acdcInfo, task)
    return validBlocks
def validateCommon(self):
    """Common validation stuff"""
    # workflow name must pass the Lexicon rules
    try:
        Lexicon.requestName(self.wmspec.name())
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        raise WorkQueueWMSpecError(self.wmspec,
                                   "Workflow name validation error: %s" % str(ex))

    # site whitelist: must be a sequence of valid CMS site names
    if self.initialTask.siteWhitelist():
        if isinstance(self.initialTask.siteWhitelist(), basestring):
            raise WorkQueueWMSpecError(self.wmspec,
                                       'Invalid site whitelist: Must be tuple/list but is %s' % type(
                                           self.initialTask.siteWhitelist()))
        try:
            for site in self.initialTask.siteWhitelist():
                Lexicon.cmsname(site)
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            raise WorkQueueWMSpecError(self.wmspec,
                                       "Site whitelist validation error: %s" % str(ex))

    # site blacklist: same constraints as the whitelist
    if self.initialTask.siteBlacklist():
        if isinstance(self.initialTask.siteBlacklist(), basestring):
            raise WorkQueueWMSpecError(self.wmspec,
                                       'Invalid site blacklist: Must be tuple/list but is %s' % type(
                                           self.initialTask.siteBlacklist()))
        try:
            for site in self.initialTask.siteBlacklist():
                Lexicon.cmsname(site)
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            raise WorkQueueWMSpecError(self.wmspec,
                                       "Site blacklist validation error: %s" % str(ex))

    # splitter settings
    if self.args.get('SliceSize', 1) <= 0:
        raise WorkQueueWMSpecError(self.wmspec, 'Zero or negative SliceSize parameter')
    if self.args.get('SubSliceSize', 1) <= 0:
        raise WorkQueueWMSpecError(self.wmspec, 'Zero or negative SubSliceSize parameter')

    # check input dataset is valid
    try:
        if self.initialTask.getInputDatasetPath():
            Lexicon.dataset(self.initialTask.getInputDatasetPath())
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        raise WorkQueueWMSpecError(self.wmspec,
                                   "Dataset validation error: %s" % str(ex))

    # if pileup is found, check that they are valid datasets
    try:
        pileupDatasets = self.wmspec.listPileupDatasets()
        for datasets in pileupDatasets.values():
            for dataset in datasets:
                Lexicon.dataset(dataset)
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        raise WorkQueueWMSpecError(self.wmspec,
                                   "Pileup dataset validation error: %s" % str(ex))
class StartPolicyInterface(PolicyInterface):
    """Interface for start policies"""

    def __init__(self, **args):
        PolicyInterface.__init__(self, **args)
        # elements produced by split(); consumed by the caller after __call__
        self.workQueueElements = []
        self.wmspec = None
        self.team = None
        self.initialTask = None
        self.splitParams = None
        # cache of DBS readers keyed by url (presumably — TODO confirm usage)
        self.dbs_pool = {}
        self.data = {}
        self.lumi = None
        self.couchdb = None
        self.rejectedWork = []  # List of inputs that were rejected
        self.pileupData = {}

    def split(self):
        """Apply policy to spec"""
        # subclasses implement the actual splitting strategy
        raise NotImplementedError

    def validate(self):
        """Check params and spec are appropriate for the policy"""
        # subclasses implement policy-specific validation
        raise NotImplementedError

    def validateCommon(self):
        """Common validation stuff"""
        # workflow name must satisfy the Lexicon naming rules
        try:
            Lexicon.requestName(self.wmspec.name())
        except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                self.wmspec,
                "Workflow name validation error: %s" % str(ex))
            raise error
        # site whitelist must be a sequence (not a bare string) of valid CMS names
        if self.initialTask.siteWhitelist():
            if type(self.initialTask.siteWhitelist()) in types.StringTypes:
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    'Invalid site whitelist: Must be tuple/list but is %s' %
                    type(self.initialTask.siteWhitelist()))
                raise error
            try:
                [
                    Lexicon.cmsname(site)
                    for site in self.initialTask.siteWhitelist()
                ]
            except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    "Site whitelist validation error: %s" % str(ex))
                raise error
def validateCommon(self):
    """Common validation stuff

    Validate the workflow name against the Lexicon naming rules and wrap
    any failure into a WorkQueueWMSpecError that fails the workflow.
    """
    try:
        Lexicon.requestName(self.wmspec.name())
    # FIX: Python-3 compatible exception syntax (was "except Exception, ex")
    except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
        error = WorkQueueWMSpecError(
            self.wmspec,
            "Workflow name validation error: %s" % str(ex))
        raise error
def __call__(self, wmspec, task, data=None, mask=None, team=None): self.wmspec = wmspec # bring in spec specific settings self.args.update(self.wmspec.startPolicyParameters()) self.initialTask = task if data: self.data = data self.mask = mask self.validate() try: pileupDatasets = self.wmspec.listPileupDatasets() if pileupDatasets: self.pileupData = self.getDatasetLocations(pileupDatasets) self.split() # For known exceptions raise custom error that will fail the workflow. except DbsConfigurationError as ex: # A dbs configuration error implies the spec is invalid error = WorkQueueWMSpecError(self.wmspec, "DBS config error: %s" % str(ex)) raise error except AssertionError as ex: # Assertion generally means validation of an input field failed error = WorkQueueWMSpecError(self.wmspec, "Assertion error: %s" % str(ex)) raise error except DBSReaderError as ex: # Hacky way of identifying non-existant data, DbsBadRequest chomped by DBSReader # DbsConnectionError: Database exception,Invalid parameters thrown by Summary api if 'DbsBadRequest' in str(ex) or 'Invalid parameters' in str(ex): data = task.data.input.pythonise_( ) if task.data.input else 'None' msg = """data: %s, mask: %s, pileup: %s. %s""" % ( str(data), str(mask), str(pileupDatasets), str(ex)) error = WorkQueueNoWorkError(self.wmspec, msg) raise error raise # propagate other dbs errors # if we have no elements then there was no work in the spec, fail it if not self.workQueueElements: data = task.data.input.pythonise_() if task.data.input else 'None' msg = """data: %s, mask: %s.""" % (str(data), str(mask)) error = WorkQueueNoWorkError(self.wmspec, msg) raise error return self.workQueueElements, self.rejectedWork
def validBlocks(self, task):
    """Return blocks that pass the input data restriction according
    to the splitting algorithm"""
    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(
            self.wmspec, 'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])

    # an explicit ACDC block name in self.data selects a single chunk
    blockSplit = ACDCBlock.splitBlockName(next(iter(self.data))) if self.data else False

    if not blockSplit:
        # no explicit block: delegate to the configured splitting algorithm
        algo = self.args['SplittingAlgo']
        if algo in self.unsupportedAlgos:
            raise WorkQueueWMSpecError(
                self.wmspec, 'ACDC is not supported for %s' % algo)
        splitter = self.algoMapping.get(algo, self.defaultAlgo)
        return splitter(acdc, acdcInfo, task)

    chunk = acdc.getChunkInfo(acdcInfo['collection'],
                              blockSplit['TaskName'],
                              blockSplit['Offset'],
                              blockSplit['NumOfFiles'])
    blockInfo = {
        'Name': next(iter(self.data)),
        'NumberOfFiles': chunk['files'],
        'NumberOfEvents': chunk['events'],
        'NumberOfLumis': chunk['lumis'],
        'ACDC': acdcInfo,
    }
    if task.getTrustSitelists().get('trustlists'):
        blockInfo["Sites"] = self.sites
    else:
        blockInfo["Sites"] = self.cric.PNNstoPSNs(chunk["locations"])
    return [blockInfo]
def validBlocks(self, task):
    """Return blocks that pass the input data restriction.

    Either returns the single ACDC chunk named in ``self.data`` or, when no
    block name was passed, chunks the whole ACDC fileset.

    :param task: the WMTask whose ACDC section drives the selection
    :returns: list of dicts describing the selected (pseudo-)blocks
    :raises WorkQueueWMSpecError: if the task has no ACDC section
    """
    validBlocks = []
    # TODO take the chunk size from parameter
    chunkSize = 200
    acdcInfo = task.getInputACDC()
    if not acdcInfo:
        raise WorkQueueWMSpecError(
            self.wmspec, 'No acdc section for %s' % task.getPathName())
    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
    if self.data:
        # FIX: dict.keys()[0] is Python-2 only (dict_keys is not
        # subscriptable on Python 3); next(iter(...)) works on both.
        acdcBlockSplit = ACDCBlock.splitBlockName(next(iter(self.data)))
    else:
        # if self.data is not passed, assume the data is the input dataset
        # from the spec
        acdcBlockSplit = False
    if acdcBlockSplit:
        dbsBlock = {}
        dbsBlock['Name'] = next(iter(self.data))
        block = acdc.getChunkInfo(
            acdcInfo['collection'],
            acdcBlockSplit['TaskName'],
            acdcBlockSplit['Offset'],
            acdcBlockSplit['NumOfFiles'],
            user=self.wmspec.getOwner().get("name"),
            group=self.wmspec.getOwner().get("group"))
        dbsBlock['NumberOfFiles'] = block['files']
        dbsBlock['NumberOfEvents'] = block['events']
        dbsBlock['NumberOfLumis'] = block['lumis']
        dbsBlock['ACDC'] = acdcInfo
        dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
        validBlocks.append(dbsBlock)
    else:
        acdcBlocks = acdc.chunkFileset(
            acdcInfo['collection'],
            acdcInfo['fileset'],
            chunkSize,
            user=self.wmspec.getOwner().get("name"),
            group=self.wmspec.getOwner().get("group"))
        for block in acdcBlocks:
            dbsBlock = {}
            dbsBlock['Name'] = ACDCBlock.name(self.wmspec.name(),
                                              acdcInfo["fileset"],
                                              block['offset'], block['files'])
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
            dbsBlock['ACDC'] = acdcInfo
            validBlocks.append(dbsBlock)
    return validBlocks
def newQueueElement(self, **args):
    """Build a WorkQueueElement from spec/task defaults plus ``args`` and queue it.

    Keys already present in ``args`` take precedence over the defaults pulled
    from the spec and the initial task.

    :param args: field overrides for the new element
    :raises WorkQueueWMSpecError: if any input data has no location, or if the
        number of queued elements exceeds the configured maximum
    """
    args.setdefault('Status', 'Available')
    args.setdefault('WMSpec', self.wmspec)
    args.setdefault('Task', self.initialTask)
    args.setdefault('RequestName', self.wmspec.name())
    args.setdefault('TaskName', self.initialTask.name())
    args.setdefault('Dbs', self.initialTask.dbsUrl())
    args.setdefault('SiteWhitelist', self.initialTask.siteWhitelist())
    args.setdefault('SiteBlacklist', self.initialTask.siteBlacklist())
    args.setdefault('EndPolicy', self.wmspec.endPolicyParameters())
    args.setdefault('Priority', self.wmspec.priority())
    if not args['Priority']:
        args['Priority'] = 0
    ele = WorkQueueElement(**args)
    for data, sites in ele['Inputs'].items():
        if not sites:
            raise WorkQueueWMSpecError(
                self.wmspec, 'Input data has no locations "%s"' % data)
    # catch infinite splitting loops
    # FIX: the guard and the error message previously read different config
    # keys ('maxRequestSize' vs 'MaxRequestElements'); use one key for both
    # so the reported limit is the limit actually enforced.
    maxElements = self.args.get('MaxRequestElements', 1e8)
    if len(self.workQueueElements) > maxElements:
        raise WorkQueueWMSpecError(
            self.wmspec, 'Too many elements (%d)' % maxElements)
    self.workQueueElements.append(ele)
def __call__(self, wmspec, task, data=None, mask=None, team=None):
    """Apply this start policy to ``task``.

    Stores the spec/task context, validates it and runs the splitter;
    DBS configuration problems are converted to WorkQueueWMSpecError.
    """
    self.wmspec = wmspec
    # bring in spec specific settings
    self.args.update(self.wmspec.startPolicyParameters())
    self.initialTask = task
    if data:
        self.data = data
    self.mask = mask
    self.validate()
    try:
        self.split()
    # For known exceptions raise custom error that will fail the workflow.
    # FIX: Python-3 compatible exception syntax (was "except DbsConfigurationError, ex")
    except DbsConfigurationError as ex:
        # A dbs configuration error implies the spec is invalid
        error = WorkQueueWMSpecError(self.wmspec,
                                     "DBS config error: %s" % str(ex))
        raise error
def validate(self):
    """Check args and spec work with block splitting"""
    StartPolicyInterface.validateCommon(self)
    # block splitting is meaningless without an input dataset
    inputDataset = self.initialTask.inputDataset()
    if not inputDataset:
        raise WorkQueueWMSpecError(self.wmspec, 'No input dataset')
def queueNewRequests(self, queue):
    """Get requests from regMgr and queue to workqueue"""
    self.logger.info("Contacting Request manager for more work")
    totalUnits = 0
    try:
        pendingRequests = self.getAvailableRequests()
    except Exception as ex:
        traceMsg = traceback.format_exc()
        self.logger.warning("Error contacting RequestManager: %s" % traceMsg)
        return 0

    for team, reqName, workLoadUrl in pendingRequests:
        try:
            try:
                Lexicon.couchurl(workLoadUrl)
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                # not a couch url - accept it anyway if it is a local file
                if not os.path.exists(workLoadUrl):
                    raise WorkQueueWMSpecError(
                        None, "Workflow url validation error: %s" % str(ex))
            self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
            units = queue.queueWork(workLoadUrl, request=reqName, team=team)
            self.logdb.delete(reqName, "error", this_thread=True, agent=False)
        except TERMINAL_EXCEPTIONS as ex:
            # fatal error - report back to ReqMgr
            self.logger.error('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
            self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
            self.reportRequestStatus(reqName, 'Failed', message=str(ex))
            continue
        except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
            # temporary problem - try again later
            msg = ('Error processing request "%s": will try again later.' % reqName
                   + '\nError: "%s"' % str(ex))
            self.logger.info(msg)
            self.logdb.post(reqName, msg, 'error')
            continue
        except Exception as ex:
            # unknown problem - log the traceback, retry later
            msg = ('Error processing request "%s": will try again later.' % reqName
                   + '\nSee log for details.\nError: "%s"' % str(ex))
            self.logger.exception('Unknown error processing %s' % reqName)
            self.logdb.post(reqName, msg, 'error')
            continue
        self.logger.info('%s units(s) queued for "%s"' % (units, reqName))
        totalUnits += units

    self.logger.info("%s element(s) obtained from RequestManager" % totalUnits)
    return totalUnits
def queueNewRequests(self, queue):
    """Get requests from regMgr and queue to workqueue"""
    self.logger.info("Contacting Request manager for more work")
    work = 0
    workLoads = []
    # a draining queue must not pick up new requests
    if queue.params['DrainMode']:
        self.logger.info(
            'Draining queue: Skip requesting work from ReqMgr')
        return 0
    try:
        workLoads = self.getAvailableRequests(queue.params['Teams'])
    except Exception as ex:
        traceMsg = traceback.format_exc()
        msg = "Error contacting RequestManager: %s" % traceMsg
        self.logger.warning(msg)
        return 0
    for team, reqName, workLoadUrl in workLoads:
        # try:
        #     self.reportRequestStatus(reqName, "negotiating")
        # except Exception, ex:
        #     self.logger.error("""
        #         Unable to update ReqMgr state to negotiating: %s
        #         Ignoring this request: %s""" % (str(ex), reqName))
        #     continue
        try:
            try:
                Lexicon.couchurl(workLoadUrl)
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                # check its not a local file
                if not os.path.exists(workLoadUrl):
                    error = WorkQueueWMSpecError(
                        None,
                        "Workflow url validation error: %s" % str(ex))
                    raise error
            self.logger.info("Processing request %s at %s" %
                             (reqName, workLoadUrl))
            units = queue.queueWork(workLoadUrl, request=reqName, team=team)
            # queueing succeeded - clear any previous error for this request
            self.logdb.delete(reqName, "error", this_thread=True)
        except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
            # fatal error - report back to ReqMgr
            self.logger.info(
                'Permanent failure processing request "%s": %s' %
                (reqName, str(ex)))
            self.logger.info("Marking request %s as failed in ReqMgr" %
                             reqName)
            self.reportRequestStatus(reqName, 'Failed', message=str(ex))
            continue
        except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
            # temporary problem - try again later
            msg = 'Error processing request "%s": will try again later.' \
                  '\nError: "%s"' % (reqName, str(ex))
            self.logger.info(msg)
            self.logdb.post(reqName, msg, 'error')
            continue
        except Exception as ex:
            # Log exception as it isnt a communication problem
            msg = 'Error processing request "%s": will try again later.' \
                  '\nSee log for details.\nError: "%s"' % (reqName, str(ex))
            self.logger.exception('Unknown error processing %s' % reqName)
            self.logdb.post(reqName, msg, 'error')
            continue
        try:
            self.reportRequestStatus(reqName, "acquired")
        except Exception as ex:
            # non-fatal: the queueing already succeeded, only the status update failed
            self.logger.warning("Unable to update ReqMgr state: %s" % str(ex))
            self.logger.warning('Will try again later')
        self.logger.info('%s units(s) queued for "%s"' % (units, reqName))
        work += units
    self.logger.info("%s element(s) obtained from RequestManager" % work)
    return work
def split(self): """Apply policy to spec""" # if not specified take standard defaults self.args.setdefault('SliceType', 'NumberOfEvents') self.args.setdefault('SliceSize', 1000) # events per job self.args.setdefault('SubSliceType', 'NumberOfEventsPerLumi') self.args.setdefault('SubSliceSize', self.args['SliceSize']) # events per lumi self.args.setdefault('MaxJobsPerElement', 1000) # jobs per WQE self.args.setdefault('MaxLumisPerElement', os.environ.get('MAX_LUMIS_PER_WQE')) self.args.setdefault( 'blowupFactor', 1.0) # Estimate of additional jobs following tasks. # Total WQE tasks will be Jobs*(1+blowupFactor) noInputUpdate = self.initialTask.getTrustSitelists().get('trustlists') noPileupUpdate = self.initialTask.getTrustSitelists().get( 'trustPUlists') if not self.mask: self.mask = Mask(FirstRun=1, FirstLumi=self.initialTask.getFirstLumi(), FirstEvent=self.initialTask.getFirstEvent(), LastRun=1, LastEvent=self.initialTask.getFirstEvent() + self.initialTask.totalEvents() - 1) mask = Mask(**self.mask) #First let's initialize some parameters stepSize = int(self.args['SliceSize']) * int( self.args['MaxJobsPerElement']) total = mask['LastEvent'] - mask['FirstEvent'] + 1 lastAllowedEvent = mask['LastEvent'] eventsAccounted = 0 while eventsAccounted < total: current = mask['FirstEvent'] + stepSize - 1 # inclusive range if current > lastAllowedEvent: current = lastAllowedEvent mask['LastEvent'] = current #Calculate the job splitting without actually doing it # number of lumis is calculated by events number and SubSliceSize which is events per lumi # So if there no exact division between events per job and events per lumi # it takes the ceiling of the value. 
# Therefore total lumis can't be calculated from total events / SubSliceSize # It has to be caluated by adding the lumis_per_job * number of jobs nEvents = mask['LastEvent'] - mask['FirstEvent'] + 1 lumis_per_job = ceil(self.args['SliceSize'] / self.args['SubSliceSize']) nLumis = floor(nEvents / self.args['SliceSize']) * lumis_per_job remainingLumis = ceil(nEvents % self.args['SliceSize'] / self.args['SubSliceSize']) nLumis += remainingLumis jobs = ceil(nEvents / self.args['SliceSize']) if self.args['MaxLumisPerElement'] and nLumis > int( self.args['MaxLumisPerElement']): raise WorkQueueWMSpecError( self.wmspec, "Too many lumis in WQE: %s" % nLumis) mask['LastLumi'] = mask['FirstLumi'] + int( nLumis) - 1 # inclusive range self.newQueueElement(WMSpec=self.wmspec, NumberOfLumis=nLumis, NumberOfEvents=nEvents, Jobs=jobs, Mask=copy(mask), NoInputUpdate=noInputUpdate, NoPileupUpdate=noPileupUpdate, blowupFactor=self.args['blowupFactor']) if mask['LastEvent'] > (2**32 - 1): #This is getting tricky, to ensure consecutive #events numbers we must calculate where the jobSplitter #will restart the firstEvent to 1 for the last time #in the newly created unit internalEvents = mask['FirstEvent'] accumulatedEvents = internalEvents breakPoint = internalEvents while accumulatedEvents < mask['LastEvent']: if (internalEvents + self.args['SliceSize'] - 1) > (2**32 - 1): internalEvents = 1 breakPoint = accumulatedEvents else: internalEvents += self.args['SliceSize'] accumulatedEvents += self.args['SliceSize'] leftoverEvents = mask['LastEvent'] - breakPoint + 1 mask['FirstEvent'] = leftoverEvents + 1 else: mask['FirstEvent'] = mask['LastEvent'] + 1 mask['FirstLumi'] = mask['LastLumi'] + 1 eventsAccounted += stepSize lastAllowedEvent = (total - eventsAccounted) + mask['FirstEvent'] - 1
class WorkQueueReqMgrInterface():
    """Helper class for ReqMgr interaction"""

    def __init__(self, **kwargs):
        # fall back to the root logging module when no logger is supplied
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        self.reqMgr = RequestManager(kwargs)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager"""
        msg = ''
        try:  # pull in new work
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception:
            self.logger.exception("Error caught during RequestManager pull")

        try:  # get additional open-running work
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception:
            self.logger.exception("Error caught during RequestManager split")

        try:  # report back to ReqMgr
            uptodate_elements = self.report(queue)
            msg += "Updated ReqMgr status for: %s\n" % ", ".join(
                [x['RequestName'] for x in uptodate_elements])
        except:
            self.logger.exception("Error caught during RequestManager update")
        else:
            try:  # Delete finished requests from WorkQueue
                # only attempted when the report succeeded, since it needs
                # the up-to-date element list
                self.deleteFinishedWork(queue, uptodate_elements)
            except:
                self.logger.exception("Error caught during work deletion")

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from regMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []
        # a draining queue must not pick up new requests
        if queue.params['DrainMode']:
            self.logger.info(
                'Draining queue: Skip requesting work from ReqMgr')
            return 0
        try:
            workLoads = self.getAvailableRequests(*queue.params['Teams'])
        except Exception, ex:
            msg = "Error contacting RequestManager: %s" % str(ex)
            self.logger.warning(msg)
            return 0
        for team, reqName, workLoadUrl in workLoads:
            # try:
            #     self.reportRequestStatus(reqName, "negotiating")
            # except Exception, ex:
            #     self.logger.error("""
            #         Unable to update ReqMgr state to negotiating: %s
            #         Ignoring this request: %s""" % (str(ex), reqName))
            #     continue
            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check its not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(
                            None,
                            "Workflow url validation error: %s" % str(ex))
                        raise error
                self.logger.info("Processing request %s at %s" %
                                 (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl,
                                        request=reqName,
                                        team=team)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError), ex:
                # fatal error - report back to ReqMgr
                self.logger.info(
                    'Permanent failure processing request "%s": %s' %
                    (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" %
                                 reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
raise error try: [ Lexicon.cmsname(site) for site in self.initialTask.siteWhitelist() ] except Exception, ex: # can throw many errors e.g. AttributeError, AssertionError etc. error = WorkQueueWMSpecError( self.wmspec, "Site whitelist validation error: %s" % str(ex)) raise error if self.initialTask.siteBlacklist(): if type(self.initialTask.siteBlacklist()) in types.StringTypes: error = WorkQueueWMSpecError( self.wmspec, 'Invalid site blacklist: Must be tuple/list but is %s' % type(self.initialTask.siteBlacklist())) raise error try: [ Lexicon.cmsname(site) for site in self.initialTask.siteBlacklist() ] except Exception, ex: # can throw many errors e.g. AttributeError, AssertionError etc. error = WorkQueueWMSpecError( self.wmspec, "Site blacklist validation error: %s" % str(ex)) raise error # splitter settings if self.args.get('SliceSize', 1) <= 0: