def availableWork(self, conditions, teams = None, wfs = None):
    """Get work which is available to be run

    :param conditions: dict of site name -> number of free job slots;
                       mutated in place as work is accepted
    :param teams: optional list of team names to restrict the query to
    :param wfs: optional list of workflow names to restrict the query to
    :return: tuple (list of CouchWorkQueueElement, remaining conditions dict)
    """
    elements = []
    # Drop sites with no free slots. NOTE(review): keys() must return a
    # list copy for this delete-while-iterating to be safe -- true on
    # Python 2 (this file uses xrange elsewhere); confirm if ported.
    for site in conditions.keys():
        if not conditions[site] > 0:
            del conditions[site]
    if not conditions:
        return elements, conditions
    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = conditions
    if teams:
        options['teams'] = teams
    if wfs:
        options['wfs'] = wfs
    result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
    result = json.loads(result)
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        elements.append(element)

        # Remove 1st random site that can run work
        names = conditions.keys()
        random.shuffle(names)
        for site in names:
            if element.passesSiteRestriction(site):
                slots_left = conditions[site] - element['Jobs']
                if slots_left > 0:
                    conditions[site] = slots_left
                else:
                    conditions.pop(site, None)
                break
        # Fix: stop accepting elements once every site's slots are
        # exhausted, otherwise work is acquired with no resources left
        # to run it (matches the other availableWork revisions here).
        if not conditions:
            break
    return elements, conditions
def fixConflicts(self):
    """Fix elements in conflict

    Each local queue runs this to resolve its conflicts with global,
    resolution propagates up to global.

    Conflicting elements are merged into one element with others deleted.

    This will fail if elements are modified during the resolution -
    if this happens rerun.
    """
    for database in (self.inbox, self.db):
        conflict_rows = database.loadView('WorkQueue', 'conflicts')['rows']
        for conflict in conflict_rows:
            doc_id = conflict['id']
            try:
                # load every conflicting revision of this document
                revisions = []
                for rev in conflict['value']:
                    doc = database.document(doc_id, rev)
                    revisions.append(CouchWorkQueueElement.fromDocument(database, doc))
                merged = fixElementConflicts(*revisions)
                # only delete the losers once the merged winner was accepted
                if self.saveElements(merged[0]):
                    self.saveElements(*merged[1:])
            except Exception as ex:
                self.logger.error("Error resolving conflict for %s: %s" % (doc_id, str(ex)))
def getElementsForWorkflow(self, workflow):
    """Get elements for a workflow"""
    view_options = {'key': workflow, 'include_docs': True, 'reduce': False}
    response = self.db.loadView('WorkQueue', 'elementsByWorkflow', view_options)
    found = []
    for row in response.get('rows', []):
        found.append(CouchWorkQueueElement.fromDocument(self.db, row['doc']))
    return found
def getElementsForParent(self, parent):
    """Get elements with the given parent"""
    response = self.db.loadView('WorkQueue', 'elementsByParent',
                                {'key': parent.id, 'include_docs': True})
    children = []
    for row in response.get('rows', []):
        children.append(CouchWorkQueueElement.fromDocument(self.db, row['doc']))
    return children
def getElementsForPileupData(self, data):
    """Get active elements for this data """
    query = {'key': data, 'include_docs': True}
    response = self.db.loadView('WorkQueue', 'elementsByPileupData', query)
    matches = []
    for row in response.get('rows', []):
        matches.append(CouchWorkQueueElement.fromDocument(self.db, row['doc']))
    return matches
def getElements(self, status=None, elementIDs=None, returnIdOnly=False,
                db=None, loadSpec=False, WorkflowName=None, **elementFilters):
    """Return elements that match requirements

    status, elementIDs & filters are 'AND'ed together to filter elements.
    returnIdOnly causes the element not to be loaded and only the id returned
    db is used to specify which database to return from
    loadSpec causes the workflow for each spec to be loaded.
    WorkflowName may be used in the place of RequestName
    """
    key = []
    if not db:
        db = self.db
    # RequestName filter is an alias for WorkflowName
    if elementFilters.get('RequestName') and not WorkflowName:
        WorkflowName = elementFilters.pop('RequestName')

    if elementIDs:
        # element ids are fetched directly; mixing with other filters is ambiguous
        if elementFilters or status or returnIdOnly:
            raise ValueError("Can't specify extra filters (or return id's) when using element id's with getElements()")
        elements = [CouchWorkQueueElement(db, i).load() for i in elementIDs]
    else:
        options = {'include_docs': True, 'filter': elementFilters, 'idOnly': returnIdOnly, 'reduce': False}
        # filter on workflow or status if possible -- pick the most
        # selective couch view available for the given arguments
        filterName = 'elementsByWorkflow'
        if WorkflowName:
            key.append(WorkflowName)
        elif status:
            filterName = 'elementsByStatus'
            key.append(status)
        elif elementFilters.get('SubscriptionId'):
            key.append(elementFilters['SubscriptionId'])
            filterName = 'elementsBySubscription'
        # add given params to filters so the list function also checks them
        if status:
            options['filter']['Status'] = status
        if WorkflowName:
            options['filter']['RequestName'] = WorkflowName

        view = db.loadList('WorkQueue', 'filter', filterName, options, key)
        view = json.loads(view)
        if returnIdOnly:
            return view
        elements = [CouchWorkQueueElement.fromDocument(db, row) for row in view]

    if loadSpec:
        specs = {}  # cache as may have multiple elements for same spec
        for ele in elements:
            if ele['RequestName'] not in specs:
                wmspec = self.getWMSpec(ele['RequestName'])
                specs[ele['RequestName']] = wmspec
            ele['WMSpec'] = specs[ele['RequestName']]
        del specs
    return elements
def fixConflicts(self):
    """Fix elements in conflict

    Each local queue runs this to resolve its conflicts with global,
    resolution propagates up to global.

    Conflicting elements are merged into one element with others deleted.

    This will fail if elements are modified during the resolution -
    if this happens rerun.

    :raises RuntimeError: if conflicting revisions differ on a field that
                          is not in the mergeable whitelist
    """
    # state ordering: the most advanced status wins a merge
    ordered_states = ['Available', 'Negotiating', 'Acquired', 'Running',
                      'Done', 'Failed', 'CancelRequested', 'Canceled']
    # only these fields may legitimately differ between revisions
    allowed_keys = ['Status', 'EventsWritten', 'FilesProcessed',
                    'PercentComplete', 'PercentSuccess']
    for db in [self.inbox, self.db]:
        conflicts = db.loadView('WorkQueue', 'conflicts')
        queue = []
        for row in conflicts['rows']:
            previous_value = None
            element_id = row['id']
            for rev in row['value']:  # loop over conflicting revisions
                ele = CouchWorkQueueElement.fromDocument(db, db.document(element_id, rev))
                if not previous_value:  # 1st will contain merged result and become winner
                    previous_value = ele
                    continue

                for key in previous_value:
                    if previous_value[key] == ele.get(key):
                        continue
                    # we need to merge: Take elements from both that seem most advanced, e.g. status & progress stats
                    if key not in allowed_keys:
                        msg = 'Unable to merge conflicting element keys: field "%s" value 1 "%s" value2 "%s"'
                        # Fix: py2-only "raise E, msg" statement replaced by the
                        # call form, valid on both Python 2 and Python 3.
                        raise RuntimeError(msg % (key, previous_value.get(key), ele.get(key)))
                    if key == 'Status':
                        if ordered_states.index(ele[key]) > ordered_states.index(previous_value[key]):
                            previous_value[key] = ele[key]
                    elif ele[key] > previous_value[key]:
                        previous_value[key] = ele[key]

                # once losing element has been merged - queue for deletion
                queue.append(ele)

            # conflict resolved - save element and delete losers
            msg = 'Resolving conflict for wf "%s", id "%s": Losing rev(s): %s'
            self.logger.info(msg % (str(previous_value['RequestName']),
                                    str(previous_value.id),
                                    ", ".join([x._document['_rev'] for x in queue])))
            if self.saveElements(previous_value):
                for i in queue:
                    i.delete()  # delete others (if merged value update accepted)
                self.saveElements(*queue)
def availableWork(self, conditions, teams=None, wfs=None):
    """Get work which is available to be run

    :param conditions: dict of site name -> free job slots; mutated in
                       place as elements are accepted
    :param teams: optional list of team names to restrict the query to
    :param wfs: optional list of workflow names to restrict the query to
    :return: tuple (list of CouchWorkQueueElement, remaining conditions)
    """
    elements = []
    # drop sites with no free slots (py2: keys() is a list copy, so
    # deleting while iterating is safe -- xrange below confirms py2)
    for site in conditions.keys():
        if not conditions[site] > 0:
            del conditions[site]
    if not conditions:
        return elements, conditions
    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = conditions
    if teams:
        options['teams'] = teams
    if wfs:
        result = []
        # query in batches of 20 workflows, presumably to keep the
        # couch request URL within length limits -- TODO confirm
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i + 20]
            data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result.extend(json.loads(data))
        # sort final list; batching loses the view's priority ordering
        result.sort(key=lambda x: x[
            'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement'
        ]['Priority'])
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
        result = json.loads(result)
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        elements.append(element)

        # Remove 1st random site that can run work
        names = conditions.keys()
        random.shuffle(names)
        for site in names:
            if element.passesSiteRestriction(site):
                slots_left = conditions[site] - element['Jobs']
                if slots_left > 0:
                    conditions[site] = slots_left
                else:
                    conditions.pop(site, None)
                break
        # no slots anywhere -> stop accepting work
        if not conditions:
            break
    return elements, conditions
def availableWork(self, conditions, teams = None, wfs = None):
    """Get work which is available to be run

    Queries the availableByPriority couch list with the given resource,
    team and workflow restrictions, then decrements site slot counts in
    *conditions* as elements are accepted.
    """
    elements = []
    # prune sites that have no positive slot count (py2: keys() copies)
    for site in conditions.keys():
        if not conditions[site] > 0:
            del conditions[site]
    if not conditions:
        return elements, conditions

    options = {'include_docs': True,
               'descending': True,
               'resources': conditions}
    if teams:
        options['teams'] = teams

    if wfs:
        result = []
        batch = 20
        for start in xrange(0, len(wfs), batch):
            options['wfs'] = wfs[start:start + batch]
            payload = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result.extend(json.loads(payload))
        # sort final list
        result.sort(key = lambda x: x['WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement']['Priority'])
    else:
        result = json.loads(self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options))

    for doc in result:
        element = CouchWorkQueueElement.fromDocument(self.db, doc)
        elements.append(element)

        # Remove 1st random site that can run work
        names = conditions.keys()
        random.shuffle(names)
        for site in names:
            if not element.passesSiteRestriction(site):
                continue
            slots_left = conditions[site] - element['Jobs']
            if slots_left > 0:
                conditions[site] = slots_left
            else:
                conditions.pop(site, None)
            break
        if not conditions:
            break
    return elements, conditions
def fixConflicts(self):
    """Fix elements in conflict

    Each local queue runs this to resolve its conflicts with global,
    resolution propagates up to global.

    Conflicting elements are merged into one element with others deleted.

    This will fail if elements are modified during the resolution -
    if this happens rerun.
    """
    for db in [self.inbox, self.db]:
        for row in db.loadView('WorkQueue', 'conflicts')['rows']:
            element_id = row['id']
            try:
                conflicting_elements = [CouchWorkQueueElement.fromDocument(db, db.document(element_id, rev))
                                        for rev in row['value']]
                fixed_elements = fixElementConflicts(*conflicting_elements)
                if self.saveElements(fixed_elements[0]):
                    self.saveElements(*fixed_elements[1:])  # delete others (if merged value update accepted)
            # Fix: py2-only "except Exception, ex" replaced with the "as"
            # form, valid on Python 2.6+ and Python 3.
            except Exception as ex:
                self.logger.error("Error resolving conflict for %s: %s" % (element_id, str(ex)))
def getElementsForParentData(self, data):
    """Get active elements for this data """
    query = {'key': data, 'include_docs': True}
    response = self.db.loadView('WorkQueue', 'elementsByParentData', query)
    matches = []
    for row in response.get('rows', []):
        matches.append(CouchWorkQueueElement.fromDocument(self.db, row['doc']))
    return matches
def getElementsForWorkflow(self, workflow):
    """Get elements for a workflow"""
    rows = self.db.loadView(
        "WorkQueue", "elementsByWorkflow",
        {"key": workflow, "include_docs": True, "reduce": False}
    ).get("rows", [])
    elements = []
    for entry in rows:
        elements.append(CouchWorkQueueElement.fromDocument(self.db, entry["doc"]))
    return elements
def getElementsForParent(self, parent):
    """Get elements with the given parent"""
    query = {'key' : parent.id, 'include_docs' : True}
    response = self.db.loadView('WorkQueue', 'elementsByParent', query)
    results = []
    for entry in response.get('rows', []):
        results.append(CouchWorkQueueElement.fromDocument(self.db, entry['doc']))
    return results
def availableWork(self, thresholds, siteJobCounts, team=None, wfs=None,
                  excludeWorkflows=None, numElems=9999999):
    """
    Get work which is available to be run

    Assume thresholds is a dictionary; keys are the site name, values are
    the maximum number of running jobs at that site.

    Assumes site_job_counts is a dictionary-of-dictionaries; keys are the site
    name and task priorities. The value is the number of jobs running at that
    priority.

    It will pull work until it reaches the number of elements configured (numElems).
    Since it's also used for calculating free resources, default it to "infinity"

    Note: this method will be called with no limit of work elements when it's
    simply calculating the resources available (based on what is in LQ), before
    it gets work from GQ
    """
    self.logger.info("Getting up to %d available work from %s", numElems, self.queueUrl)
    self.logger.info(" for team name: %s", team)
    self.logger.info(" for wfs: %s", wfs)
    self.logger.info(" with excludeWorkflows: %s", excludeWorkflows)
    self.logger.info(" for thresholds: %s", thresholds)
    excludeWorkflows = excludeWorkflows or []
    elements = []
    sortedElements = []
    # We used to pre-filter sites, looking to see if there are idle job slots
    # We don't do this anymore, as we may over-allocate
    # jobs to sites if the new jobs have a higher priority.
    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds is set: Please check")
        return elements, thresholds, siteJobCounts
    options = {}
    options['include_docs'] = True
    options['descending'] = True
    # NOTE: element-count limiting is delegated to the couch list via
    # num_elem; numElems is not decremented locally in this revision.
    options['num_elem'] = numElems
    options['resources'] = thresholds
    if team:
        options['team'] = team
    if wfs:
        result = []
        # query in batches of 20 workflows -- presumably to bound the
        # request URL length; TODO confirm
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i + 20]
            data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result.extend(json.loads(data))
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
        result = json.loads(result)
    if not result:
        self.logger.info("No available work or it did not pass work/data restrictions for: %s ", self.queueUrl)
    else:
        self.logger.info("Retrieved %d elements from workRestrictions list for: %s", len(result), self.queueUrl)
    # Iterate through the results; apply whitelist / blacklist / data
    # locality restrictions. Only assign jobs if they are high enough
    # priority.
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        # make sure not to acquire work for aborted or force-completed workflows
        if element['RequestName'] in excludeWorkflows:
            msg = "Skipping aborted/force-completed workflow: %s, work id: %s"
            self.logger.info(msg, element['RequestName'], element._id)
        else:
            sortedElements.append(element)
    # sort elements to get them in priority first and timestamp order
    # (stable sort: second pass keeps CreationTime order within a priority)
    sortedElements.sort(key=lambda element: element['CreationTime'])
    sortedElements.sort(key=lambda x: x['Priority'], reverse=True)
    sites = thresholds.keys()
    self.logger.info("Current siteJobCounts:")
    for site, jobsByPrio in siteJobCounts.items():
        self.logger.info(" %s : %s", site, jobsByPrio)
    for element in sortedElements:
        commonSites = possibleSites(element)
        prio = element['Priority']
        possibleSite = None
        # reshuffle per element so no site is systematically favoured
        random.shuffle(sites)
        for site in sites:
            if site in commonSites:
                # Count the number of jobs currently running of greater priority
                curJobCount = sum([x[1] if x[0] >= prio else 0
                                   for x in siteJobCounts.get(site, {}).items()])
                self.logger.debug("Job Count: %s, site: %s thresholds: %s" % (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break
        if possibleSite:
            elements.append(element)
            siteJobCounts.setdefault(possibleSite, {})
            siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + \
                                                element['Jobs'] * element.get('blowupFactor', 1.0)
        else:
            self.logger.debug("No available resources for %s with doc id %s", element['RequestName'], element.id)
    self.logger.info("And %d elements passed location and siteJobCounts restrictions for: %s", len(elements), self.queueUrl)
    return elements, thresholds, siteJobCounts
def availableWork(self, thresholds, siteJobCounts, teams = None, wfs = None):
    """
    Get work which is available to be run

    Assume thresholds is a dictionary; keys are the site name, values are
    the maximum number of running jobs at that site.

    Assumes site_job_counts is a dictionary-of-dictionaries; keys are the site
    name and task priorities. The value is the number of jobs running at that
    priority.

    :param teams: optional list of team names to restrict the query to
    :param wfs: optional list of workflow names to restrict the query to
    :return: tuple (accepted elements, thresholds, updated siteJobCounts)
    """
    self.logger.info("Getting available work from %s/%s" % (sanitizeURL(self.server.url)['url'], self.db.name))
    elements = []
    # We used to pre-filter sites, looking to see if there are idle job slots
    # We don't do this anymore, as we may over-allocate
    # jobs to sites if the new jobs have a higher priority.
    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds is set: Please check")
        return elements, thresholds, siteJobCounts
    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    if teams:
        options['teams'] = teams
        self.logger.info("setting teams %s" % teams)
    if wfs:
        result = []
        # batch the workflow restriction to keep request size bounded
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i+20]
            data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result.extend(json.loads(data))
        # sort final list; batching loses the view's priority ordering
        result.sort(key = lambda x: x['WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement']['Priority'])
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
        result = json.loads(result)
    if len(result) == 0:
        self.logger.info("""No available work in WQ or didn't pass workqueue restriction - check Pileup, site white list, etc""")
    self.logger.debug("Available Work:\n %s \n for resources\n %s" % (result, thresholds))
    # Iterate through the results; apply whitelist / blacklist / data
    # locality restrictions. Only assign jobs if they are high enough
    # priority.
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        prio = element['Priority']
        possibleSite = None
        sites = thresholds.keys()
        random.shuffle(sites)
        for site in sites:
            if element.passesSiteRestriction(site):
                # Count the number of jobs currently running of greater priority
                curJobCount = sum(map(lambda x : x[1] if x[0] >= prio else 0, siteJobCounts.get(site, {}).items()))
                # Fix: log message typo "threshods" -> "thresholds"
                self.logger.debug("Job Count: %s, site: %s thresholds: %s" % (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break
        if possibleSite:
            self.logger.debug("Possible site exists %s" % str(possibleSite))
            elements.append(element)
            # Fix: book-keep against possibleSite explicitly instead of the
            # leaked loop variable `site` (identical today only because the
            # inner loop breaks on assignment; fragile if the loop changes --
            # later revisions of this method use possibleSite).
            if possibleSite not in siteJobCounts:
                siteJobCounts[possibleSite] = {}
            siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + element['Jobs']*element.get('blowupFactor', 1.0)
        else:
            self.logger.info("No possible site for %s" % element['RequestName'])
    # sort elements to get them in priority first and timestamp order
    elements.sort(key=lambda element: element['CreationTime'])
    elements.sort(key = lambda x: x['Priority'], reverse = True)
    return elements, thresholds, siteJobCounts
def availableWork(self, thresholds, siteJobCounts, teams=None, wfs=None,
                  excludeWorkflows=None, numElems=9999999):
    """
    Get work which is available to be run

    Assume thresholds is a dictionary; keys are the site name, values are
    the maximum number of running jobs at that site.

    Assumes site_job_counts is a dictionary-of-dictionaries; keys are the site
    name and task priorities. The value is the number of jobs running at that
    priority.

    It will pull work until it reaches the number of elements configured (numElems).
    Since it's also used for calculating free resources, default it to "infinity"

    Note: this method will be called with no limit of work elements when it's
    simply calculating the resources available (based on what is in LQ), before
    it gets work from GQ
    """
    self.logger.info("Getting up to %d available work from %s", numElems, self.queueUrl)
    excludeWorkflows = excludeWorkflows or []
    elements = []
    sortedElements = []
    # We used to pre-filter sites, looking to see if there are idle job slots
    # We don't do this anymore, as we may over-allocate
    # jobs to sites if the new jobs have a higher priority.
    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds is set: Please check")
        return elements, thresholds, siteJobCounts
    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    if teams:
        options['teams'] = teams
        self.logger.info("setting teams %s" % teams)
    if wfs:
        result = []
        # batch the workflow restriction (20 per request)
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i + 20]
            data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result.extend(json.loads(data))
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
        result = json.loads(result)
    if len(result) == 0:
        self.logger.info("""No available work in WQ or didn't pass workqueue restriction - check Pileup, site white list, etc""")
    self.logger.debug("Available Work:\n %s \n for resources\n %s" % (result, thresholds))
    # Iterate through the results; apply whitelist / blacklist / data
    # locality restrictions. Only assign jobs if they are high enough
    # priority.
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        # drop workflows in the exclusion list before ranking
        if element['RequestName'] not in excludeWorkflows:
            sortedElements.append(element)
    # sort elements to get them in priority first and timestamp order
    # (stable sort: second pass keeps CreationTime order within a priority)
    sortedElements.sort(key=lambda element: element['CreationTime'])
    sortedElements.sort(key=lambda x: x['Priority'], reverse=True)
    for element in sortedElements:
        # honour the caller-supplied acceptance cap
        if numElems <= 0:
            self.logger.info("Reached the maximum number of elements to be pulled: %d", len(elements))
            break
        if not possibleSites(element):
            self.logger.info("No possible sites for %s with doc id %s", element['RequestName'], element.id)
            continue
        prio = element['Priority']
        possibleSite = None
        sites = thresholds.keys()
        random.shuffle(sites)
        for site in sites:
            if element.passesSiteRestriction(site):
                # Count the number of jobs currently running of greater priority
                curJobCount = sum(map(lambda x: x[1] if x[0] >= prio else 0,
                                      siteJobCounts.get(site, {}).items()))
                self.logger.debug("Job Count: %s, site: %s thresholds: %s" % (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break
        if possibleSite:
            numElems -= 1
            self.logger.debug("Possible site exists %s" % str(possibleSite))
            elements.append(element)
            if possibleSite not in siteJobCounts:
                siteJobCounts[possibleSite] = {}
            siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + \
                                                element['Jobs'] * element.get('blowupFactor', 1.0)
        else:
            self.logger.debug("No available resources for %s with doc id %s", element['RequestName'], element.id)
    return elements, thresholds, siteJobCounts
def calculateAvailableWork(self, thresholds, siteJobCounts):
    """
    A short version of the `availableWork` method, which is used only to
    calculate the amount of work already available at the local workqueue.
    :param thresholds: a dictionary key'ed by the site name, values representing
        the maximum number of jobs allowed at that site.
    :param siteJobCounts: a dictionary-of-dictionaries key'ed by the site name;
        value is a dictionary with the number of jobs running at a given priority.
    :return: a tuple with the elements accepted and an overview of job counts per site
    """
    # NOTE: this method can be less verbose as well
    elements = []
    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds is set: Please check")
        return elements, siteJobCounts
    self.logger.info("Calculating available work from queue %s", self.queueUrl)

    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    options['num_elem'] = 9999999  # magic number!
    result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
    result = json.loads(result)
    self.logger.info("Retrieved %d elements from workRestrictions list for: %s",
                     len(result), self.queueUrl)

    # Convert python dictionary into Couch WQE objects
    # And sort them by creation time and priority, such that highest priority and
    # oldest elements come first in the list
    sortedElements = []
    for item in result:
        element = CouchWorkQueueElement.fromDocument(self.db, item)
        sortedElements.append(element)
    sortAvailableElements(sortedElements)

    for element in sortedElements:
        commonSites = possibleSites(element)
        prio = element['Priority']
        # shuffle list of common sites all the time to give everyone the same chance
        random.shuffle(commonSites)
        possibleSite = None
        for site in commonSites:
            if site in thresholds:
                # Count the number of jobs currently running of greater priority, if they
                # are less than the site thresholds, then accept this element
                curJobCount = sum([x[1] if x[0] >= prio else 0
                                   for x in viewitems(siteJobCounts.get(site, {}))])
                self.logger.debug("Job Count: %s, site: %s thresholds: %s",
                                  curJobCount, site, thresholds[site])
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break
        if possibleSite:
            self.logger.debug("Meant to accept workflow: %s, with prio: %s, element id: %s, for site: %s",
                              element['RequestName'], prio, element.id, possibleSite)
            elements.append(element)
            # fold this element's (blown-up) job count into the site's tally
            siteJobCounts.setdefault(possibleSite, {})
            siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + \
                                                element['Jobs'] * element.get('blowupFactor', 1.0)
        else:
            self.logger.debug("No available resources for %s with localdoc id %s",
                              element['RequestName'], element.id)
    self.logger.info("And %d elements passed location and siteJobCounts restrictions for: %s",
                     len(elements), self.queueUrl)
    return elements, siteJobCounts
def availableWork(self, thresholds, siteJobCounts, team=None,
                  excludeWorkflows=None, numElems=9999999):
    """
    Get work - either from local or global queue - which is available to be run.
    :param thresholds: a dictionary key'ed by the site name, values representing
        the maximum number of jobs allowed at that site.
    :param siteJobCounts: a dictionary-of-dictionaries key'ed by the site name;
        value is a dictionary with the number of jobs running at a given priority.
    :param team: a string with the team name we want to pull work for
    :param excludeWorkflows: list of (aborted) workflows that should not be accepted
    :param numElems: integer with the maximum number of elements to be accepted (default
        to a very large number when pulling work from local queue, read unlimited)
    :return: a tuple with the elements accepted and an overview of job counts per site
    """
    excludeWorkflows = excludeWorkflows or []
    elements = []
    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds is set: Please check")
        return elements, siteJobCounts

    self.logger.info("Current siteJobCounts:")
    for site, jobsByPrio in viewitems(siteJobCounts):
        self.logger.info("    %s : %s", site, jobsByPrio)

    self.logger.info("Getting up to %d available work from %s", numElems, self.queueUrl)
    self.logger.info(" for team name: %s", team)
    self.logger.info(" with excludeWorkflows: %s", excludeWorkflows)
    self.logger.info(" for thresholds: %s", thresholds)

    # FIXME: magic numbers
    docsSliceSize = 1000
    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    options['limit'] = docsSliceSize
    # FIXME: num_elem option can likely be deprecated, but it needs synchronization
    # between agents and global workqueue... for now, make sure it can return the slice size
    options['num_elem'] = docsSliceSize
    if team:
        options['team'] = team

    # Fetch workqueue elements in slices, using the CouchDB "limit" and "skip"
    # options for couch views. Conditions to stop this loop are:
    # a) have a hard stop at 50k+1 (we might have to make this configurable)
    # b) stop as soon as an empty slice is returned by Couch (thus all docs have
    #    already been retrieve)
    # c) or, once "numElems" elements have been accepted
    numSkip = 0
    breakOut = False
    while True:
        if breakOut:
            # then we have reached the maximum number of elements to be accepted
            break
        self.logger.info(" with limit docs: %s, and skip first %s docs", docsSliceSize, numSkip)
        options['skip'] = numSkip
        result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
        result = json.loads(result)
        if result:
            self.logger.info("Retrieved %d elements from workRestrictions list for: %s",
                             len(result), self.queueUrl)
        else:
            self.logger.info("All the workqueue elements have been exhausted for: %s ", self.queueUrl)
            break
        # update number of documents to skip in the next cycle
        numSkip += docsSliceSize

        # Convert python dictionary into Couch WQE objects, skipping aborted workflows
        # And sort them by creation time and priority, such that highest priority and
        # oldest elements come first in the list
        sortedElements = []
        for i in result:
            element = CouchWorkQueueElement.fromDocument(self.db, i)
            # make sure not to acquire work for aborted or force-completed workflows
            if element['RequestName'] in excludeWorkflows:
                msg = "Skipping aborted/force-completed workflow: %s, work id: %s"
                self.logger.info(msg, element['RequestName'], element._id)
            else:
                sortedElements.append(element)
        sortAvailableElements(sortedElements)

        for element in sortedElements:
            if numElems <= 0:
                msg = "Reached maximum number of elements to be accepted, "
                msg += "configured to: {}, from queue: {}".format(len(elements), self.queueUrl)
                self.logger.info(msg)
                breakOut = True  # get out of the outer loop as well
                break
            commonSites = possibleSites(element)
            prio = element['Priority']
            # shuffle list of common sites all the time to give everyone the same chance
            random.shuffle(commonSites)
            possibleSite = None
            for site in commonSites:
                if site in thresholds:
                    # Count the number of jobs currently running of greater priority, if they
                    # are less than the site thresholds, then accept this element
                    curJobCount = sum([x[1] if x[0] >= prio else 0
                                       for x in viewitems(siteJobCounts.get(site, {}))])
                    self.logger.debug("Job Count: %s, site: %s thresholds: %s" % (curJobCount, site, thresholds[site]))
                    if curJobCount < thresholds[site]:
                        possibleSite = site
                        break
            if possibleSite:
                self.logger.info("Accepting workflow: %s, with prio: %s, element id: %s, for site: %s",
                                 element['RequestName'], prio, element.id, possibleSite)
                numElems -= 1
                elements.append(element)
                siteJobCounts.setdefault(possibleSite, {})
                siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + \
                                                    element['Jobs'] * element.get('blowupFactor', 1.0)
            else:
                self.logger.debug("No available resources for %s with doc id %s",
                                  element['RequestName'], element.id)

    self.logger.info("And %d elements passed location and siteJobCounts restrictions for: %s",
                     len(elements), self.queueUrl)
    return elements, siteJobCounts
def getElementsForPileupData(self, data):
    """Get active elements for this data """
    response = self.db.loadView("WorkQueue", "elementsByPileupData",
                                {"key": data, "include_docs": True})
    found = []
    for entry in response.get("rows", []):
        found.append(CouchWorkQueueElement.fromDocument(self.db, entry["doc"]))
    return found
def availableWork(self, thresholds, siteJobCounts, team=None, wfs=None,
                  excludeWorkflows=None, numElems=9999999):
    """
    Get work which is available to be run

    Assume thresholds is a dictionary; keys are the site name, values are
    the maximum number of running jobs at that site.

    Assumes site_job_counts is a dictionary-of-dictionaries; keys are the site
    name and task priorities. The value is the number of jobs running at that
    priority.

    It will pull work until it reaches the number of elements configured (numElems).
    Since it's also used for calculating free resources, default it to "infinity"

    Note: this method will be called with no limit of work elements when it's
    simply calculating the resources available (based on what is in LQ), before
    it gets work from GQ
    """
    self.logger.info("Getting up to %d available work from %s", numElems, self.queueUrl)
    excludeWorkflows = excludeWorkflows or []
    elements = []
    sortedElements = []
    # We used to pre-filter sites, looking to see if there are idle job slots
    # We don't do this anymore, as we may over-allocate
    # jobs to sites if the new jobs have a higher priority.
    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds is set: Please check")
        return elements, thresholds, siteJobCounts
    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    if team:
        options['team'] = team
        self.logger.info("setting team to %s" % team)
    if wfs:
        result = []
        # batch the workflow restriction (20 per request)
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i + 20]
            data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result.extend(json.loads(data))
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
        result = json.loads(result)
    if len(result) == 0:
        self.logger.info("""No available work in WQ or didn't pass workqueue restriction - check Pileup, site white list, etc""")
    self.logger.debug("Available Work:\n %s \n for resources\n %s" % (result, thresholds))
    # Iterate through the results; apply whitelist / blacklist / data
    # locality restrictions. Only assign jobs if they are high enough
    # priority.
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        # drop workflows in the exclusion list before ranking
        if element['RequestName'] not in excludeWorkflows:
            sortedElements.append(element)
    # sort elements to get them in priority first and timestamp order
    # (stable sort: second pass keeps CreationTime order within a priority)
    sortedElements.sort(key=lambda element: element['CreationTime'])
    sortedElements.sort(key=lambda x: x['Priority'], reverse=True)
    for element in sortedElements:
        # honour the caller-supplied acceptance cap
        if numElems <= 0:
            self.logger.info("Reached the maximum number of elements to be pulled: %d", len(elements))
            break
        if not possibleSites(element):
            self.logger.info("No possible sites for %s with doc id %s", element['RequestName'], element.id)
            continue
        prio = element['Priority']
        possibleSite = None
        sites = thresholds.keys()
        random.shuffle(sites)
        for site in sites:
            if element.passesSiteRestriction(site):
                # Count the number of jobs currently running of greater priority
                curJobCount = sum([x[1] if x[0] >= prio else 0
                                   for x in siteJobCounts.get(site, {}).items()])
                self.logger.debug("Job Count: %s, site: %s thresholds: %s" % (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break
        if possibleSite:
            numElems -= 1
            self.logger.debug("Possible site exists %s" % str(possibleSite))
            elements.append(element)
            if possibleSite not in siteJobCounts:
                siteJobCounts[possibleSite] = {}
            siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + \
                                                element['Jobs'] * element.get('blowupFactor', 1.0)
        else:
            self.logger.debug("No available resources for %s with doc id %s", element['RequestName'], element.id)
    return elements, thresholds, siteJobCounts
def getElementsForParent(self, parent):
    """Get elements with the given parent"""
    # Pull every element keyed on this parent's id, with full documents attached.
    view_rows = self.db.loadView("WorkQueue", "elementsByParent",
                                 {"key": parent.id, "include_docs": True}).get("rows", [])
    return [CouchWorkQueueElement.fromDocument(self.db, row["doc"]) for row in view_rows]
def availableWork(self, thresholds, siteJobCounts, teams=None, wfs=None):
    """
    Get work which is available to be run.

    :param thresholds: dict; keys are site names, values are the maximum
        number of running jobs allowed at that site.
    :param siteJobCounts: dict-of-dicts; keys are site names, inner keys are
        task priorities, inner values are the number of jobs running at that
        priority.  Updated in place as work is assigned.
    :param teams: optional team list used to restrict the couch query.
    :param wfs: optional workflow-name list used to restrict the couch query.
    :returns: (elements, thresholds, siteJobCounts) — elements is the list of
        CouchWorkQueueElements that may run, sorted by priority (descending)
        then creation time.
    """
    self.logger.info("Getting available work from %s/%s" %
                     (sanitizeURL(self.server.url)['url'], self.db.name))
    elements = []
    # We used to pre-filter sites, looking to see if there are idle job slots.
    # We don't do this anymore, as we may over-allocate jobs to sites if the
    # new jobs have a higher priority.

    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds is set: Please check")
        return elements, thresholds, siteJobCounts

    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    if teams:
        options['teams'] = teams
        self.logger.info("setting teams %s" % teams)
    if wfs:
        result = []
        # Query in batches of 20 workflows to keep each request bounded.
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i + 20]
            data = self.db.loadList('WorkQueue', 'workRestrictions',
                                    'availableByPriority', options)
            result.extend(json.loads(data))
        # sort final list
        result.sort(key=lambda x: x['WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement']['Priority'])
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions',
                                  'availableByPriority', options)
        result = json.loads(result)

    if len(result) == 0:
        self.logger.info("""No available work in WQ or didn't pass workqueue restriction - check Pileup, site white list, etc""")
    self.logger.debug("Available Work:\n %s \n for resources\n %s" % (result, thresholds))

    # Iterate through the results; apply whitelist / blacklist / data
    # locality restrictions.  Only assign jobs if they are high enough
    # priority.
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        prio = element['Priority']

        possibleSite = None
        # list() so random.shuffle gets a mutable sequence (dict.keys() is a
        # view in python 3); harmless on python 2.
        sites = list(thresholds.keys())
        random.shuffle(sites)
        for site in sites:
            if element.passesSiteRestriction(site):
                # Count the number of jobs currently running of greater priority
                curJobCount = sum([x[1] if x[0] >= prio else 0
                                   for x in siteJobCounts.get(site, {}).items()])
                # fixed typo: "threshods" -> "thresholds"
                self.logger.debug("Job Count: %s, site: %s thresholds: %s" %
                                  (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break

        if possibleSite:
            elements.append(element)
            # Use the chosen site explicitly (the loop variable happens to
            # equal possibleSite after break, but this is clearer and safer).
            if possibleSite not in siteJobCounts:
                siteJobCounts[possibleSite] = {}
            siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + element['Jobs']
        else:
            self.logger.info("No possible site for %s" % element)

    # sort elements to get them in priority first and timestamp order
    elements.sort(key=lambda element: element['CreationTime'])
    elements.sort(key=lambda x: x['Priority'], reverse=True)

    return elements, thresholds, siteJobCounts
def availableWork(self, thresholds, siteJobCounts, teams = None, wfs = None):
    """
    Get work which is available to be run.

    :param thresholds: dict; keys are site names, values are the maximum
        number of running jobs allowed at that site.
    :param siteJobCounts: dict-of-dicts; keys are site names, inner keys are
        task priorities, inner values are the number of jobs running at that
        priority.  Updated in place as work is assigned.
    :param teams: optional team list used to restrict the couch query.
    :param wfs: optional workflow-name list used to restrict the couch query.
    :returns: (elements, thresholds, siteJobCounts) — elements is the list of
        CouchWorkQueueElements that may run, sorted by creation time.
    """
    elements = []
    # We used to pre-filter sites, looking to see if there are idle job slots.
    # We don't do this anymore, as we may over-allocate jobs to sites if the
    # new jobs have a higher priority.

    # If there are no sites, punt early.
    if not thresholds:
        return elements, thresholds, siteJobCounts

    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    if teams:
        options['teams'] = teams
    if wfs:
        result = []
        # Query in batches of 20 workflows to keep each request bounded.
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i+20]
            data = self.db.loadList('WorkQueue', 'workRestrictions',
                                    'availableByPriority', options)
            result.extend(json.loads(data))
        # sort final list
        result.sort(key = lambda x: x['WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement']['Priority'])
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions',
                                  'availableByPriority', options)
        result = json.loads(result)

    # Iterate through the results; apply whitelist / blacklist / data
    # locality restrictions.  Only assign jobs if they are high enough
    # priority.
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        prio = element['Priority']

        possibleSite = None
        # list() so random.shuffle gets a mutable sequence (dict.keys() is a
        # view in python 3); harmless on python 2.
        sites = list(thresholds.keys())
        random.shuffle(sites)
        for site in sites:
            if element.passesSiteRestriction(site):
                # Count the number of jobs currently running of greater priority
                curJobCount = sum([x[1] if x[0] >= prio else 0
                                   for x in siteJobCounts.get(site, {}).items()])
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break

        if possibleSite:
            elements.append(element)
            # Use the chosen site explicitly (the loop variable happens to
            # equal possibleSite after break, but this is clearer and safer).
            if possibleSite not in siteJobCounts:
                siteJobCounts[possibleSite] = {}
            siteJobCounts[possibleSite][prio] = siteJobCounts[possibleSite].setdefault(prio, 0) + element['Jobs']

    # sort elements to get them in timestamp order
    elements = sorted(elements, key=lambda element: element['CreationTime'])

    return elements, thresholds, siteJobCounts