def testBkQuery(self, bkQuery, printOutput=False, bkClient=None):
  """Run a Bookkeeping query and report how many files it returns.

  :param bkQuery: query dictionary handed to BookkeepingClient.getFiles()
  :param printOutput: when True, pretty print the raw BK result
  :param bkClient: optional BookkeepingClient; a fresh one is built when None
  :return: S_OK(list of files) on success, error report otherwise
  """
  client = BookkeepingClient() if bkClient is None else bkClient
  res = client.getFiles(bkQuery)
  if res['OK']:
    gLogger.info('The supplied query returned %d files' % len(res['Value']))
    if printOutput:
      self._prettyPrint(res)
    return S_OK(res['Value'])
  return self._errorReport(res, 'Failed to perform BK query')
class BookkeepingWatchAgent(AgentModule, TransformationAgentsUtilities):
  """ LHCbDIRAC only agent. A threaded agent.

      Watches transformations that carry a Bookkeeping query: periodically
      re-runs the query, and feeds any new files (with their metadata and,
      when relevant, run information) into the transformation.
  """

  def __init__(self, *args, **kwargs):
    """ c'tor
    """
    AgentModule.__init__(self, *args, **kwargs)
    TransformationAgentsUtilities.__init__(self)
    # Queue of transformation IDs to be processed by the worker threads
    self.bkQueriesToBeChecked = Queue.Queue()
    # IDs currently queued or being processed (guards against double-queueing)
    self.bkQueriesInCheck = []
    # Defaults, overridable via agent options in initialize()
    self.fullUpdatePeriod = 86400
    self.bkUpdateLatency = 7200
    self.debug = False
    # Maps transID -> thread label, used to know which threads are busy
    self.transInThread = {}
    # Persistence file for timeLog / fullTimeLog / bkQueries between restarts
    self.pickleFile = 'BookkeepingWatchAgent.pkl'
    self.chunkSize = 1000
    # Plugins for which no run information needs to be attached to files
    self.pluginsWithNoRunInfo = ['LHCbStandard', 'ReplicateDataset', 'ArchiveDataset',
                                 'LHCbMCDSTBroadcastRandom', 'ReplicateToLocalSE',
                                 'RemoveReplicas', 'RemoveReplicasWhenProcessed',
                                 'RemoveReplicasWithAncestors', 'ReplicateWithAncestors',
                                 'ReduceReplicas', 'RemoveDatasetFromDisk', 'DestroyDataset',
                                 'DestroyDatasetWhenProcessed', 'BySize', 'Standard']
    # Per-transformation time stamps of last (reduced) and last full BK query
    self.timeLog = {}
    self.fullTimeLog = {}
    # Last BK query used per transformation, to detect query changes
    self.bkQueries = {}
    # Clients are created in initialize()
    self.transClient = None
    self.bkClient = None

  def initialize(self):
    """ Make the necessary initializations.
        The ThreadPool is created here, the _execute() method is what each thread will execute.
    """
    self.fullUpdatePeriod = self.am_getOption('FullUpdatePeriod', self.fullUpdatePeriod)
    self.bkUpdateLatency = self.am_getOption('BKUpdateLatency', self.bkUpdateLatency)
    self.debug = self.am_getOption('verbose', self.debug)
    self.pickleFile = os.path.join(self.am_getWorkDirectory(), self.pickleFile)
    self.chunkSize = self.am_getOption('maxFilesPerChunk', self.chunkSize)
    self.pluginsWithNoRunInfo = Operations().getValue('TransformationPlugins/PluginsWithNoRunInfo',
                                                      self.pluginsWithNoRunInfo)
    self._logInfo('Full Update Period: %d seconds' % self.fullUpdatePeriod)
    self._logInfo('BK update latency : %d seconds' % self.bkUpdateLatency)
    self._logInfo('Plugins with no run info: %s' % ', '.join(self.pluginsWithNoRunInfo))
    self.transClient = TransformationClient()
    self.bkClient = BookkeepingClient()
    # Restore persisted state from the previous agent run, if any;
    # the three objects were dumped sequentially into the same file (see __dumpLog)
    try:
      with open(self.pickleFile, 'r') as pf:
        self.timeLog = pickle.load(pf)
        self.fullTimeLog = pickle.load(pf)
        self.bkQueries = pickle.load(pf)
      self._logInfo("successfully loaded Log from", self.pickleFile, "initialize")
    except (EOFError, IOError):
      # Missing or truncated pickle: start from a clean state
      self._logInfo("failed loading Log from", self.pickleFile, "initialize")
      self.timeLog = {}
      self.fullTimeLog = {}
      self.bkQueries = {}
    # Spawn the worker threads; each runs _execute() forever
    maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
    threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)
    for i in xrange(maxNumberOfThreads):
      threadPool.generateJobAndQueueIt(self._execute, [i])
    gMonitor.registerActivity("Iteration", "Agent Loops", AGENT_NAME, "Loops/min", gMonitor.OP_SUM)
    return S_OK()

  @gSynchro
  def __dumpLog(self):
    """ dump the log in the pickle file
    """
    if self.pickleFile:
      try:
        with open(self.pickleFile, 'w') as pf:
          # Dump order must match the load order in initialize()
          pickle.dump(self.timeLog, pf)
          pickle.dump(self.fullTimeLog, pf)
          pickle.dump(self.bkQueries, pf)
        self._logVerbose("successfully dumped Log into %s" % self.pickleFile)
      except IOError as e:
        self._logError("fail to open %s: %s" % (self.pickleFile, e))
      except pickle.PickleError as e:
        self._logError("fail to dump %s: %s" % (self.pickleFile, e))
      except ValueError as e:
        self._logError("fail to close %s: %s" % (self.pickleFile, e))

  ################################################################################
  def execute(self):
    """ Main execution method. Just fills a list, and a queue, with BKKQueries ID.
    """
    gMonitor.addMark('Iteration', 1)
    # Get all the transformations
    result = self.transClient.getTransformations(condDict={'Status': ['Active', 'Idle']})
    if not result['OK']:
      self._logError("Failed to get transformations.", result['Message'])
      return S_OK()
    transIDsList = [long(transDict['TransformationID']) for transDict in result['Value']]
    res = self.transClient.getTransformationsWithBkQueries(transIDsList)
    if not res['OK']:
      self._logError("Failed to get transformations with Bk Queries.", res['Message'])
      return S_OK()
    transIDsWithBkQueriesList = res['Value']
    _count = 0
    # Process each transformation
    for transID in transIDsWithBkQueriesList:
      # Skip IDs already queued or in a worker thread
      if transID in self.bkQueriesInCheck:
        continue
      self.bkQueriesInCheck.append(transID)
      self.bkQueriesToBeChecked.put(transID)
      _count += 1
    self._logInfo("Out of %d transformations, %d put in thread queue" % (len(result['Value']), _count))
    self.__dumpLog()
    return S_OK()

  def _execute(self, threadID):
    """ Real executor. This is what is executed by the single threads - so do not return here! Just continue
    """
    while True:  # not self.bkQueriesToBeChecked.empty():
      transID = None
      try:
        # Blocks until an ID is available on the queue
        transID = self.bkQueriesToBeChecked.get()
        self.transInThread[transID] = ' [Thread%d] [%s] ' % (threadID, str(transID))
        startTime = time.time()
        self._logInfo("Processing transformation %s." % transID, transID=transID)
        res = self.transClient.getTransformation(transID, extraParams=False)
        if not res['OK']:
          self._logError("Failed to get transformation", res['Message'], transID=transID)
          continue
        transPlugin = res['Value']['Plugin']
        res = self.transClient.getBookkeepingQuery(transID)
        if not res['OK']:
          self._logError("Failed to get BkQuery", res['Message'], transID=transID)
          continue
        bkQuery = res['Value']
        # Determine the correct time stamp to use for this transformation
        now = datetime.datetime.utcnow()
        self.__timeStampForTransformation(transID, bkQuery, now)
        try:
          files = self.__getFiles(transID, bkQuery, now)
        except RuntimeError as e:
          # In case we failed a full query, we should retry full query until successful
          if 'StartDate' not in bkQuery:
            self.bkQueries.pop(transID, None)
          self._logError("Failed to get response from the Bookkeeping: %s" % e, "", "__getFiles", transID)
          continue
        runDict = {}
        filesMetadata = {}
        # get the files metadata
        for lfnChunk in breakListIntoChunks(files, self.chunkSize):
          start = time.time()
          res = self.bkClient.getFileMetadata(lfnChunk)
          self._logVerbose("Got metadata from BK for %d files" % len(lfnChunk), transID=transID, reftime=start)
          if not res['OK']:
            self._logError("Failed to get BK metadata for %d files" % len(lfnChunk),
                           res['Message'], transID=transID)
            # No need to return as we only consider files that are successful...
          else:
            filesMetadata.update(res['Value']['Successful'])
        # There is no need to add the run information for a transformation that doesn't need it
        if transPlugin not in self.pluginsWithNoRunInfo:
          # Group the LFNs by run number
          for lfn, metadata in filesMetadata.iteritems():
            runID = metadata.get('RunNumber', None)
            if isinstance(runID, (basestring, int, long)):
              runDict.setdefault(int(runID), []).append(lfn)
          try:
            self.__addRunsMetadata(transID, runDict.keys())
          except RuntimeError as e:
            self._logException("Failure adding runs metadata",
                               method="__addRunsMetadata",
                               lException=e,
                               transID=transID)
        else:
          # Single bucket keyed by None when run info is not needed
          runDict[None] = filesMetadata.keys()
        # Add all new files to the transformation
        for runID in sorted(runDict):
          lfnList = runDict[runID]
          # We enter all files of a run at once, otherwise do it by chunks
          lfnChunks = [lfnList] if runID else breakListIntoChunks(lfnList, self.chunkSize)
          for lfnChunk in lfnChunks:
            # Add the files to the transformation
            self._logVerbose('Adding %d lfns for transformation' % len(lfnChunk), transID=transID)
            result = self.transClient.addFilesToTransformation(transID, lfnChunk)
            if not result['OK']:
              self._logError("Failed to add %d lfns to transformation" % len(lfnChunk),
                             result['Message'], transID=transID)
              # NOTE(review): returning here terminates this worker thread for good
              # (the docstring says "do not return here") - confirm this is intended
              return result
            else:
              # Handle errors
              errors = {}
              for lfn, error in result['Value']['Failed'].iteritems():
                errors.setdefault(error, []).append(lfn)
              for error, lfns in errors.iteritems():
                self._logWarn("Failed to add files to transformation", error, transID=transID)
                self._logVerbose("\n\t".join([''] + lfns))
              # Add the metadata and RunNumber to the newly inserted files
              addedLfns = [lfn for (lfn, status) in result['Value']['Successful'].iteritems()
                           if status == 'Added']
              if addedLfns:
                # Add files metadata: size and file type
                lfnDict = dict((lfn, {'Size': filesMetadata[lfn]['FileSize'],
                                      'FileType': filesMetadata[lfn]['FileType']})
                               for lfn in addedLfns)
                res = self.transClient.setParameterToTransformationFiles(transID, lfnDict)
                if not res['OK']:
                  self._logError("Failed to set transformation files metadata", res['Message'])
                  # NOTE(review): same thread-terminating return as above
                  return res
                # Add run information if it exists
                if runID:
                  self._logInfo("Added %d files to transformation for run %d, now including run information"
                                % (len(addedLfns), runID), transID=transID)
                  self._logVerbose("Associating %d files to run %d" % (len(addedLfns), runID), transID=transID)
                  res = self.transClient.addTransformationRunFiles(transID, runID, addedLfns)
                  if not res['OK']:
                    self._logError("Failed to associate %d files to run %d" % (len(addedLfns), runID),
                                   res['Message'], transID=transID)
                    return res
                else:
                  self._logInfo("Added %d files to transformation" % len(addedLfns), transID=transID)
      except Exception as x:  # pylint: disable=broad-except
        self._logException('Exception while adding files to transformation', lException=x,
                           method='_execute', transID=transID)
      finally:
        # Always release the transformation so execute() can queue it again
        self._logInfo("Processed transformation", transID=transID, reftime=startTime)
        if transID in self.bkQueriesInCheck:
          self.bkQueriesInCheck.remove(transID)
        self.transInThread.pop(transID, None)
    return S_OK()

  @gSynchro
  def __timeStampForTransformation(self, transID, bkQuery, now):
    """ Determine the correct time stamp to use for this transformation

        Mutates bkQuery in place: adds a 'StartDate' for a reduced query when
        the transformation was seen recently with the same query; otherwise
        leaves it as a full query and records `now` as the full-query time.
    """
    fullTimeLog = self.fullTimeLog.setdefault(transID, now)
    bkQueryLog = self.bkQueries.setdefault(transID, {})
    bkQueryLog.pop('StartDate', None)
    self.bkQueries[transID] = bkQuery.copy()
    if transID in self.timeLog \
        and bkQueryLog == bkQuery \
        and (now - fullTimeLog) < datetime.timedelta(seconds=self.fullUpdatePeriod):
      # If it is more than a day since the last reduced query, make a full query just in case
      timeStamp = self.timeLog[transID]
      delta = datetime.timedelta(seconds=self.bkUpdateLatency)
      bkQuery['StartDate'] = (timeStamp - delta).strftime('%Y-%m-%d %H:%M:%S')
    if 'StartDate' not in bkQuery:
      self.fullTimeLog[transID] = now

  def __getFiles(self, transID, bkQuery, now):
    """ Perform the query to the Bookkeeping

        :raises RuntimeError: when the BK call fails
        :return: list of files from the BK
    """
    self._logInfo("Using BK query for transformation: %s" % str(bkQuery), transID=transID)
    start = time.time()
    result = self.bkClient.getFiles(bkQuery)
    self._logVerbose("BK query time: %.2f seconds." % (time.time() - start), transID=transID)
    if not result['OK']:
      raise RuntimeError(result['Message'])
    else:
      # Only record the query time when the query succeeded
      self.__updateTimeStamp(transID, now)
    if result['Value']:
      self._logInfo("Obtained %d files from BK" % len(result['Value']), transID=transID)
    return result['Value']

  @gSynchro
  def __updateTimeStamp(self, transID, now):
    """ Update time stamp for current transformation to now
    """
    self.timeLog[transID] = now

  def __addRunsMetadata(self, transID, runsList):
    """ Add the run metadata

        For runs not yet in the cache, fetches TCK/CondDb/DDDB and the run
        duration from the BK and stores them via the transformation client.

        :raises RuntimeError: on any client call failure
    """
    runsInCache = self.transClient.getRunsInCache({'Name': ['TCK', 'CondDb', 'DDDB']})
    if not runsInCache['OK']:
      raise RuntimeError(runsInCache['Message'])
    newRuns = list(set(runsList) - set(runsInCache['Value']))
    if newRuns:
      self._logVerbose("Associating run metadata to %d runs" % len(newRuns), transID=transID)
      res = self.bkClient.getRunInformation({'RunNumber': newRuns, 'Fields': ['TCK', 'CondDb', 'DDDB']})
      if not res['OK']:
        raise RuntimeError(res['Message'])
      else:
        for run, runMeta in res['Value'].iteritems():
          res = self.transClient.addRunsMetadata(run, runMeta)
          if not res['OK']:
            raise RuntimeError(res['Message'])
    # Add run duration to the metadata
    runsInCache = self.transClient.getRunsInCache({'Name': ['Duration']})
    if not runsInCache['OK']:
      raise RuntimeError(runsInCache['Message'])
    newRuns = list(set(runsList) - set(runsInCache['Value']))
    if newRuns:
      self._logVerbose("Associating run duration to %d runs" % len(newRuns), transID=transID)
      res = self.bkClient.getRunInformation({'RunNumber': newRuns, 'Fields': ['JobStart', 'JobEnd']})
      if not res['OK']:
        raise RuntimeError(res['Message'])
      else:
        for run, runMeta in res['Value'].iteritems():
          # Duration in seconds; .seconds ignores full days - presumably runs
          # are shorter than 24h, TODO confirm
          duration = (runMeta['JobEnd'] - runMeta['JobStart']).seconds
          res = self.transClient.addRunsMetadata(run, {'Duration': duration})
          if not res['OK']:
            raise RuntimeError(res['Message'])

  def finalize(self):
    """ Gracious finalization

        Waits for the worker threads to drain before letting the agent stop.
    """
    if self.bkQueriesInCheck:
      self._logInfo("Wait for queue to get empty before terminating the agent (%d tasks)" % len(self.transInThread))
      self.bkQueriesInCheck = []
      while self.transInThread:
        time.sleep(2)
      self.log.info("Threads are empty, terminating the agent...")
    return S_OK()
class RequestTrackingAgent(AgentModule):
  """Agent that keeps the real number of input events of tracked production requests up to date."""

  def __init__(self, *args, **kwargs):
    """ c'tor """
    AgentModule.__init__(self, *args, **kwargs)
    # Clients are built in initialize()
    self.bkClient = None
    self.prodReq = None

  def initialize(self):
    """ Just initializing the clients """
    self.bkClient = BookkeepingClient()
    self.prodReq = RPCClient("ProductionManagement/ProductionRequest")
    return S_OK()

  def execute(self):
    """The RequestTrackingAgent execution method.
    """
    tracked = self.prodReq.getTrackedInput()
    update = []
    if not tracked['OK']:
      gLogger.error('Request service: %s' % tracked['Message'])
    else:
      gLogger.verbose(
          "Requests tracked: %s" % (','.join([str(req['RequestID']) for req in tracked['Value']])))
      # For each tracked request, count its input events in the BK
      for request in tracked['Value']:
        counted = self.bkInputNumberOfEvents(request)
        if not counted['OK']:
          gLogger.error(
              'Input of %s is not updated: %s' % (str(request['RequestID']), counted['Message']))
        else:
          update.append({'RequestID': request['RequestID'],
                         'RealNumberOfEvents': counted['Value']})
    # Push all successful counts back to the production request service
    if update:
      updated = self.prodReq.updateTrackedInput(update)
      if not updated['OK']:
        gLogger.error(updated['Message'])
    return S_OK('Request Tracking information updated')

  def bkInputNumberOfEvents(self, request):
    """ Extremely dirty way...

        Builds a BK query dictionary out of the request fields and asks the
        BK for the total number of events of the matching files.
    """
    dq = request.get('inDataQualityFlag', 'ALL')
    if dq != 'ALL':
      # Comma-separated flags, whitespace stripped
      dq = [str(idq) for idq in dq.replace(' ', '').split(',')]
    try:
      condition = {'ProcessingPass': str(request.get('inProPass', '')).strip(),
                   'FileType': [str(ift)
                                for ift in request.get('inFileType', '').replace(' ', '').split(',')],
                   'EventType': str(request.get('EventType', '')).replace(' ', ''),
                   'ConfigName': str(request.get('configName', '')).replace(' ', ''),
                   'ConfigVersion': str(request.get('configVersion', '')).replace(' ', ''),
                   'DataQualityFlag': dq}
    except KeyError as ke:
      gLogger.error("%s is incomplete: %s" % (request['RequestID'], repr(ke)))
      return S_ERROR(repr(ke))
    # 'Run' requests carry data-taking conditions, everything else simulation conditions
    if request.get('condType') == 'Run':
      condition['DataTakingConditions'] = str(request['SimCondition'])
    else:
      condition['SimulationConditions'] = str(request['SimCondition'])
    prodID = str(request['inProductionID'])
    if prodID not in ('0', 'ALL'):
      condition['Production'] = [int(x) for x in prodID.split(',')]
    if 'inTCKs' in request and str(request['inTCKs']) != '':
      condition['TCK'] = [str(x) for x in str(request['inTCKs']).split(',')]
    # Ask the BK for the event count rather than the file list
    condition['NbOfEvents'] = True
    gLogger.verbose("Requesting: ", str(condition))
    result = self.bkClient.getFiles(condition)
    if not result['OK']:
      gLogger.error("Error requesting files from BK", result['Message'])
      return result
    if not result['Value'][0]:
      return S_OK(0)
    try:
      return S_OK(long(result['Value'][0]))
    except ValueError as e:
      return S_ERROR("Can not convert result from BK call: %s" % str(e))
class BKQuery(): """ It used to build a dictionary using a given Bookkeeping path which is used to query the Bookkeeping database. """ def __init__(self, bkQuery=None, prods=None, runs=None, fileTypes=None, visible=True, eventTypes=None): prods = prods if prods is not None else [] runs = runs if runs is not None else [] fileTypes = fileTypes if fileTypes is not None else [] self.extraBKitems = ("StartRun", "EndRun", "Production", "RunNumber") self.__bkClient = BookkeepingClient() bkPath = '' bkQueryDict = {} self.__bkFileTypes = set() self.__exceptFileTypes = set() self.__fakeAllDST = 'ZZZZZZZZALL.DST' self.__alreadyWarned = False if isinstance(bkQuery, BKQuery): bkQueryDict = bkQuery.getQueryDict().copy() elif isinstance(bkQuery, dict): bkQueryDict = bkQuery.copy() elif isinstance(bkQuery, basestring): bkPath = bkQuery bkQueryDict = self.buildBKQuery(bkPath=bkPath, bkQueryDict=bkQueryDict, prods=prods, runs=runs, fileTypes=fileTypes, eventTypes=eventTypes, visible=visible) self.__bkPath = bkPath self.__bkQueryDict = bkQueryDict if not bkQueryDict.get('Visible'): self.setVisible(visible) def __str__(self): return str(self.__bkQueryDict) def buildBKQuery(self, bkPath='', bkQueryDict=None, prods=None, runs=None, fileTypes=None, visible=True, eventTypes=None): """ it builds a dictionary using a path """ bkQueryDict = bkQueryDict if bkQueryDict is not None else {} prods = prods if prods is not None else [] if not isinstance(prods, list): prods = [prods] runs = runs if runs is not None else [] fileTypes = fileTypes if fileTypes is not None else [] gLogger.verbose("BKQUERY.buildBKQuery: Path %s, Dict %s, \ Prods %s, Runs %s, FileTypes %s, EventTypes %s, Visible %s" % (bkPath, str(bkQueryDict), str(prods), str(runs), str(fileTypes), str(eventTypes), visible)) self.__bkQueryDict = {} if not bkPath and not prods and not bkQueryDict and not runs: return {} if bkQueryDict: bkQuery = bkQueryDict.copy() else: bkQuery = {} ###### Query given as a path 
/ConfigName/ConfigVersion/ConditionDescription/ProcessingPass/EventType/FileType ###### # or if prefixed with evt: /ConfigName/ConfigVersion/EventType/ConditionDescription/ProcessingPass/FileType if bkPath: self.__getAllBKFileTypes() bkFields = ("ConfigName", "ConfigVersion", "ConditionDescription", "ProcessingPass", "EventType", "FileType") url = bkPath.split(':', 1) if len(url) == 1: bkPath = url[0] else: if url[0] == 'evt': bkFields = ("ConfigName", "ConfigVersion", "EventType", "ConditionDescription", "ProcessingPass", "FileType") elif url[0] == 'pp': bkFields = ("ProcessingPass", "EventType", "FileType") elif url[0] == 'prod': bkFields = ("Production", "ProcessingPass", "EventType", "FileType") elif url[0] == 'runs': bkFields = ("Runs", "ProcessingPass", "EventType", "FileType") elif url[0] not in ('sim', 'daq', 'cond'): gLogger.error('Invalid BK path:%s' % bkPath) return self.__bkQueryDict bkPath = url[1] if bkPath[0] != '/': bkPath = '/' + bkPath if bkPath[0:2] == '//': bkPath = bkPath[1:] bkPath = bkPath.replace("RealData", "Real Data") i = 0 processingPass = '******' defaultPP = False bk = bkPath.split('/')[1:] + len(bkFields) * [''] for bpath in bk: gLogger.verbose( 'buildBKQuery.1. Item #%d, Field %s, From Path %s, ProcessingPass %s' % (i, bkFields[i], bpath, processingPass)) if bkFields[i] == 'ProcessingPass': if bpath != '' and bpath.upper() != 'ALL' and \ not bpath.split(',')[0].split(' ')[0].isdigit() and \ not bpath.upper() in self.__bkFileTypes: processingPass = os.path.join(processingPass, bpath) continue # Set the PP if processingPass != '/': bkQuery['ProcessingPass'] = processingPass else: defaultPP = True i += 1 gLogger.verbose( 'buildBKQuery.2. 
Item #%d, Field %s, From Path %s, ProcessingPass %s' % (i, bkFields[i], bpath, processingPass)) if bkFields[i] == 'EventType' and bpath: eventTypeList = [] # print b if bpath.upper() == 'ALL': bpath = 'ALL' else: for et in bpath.split(','): try: eventType = int(et.split(' ')[0]) eventTypeList.append(eventType) except ValueError: pass if len(eventTypeList) == 1: eventTypeList = eventTypeList[0] bpath = eventTypeList gLogger.verbose('buildBKQuery. Event types %s' % eventTypeList) # Set the BK dictionary item if bpath != '': bkQuery[bkFields[i]] = bpath if defaultPP: # PP was empty, try once more to get the Event Type defaultPP = False else: # Go to next item i += 1 if i == len(bkFields): break gLogger.verbose('buildBKQuery. Query dict %s' % str(bkQuery)) # Set default event type to real data if bkQuery.get('ConfigName') != 'MC' and not bkQuery.get( 'EventType'): bkQuery['EventType'] = '90000000' if bkQuery.get('EventType') == 'ALL': bkQuery.pop('EventType') # Run limits are given runs = bkQuery.pop('Runs', runs) if runs: try: bkQuery = parseRuns(bkQuery, runs) except BadRunRange: return self.__bkQueryDict ###### Query given as a list of production ###### if prods and str(prods[0]).upper() != 'ALL': try: bkQuery.setdefault('Production', []).extend([int(prod) for prod in prods]) except ValueError as ex: # The prods list does not contains numbers gLogger.warn(ex) gLogger.error('Invalid production list', str(prods)) return self.__bkQueryDict # If an event type is specified if eventTypes: bkQuery['EventType'] = eventTypes # Set the file type(s) taking into account excludes file types fileTypes = bkQuery.get('FileType', fileTypes) bkQuery.pop('FileType', None) self.__bkQueryDict = bkQuery.copy() fileType = self.__fileType(fileTypes) # print fileType if fileType: bkQuery['FileType'] = fileType # Remove all "ALL"'s in the dict, if any for i in self.__bkQueryDict: if isinstance(bkQuery[i], basestring) and bkQuery[i] == 'ALL': bkQuery.pop(i) # If there is only one production, 
make it faster with a single value rather than a list prodList = bkQuery.get('Production') if isinstance(prodList, list) and len(prodList) == 1: bkQuery['Production'] = prodList[0] self.__bkQueryDict = bkQuery.copy() self.setVisible(visible) # Set both event type entries # print "Before setEventType", self.__bkQueryDict if not self.setEventType(bkQuery.get('EventType')): self.__bkQueryDict = {} return self.__bkQueryDict # Set conditions # print "Before setConditions", self.__bkQueryDict self.setConditions( bkQuery.get( 'ConditionDescription', bkQuery.get('DataTakingConditions', bkQuery.get('SimulationConditions')))) # print "Returned value", self.__bkQueryDict return self.__bkQueryDict def setOption(self, key, val): """ It insert an item to the dictionary. The key is an bookkeeping attribute (condition). """ if val: self.__bkQueryDict[key] = val else: self.__bkQueryDict.pop(key, None) return self.__bkQueryDict def setConditions(self, cond=None): """ Set the dictionary items for a given condition, or remove it (cond=None) """ if 'ConfigName' not in self.__bkQueryDict and cond: gLogger.warn( "Impossible to set Conditions to a BK Query without Configuration" ) return self.__bkQueryDict # There are two items in the dictionary: ConditionDescription and Simulation/DataTaking-Conditions eventType = self.__bkQueryDict.get('EventType', 'ALL') if self.__bkQueryDict.get('ConfigName') == 'MC' or \ (isinstance(eventType, basestring) and eventType.upper() != 'ALL' and eventType[0] != '9'): conditionsKey = 'SimulationConditions' else: conditionsKey = 'DataTakingConditions' self.setOption('ConditionDescription', cond) return self.setOption(conditionsKey, cond) def setFileType(self, fileTypes=None): """insert the file type to the Boookkeeping dictionary """ return self.setOption('FileType', self.__fileType(fileTypes)) def setDQFlag(self, dqFlag='OK'): """ Sets the data quality. 
""" if isinstance(dqFlag, basestring): dqFlag = dqFlag.upper() elif isinstance(dqFlag, list): dqFlag = [dq.upper() for dq in dqFlag] return self.setOption('DataQuality', dqFlag) def setStartDate(self, startDate): """ Sets the start date. """ return self.setOption('StartDate', startDate) def setEndDate(self, endDate): """ Sets the end date """ return self.setOption('EndDate', endDate) def setProcessingPass(self, processingPass): """ Sets the processing pass """ return self.setOption('ProcessingPass', processingPass) def setEventType(self, eventTypes=None): """ Sets the event type """ if eventTypes: if isinstance(eventTypes, basestring): eventTypes = eventTypes.split(',') elif not isinstance(eventTypes, list): eventTypes = [eventTypes] try: eventTypes = [str(int(et)) for et in eventTypes] except ValueError as ex: gLogger.warn(ex) gLogger.error('Invalid list of event types', eventTypes) return {} if isinstance(eventTypes, list) and len(eventTypes) == 1: eventTypes = eventTypes[0] return self.setOption('EventType', eventTypes) def setVisible(self, visible=None): """ Sets the visibility flag """ if visible is True or (isinstance(visible, basestring) and visible[0].lower() == 'y'): visible = 'Yes' if visible is False: visible = 'No' return self.setOption('Visible', visible) def setExceptFileTypes(self, fileTypes): """ Sets the expected file types """ if not isinstance(fileTypes, list): fileTypes = [fileTypes] self.__exceptFileTypes.update(fileTypes) self.setFileType( [t for t in self.getFileTypeList() if t not in fileTypes]) def getExceptFileTypes(self): return list(self.__exceptFileTypes) def getQueryDict(self): """ Returns the bookkeeping dictionary """ return self.__bkQueryDict def getPath(self): """ Returns the Bookkeeping path """ return self.__bkPath def makePath(self): """ Builds a path from the dictionary """ bk = self.__bkQueryDict fileType = bk.get('FileType', '') if isinstance(fileType, list): fileType = ','.join(fileType) path = os.path.join( '/', 
bk.get('ConfigName', ''), bk.get('ConfigVersion', ''), bk.get('ConditionDescription', '.'), bk.get('ProcessingPass', '.')[1:], str(bk.get('EventType', '.')).replace('90000000', '.'), fileType).replace('/./', '//') while True: if path.endswith('/'): path = path[:-1] else: return path def getFileTypeList(self): """ Returns the file types """ fileTypes = self.__bkQueryDict.get('FileType', []) if not isinstance(fileTypes, list): fileTypes = [fileTypes] return fileTypes def getEventTypeList(self): """ Returns the event types """ eventType = self.__bkQueryDict.get("EventType", []) if eventType: if not isinstance(eventType, list): eventType = [eventType] return eventType def getProcessingPass(self): """ Returns the processing pass """ return self.__bkQueryDict.get('ProcessingPass', '') def getConditions(self): """ Returns the Simulation/data taking conditions """ return self.__bkQueryDict.get('ConditionDescription', '') def getConfiguration(self): """ Returns the configuration name and configuration version """ configName = self.__bkQueryDict.get('ConfigName', '') configVersion = self.__bkQueryDict.get('ConfigVersion', '') if not configName or not configVersion: return '' return os.path.join('/', configName, configVersion) def isVisible(self): """ Returns True/False depending on the visibility flag """ return self.__bkQueryDict.get('Visible', 'All') def __fileType(self, fileType=None, returnList=False): """ return the file types taking into account the expected file types """ gLogger.verbose("BKQuery.__fileType: %s, fileType: %s" % (self, fileType)) if not fileType: return [] self.__getAllBKFileTypes() if isinstance(fileType, list): fileTypes = fileType else: fileTypes = fileType.split(',') allRequested = None if fileTypes[0].lower() == "all": allRequested = True bkTypes = self.getBKFileTypes() gLogger.verbose('BKQuery.__fileType: bkTypes %s' % str(bkTypes)) if bkTypes: fileTypes = list(set(bkTypes) - self.__exceptFileTypes) else: fileTypes = [] expandedTypes = set() # 
print "Requested", fileTypes for fileType in fileTypes: if fileType.lower() == 'all.hist': allRequested = False expandedTypes.update([ t for t in self.__exceptFileTypes.union(self.__bkFileTypes) if t.endswith('HIST') ]) elif fileType.lower().find("all.") == 0: ext = '.' + fileType.split('.')[1] fileType = [] if allRequested is None: allRequested = True expandedTypes.update([ t for t in set(self.getBKFileTypes()) - self.__exceptFileTypes if t.endswith(ext) ]) else: expandedTypes.add(fileType) # Remove __exceptFileTypes only if not explicitly required # print "Obtained", fileTypes, expandedTypes gLogger.verbose( "BKQuery.__fileType: requested %s, expanded %s, except %s" % (allRequested, expandedTypes, self.__exceptFileTypes)) if expandedTypes - self.__bkFileTypes and not self.__alreadyWarned: self.__alreadyWarned = True gLogger.always( "**** Take care: some requested file types do not exist!!", str(sorted(expandedTypes - self.__bkFileTypes))) if allRequested or not expandedTypes & self.__exceptFileTypes: expandedTypes -= self.__exceptFileTypes gLogger.verbose("BKQuery.__fileType: result %s" % sorted(expandedTypes)) if len(expandedTypes) == 1 and not returnList: return list(expandedTypes)[0] else: return list(expandedTypes) def __getAllBKFileTypes(self): """ Returns the file types from the bookkeeping database """ if not self.__bkFileTypes: self.__bkFileTypes = set([self.__fakeAllDST]) warned = False while True: res = self.__bkClient.getAvailableFileTypes() if res['OK']: dbresult = res['Value'] for record in dbresult['Records']: if record[0].endswith('HIST') or \ record[0].endswith('ETC') or \ record[0] == 'LOG' or \ record[0].endswith('ROOT'): self.__exceptFileTypes.add(record[0]) self.__bkFileTypes.add(record[0]) break if not warned: gLogger.always('Error getting BK file types, retrying', res['Message']) warned = True def __getBKFiles(self, bkQueryDict, retries=5): """ Call BK getFiles() with some retries """ if not retries: retries = sys.maxsize errorLogged = False 
    # Tail of __getBKFiles (def is above this chunk): retry the BK getFiles
    # call until it succeeds or `retries` is exhausted; warn only once.
    while retries:
      res = self.__bkClient.getFiles(bkQueryDict)
      if res['OK']:
        break
      retries -= 1
      if not errorLogged:
        errorLogged = True
        gLogger.warn("Error getting files from BK, retrying...", res['Message'])
    return res

  def getLFNsAndSize(self, getSize=True):
    """ Return the LFNs (and optionally their total size) matching this BK query.

        :param getSize: when True, issue an extra 'FileSize' query and report the
                        dataset size; when False the size is left at 0
        :return: dict {'LFNs': <list of LFN strings>, 'LFNSize': <size>}
                 NOTE(review): size is divided by 1e12 — presumably TB; confirm
                 against the printing code in getLFNs which labels it "TB"
    """
    self.__getAllBKFileTypes()
    res = self.__getBKFiles(self.__bkQueryDict)
    lfns = []
    lfnSize = 0
    if not res['OK']:
      gLogger.error("Error from BK for %s:" % self.__bkQueryDict, res['Message'])
    else:
      lfns = set(res['Value'])
      exceptFiles = list(self.__exceptFileTypes)
      # If the query didn't pin a FileType, remove files whose type is in the
      # "excepted" list (queried separately and subtracted from the result set)
      if exceptFiles and not self.__bkQueryDict.get('FileType'):
        res = self.__getBKFiles(
            BKQuery(self.__bkQueryDict).setOption(
                'FileType', exceptFiles))
        if res['OK']:
          lfnsExcept = set(res['Value']) & lfns
        else:
          gLogger.error(
              "***** ERROR ***** Error in getting dataset from BK for %s files:" %
              exceptFiles, res['Message'])
          lfnsExcept = set()
        if lfnsExcept:
          gLogger.warn(
              "***** WARNING ***** Found %d files in BK query that will be \
excluded (file type in %s)!" % (len(lfnsExcept), str(exceptFiles)))
          gLogger.warn(
              " If creating a transformation, set '--FileType ALL'"
          )
          lfns = lfns - lfnsExcept
        else:
          # Nothing was actually excluded: disable the size correction below
          exceptFiles = False
      if getSize:
        # Get size only if needed
        query = BKQuery(self.__bkQueryDict)
        query.setOption("FileSize", True)
        res = self.__getBKFiles(query.getQueryDict())
        # BK returns the summed size as the first element of a list
        if res['OK'] and isinstance(res['Value'], list) and res['Value'][0]:
          lfnSize = res['Value'][0]
          # Subtract the size of the excluded file types, mirroring the LFN
          # exclusion done above
          if exceptFiles and not self.__bkQueryDict.get('FileType'):
            res = self.__getBKFiles(
                query.setOption('FileType', exceptFiles))
            if res['OK'] and isinstance(res['Value'], list) and res['Value'][0]:
              lfnSize -= res['Value'][0]
          lfnSize /= 1000000000000.
        else:
          lfnSize = 0.
    return {'LFNs': list(lfns), 'LFNSize': lfnSize}

  def getLFNSize(self, visible=None):
    """ Return the total size of the data set matching this query.

        :param visible: visibility flag; defaults to this query's own visibility
        :return: size as returned by the BK 'FileSize' query (0 on failure);
                 NOTE(review): unlike getLFNsAndSize this is NOT divided by 1e12
    """
    if visible is None:
      visible = self.isVisible()
    res = self.__getBKFiles(
        BKQuery(self.__bkQueryDict, visible=visible).setOption('FileSize', True))
    if res['OK'] and isinstance(res['Value'], list) and res['Value'][0]:
      lfnSize = res['Value'][0]
    else:
      lfnSize = 0
    return lfnSize

  def getNumberOfLFNs(self, visible=None):
    """ Return the number of LFNs and their cumulative size for this query,
        summed over every file type of the data set.

        :param visible: visibility flag; defaults to this query's own visibility
        :return: dict {'NumberOfLFNs': <int>, 'LFNSize': <size>}
    """
    if visible is None:
      visible = self.isVisible()
    if self.isVisible() != visible:
      # Requested visibility differs from ours: work on a copy of the query
      query = BKQuery(self.__bkQueryDict, visible=visible)
    else:
      query = self
    fileTypes = query.getFileTypeList()
    nbFiles = 0
    size = 0
    # One summary query per file type; counts and sizes are accumulated
    for fileType in fileTypes:
      if fileType:
        res = self.__bkClient.getFilesSummary(
            query.setFileType(fileType))
        if res['OK']:
          res = res['Value']
          # Summary comes back as parallel 'ParameterNames'/'Records' arrays
          ind = res['ParameterNames'].index('NbofFiles')
          if res['Records'][0][ind]:
            nbFiles += res['Records'][0][ind]
            ind1 = res['ParameterNames'].index('FileSize')
            size += res['Records'][0][ind1]
    return {'NumberOfLFNs': nbFiles, 'LFNSize': size}

  def getLFNs(self, printSEUsage=False, printOutput=True, visible=None):
    """ Return the sorted list of LFNs matching this query, optionally printing
        statistics about the data set.

        :param printSEUsage: when True (and printOutput is True), also query the
                             DataManagement/StorageUsage service and print a
                             per-SE size summary (archive SEs excluded)
        :param printOutput: when True, print file counts per directory; also
                            controls whether sizes are fetched at all
        :param visible: visibility flag; defaults to this query's own visibility
        :return: list of LFN strings (sorted when non-empty)
    """
    if visible is None:
      visible = self.isVisible()
    if self.isVisible() != visible:
      query = BKQuery(self.__bkQueryDict, visible=visible)
    else:
      query = self
    # Loop for each production or each event type rather than make a single query
    loopItem = None
    prods = self.__bkQueryDict.get('Production')
    eventTypes = self.__bkQueryDict.get('EventType')
    if prods and isinstance(prods, list):
      loopItem = 'Production'
      loopList = prods
    elif eventTypes and isinstance(eventTypes, list):
      loopItem = 'EventType'
      loopList = eventTypes
    if loopItem:
      # It's faster to loop on a list of prods or event types than query the BK with a list as argument
      lfns = []
      lfnSize = 0
      if query == self:
        # Don't mutate self while looping: use a private copy of the query
        query = BKQuery(self.__bkQueryDict, visible=visible)
      for item in loopList:
        query.setOption(loopItem, item)
        lfnsAndSize = query.getLFNsAndSize(getSize=printOutput)
        lfns += lfnsAndSize['LFNs']
        lfnSize += lfnsAndSize['LFNSize']
    else:
      lfnsAndSize = query.getLFNsAndSize(getSize=printOutput)
      lfns = lfnsAndSize['LFNs']
      lfnSize = lfnsAndSize['LFNSize']
    if not lfns:
      gLogger.verbose("No files found for BK query %s" % str(self.__bkQueryDict))
    else:
      lfns.sort()
      # Only for printing
      if printOutput:
        gLogger.notice("\n%d files (%.1f TB) in directories:" % (len(lfns), lfnSize))
        # Count files per directory (trailing '/' kept via join with '')
        dirs = {}
        for lfn in lfns:
          directory = os.path.join(os.path.dirname(lfn), '')
          dirs[directory] = dirs.setdefault(directory, 0) + 1
        for directory in sorted(dirs):
          gLogger.notice("%s %s files" % (directory, dirs[directory]))
        if printSEUsage:
          # Per-SE storage usage, excluding archive SEs, plus a grand total
          rpc = RPCClient('DataManagement/StorageUsage')
          totalUsage = {}
          totalSize = 0
          for directory in dirs:
            res = rpc.getStorageSummary(directory, '', '', [])
            if res['OK']:
              for se in [
                  se for se in res['Value'] if not se.endswith("-ARCHIVE")
              ]:
                totalUsage[se] = totalUsage.setdefault(
                    se, 0) + res['Value'][se]['Size']
                totalSize += res['Value'][se]['Size']
          ses = sorted(totalUsage)
          totalUsage['Total'] = totalSize
          ses.append('Total')
          gLogger.notice("\n%s %s" % ("SE".ljust(20), "Size (TB)"))
          for se in ses:
            gLogger.notice("%s %s" %
                           (se.ljust(20), ('%.1f' % (totalUsage[se] / 1000000000000.))))
    return lfns

  def getDirs(self, printOutput=False, visible=None):
    """ Return the sorted list of distinct directories holding the query's LFNs.

        Note: delegates to getLFNs with printSEUsage=True, so SE usage is
        printed whenever printOutput is True.
    """
    if visible is None:
      visible = self.isVisible()
    lfns = self.getLFNs(printSEUsage=True, printOutput=printOutput, visible=visible)
    dirs = set()
    for lfn in lfns:
      dirs.add(os.path.dirname(lfn))
    return sorted(dirs)

  @staticmethod
  def __getProdStatus(prod):
    """ Return the status string of a given transformation (production),
        or None if the TransformationClient lookup fails.
    """
    res = TransformationClient().getTransformation(prod, extraParams=False)
    if not res['OK']:
      gLogger.error("Couldn't get information on production %d" % prod)
      return None
    return res['Value']['Status']

  def getBKRuns(self):
    """ Return the list of runs from the bookkeeping.

        Only meaningful for 'Real Data' processing passes, where
        getBKProductions actually returns run numbers; returns None
        (implicitly) otherwise.
    """
    if self.getProcessingPass().replace('/', '') == 'Real Data':
      return self.getBKProductions()

  def getBKProductions(self, visible=None):
    """ Return the sorted list of productions matching this query
        (or of runs, for the 'Real Data' processing pass).

        :param visible: visibility flag; defaults to this query's own visibility
        :return: sorted list; empty list on error or if no ProcessingPass is set
    """
    if visible is None:
      visible = self.isVisible()
    # If productions are given explicitly in the query, just return them
    prodList = self.__bkQueryDict.get('Production')
    if prodList:
      if not isinstance(prodList, list):
        prodList = [prodList]
      return sorted(prodList)
    if not self.getProcessingPass():
      gLogger.fatal(
          'Impossible to get a list of productions without the Processing Pass'
      )
      return []
    eventTypes = self.__bkQueryDict.get('EventType')
    if not isinstance(eventTypes, list):
      eventTypes = [eventTypes]
    fullList = set()
    # One BK query per event type
    for eventType in eventTypes:
      bkQ = BKQuery(self.__bkQueryDict)
      bkQ.setVisible(visible)
      bkDict = bkQ.setEventType(eventType)
      res = self.__bkClient.getProductions(bkDict)
      if not res['OK']:
        gLogger.error('Error getting productions from BK', res['Message'])
        return []
      if self.getProcessingPass().replace('/', '') != 'Real Data':
        # MC / reprocessed data: filter out deleted productions, then keep
        # only those whose BK query file type matches this query's file types
        fileTypes = self.getFileTypeList()
        prodList = set(prod for prods in res['Value']['Records'] for prod in prods
                       if self.__getProdStatus(prod) != 'Deleted')
        pList = set()
        if fileTypes:
          transClient = TransformationClient()
          for prod in prodList:
            res = transClient.getBookkeepingQuery(prod)
            if res['OK'] and res['Value']['FileType'] in fileTypes:
              pList.add(prod)
        if not pList:
          # No production matched the file-type filter: fall back to all
          pList = prodList
      else:
        # Real Data: records hold run numbers; NOTE(review): they are negated
        # here, which suggests the BK returns runs as negative values — confirm
        runList = sorted(
            [-run for r in res['Value']['Records'] for run in r])
        startRun = int(self.__bkQueryDict.get('StartRun', 0))
        endRun = int(self.__bkQueryDict.get('EndRun', sys.maxsize))
        pList = set(run for run in runList if run >= startRun and run <= endRun)
      fullList.update(pList)
    return sorted(fullList)

  def getBKConditions(self):
    """ Return the data taking / simulation conditions for this query.

        Uses the query's own 'ConditionDescription' if present, otherwise asks
        the BK and extracts the 'Description' column of the first record set
        that has records.
    """
    conditions = self.__bkQueryDict.get('ConditionDescription')
    if conditions:
      if not isinstance(conditions, list):
        conditions = [conditions]
      return conditions
    result = self.__bkClient.getConditions(self.__bkQueryDict)
    if result['OK']:
      resList = result['Value']
    else:
      return []
    conditions = []
    for res in resList:
      ind = res['ParameterNames'].index('Description')
      if res['Records']:
        conditions += [par[ind] for par in res['Records']]
        break
    return sorted(conditions)

  def getBKEventTypes(self):
    """ Return the event types for this query: the query's own list if set,
        otherwise the sorted 'EventType' column from the BK.

        NOTE(review): the BK result's 'OK' is not checked here — a failed call
        would raise on the 'Value' access.
    """
    eventType = self.getEventTypeList()
    if eventType:
      return eventType
    res = self.__bkClient.getEventTypes(self.__bkQueryDict)['Value']
    ind = res['ParameterNames'].index('EventType')
    eventTypes = sorted([rec[ind] for rec in res['Records']])
    return eventTypes

  def getBKFileTypes(self, bkDict=None):
    """ Return the list of file types for this query.

        If the query doesn't specify file types, asks the BK (recursing over a
        list of event types if needed). 'ALL.DST' is temporarily swapped for a
        placeholder so that the __fileType normalisation doesn't touch it.

        :param bkDict: optional query dict to use instead of this query's own
                       (copied, never mutated in place)
    """
    fileTypes = self.getFileTypeList()
    if not fileTypes:
      if not bkDict:
        bkDict = self.__bkQueryDict.copy()
      else:
        bkDict = bkDict.copy()
      bkDict.setdefault('Visible', 'All')
      bkDict.pop('RunNumber', None)
      fileTypes = []
      eventTypes = bkDict.get('EventType')
      if isinstance(eventTypes, list):
        # Recurse once per event type and merge the results
        for et in eventTypes:
          bkDict['EventType'] = et
          fileTypes += self.getBKFileTypes(bkDict)
      else:
        res = self.__bkClient.getFileTypes(bkDict)
        if res['OK']:
          res = res['Value']
          ind = res['ParameterNames'].index('FileTypes')
          # Drop the globally-excluded file types
          fileTypes = [
              rec[ind] for rec in res['Records']
              if rec[ind] not in self.__exceptFileTypes
          ]
    # Protect 'ALL.DST' from __fileType normalisation via a fake placeholder
    if 'ALL.DST' in fileTypes:
      fileTypes.remove('ALL.DST')
      fileTypes.append(self.__fakeAllDST)
    fileTypes = self.__fileType(fileTypes, returnList=True)
    if self.__fakeAllDST in fileTypes:
      fileTypes.remove(self.__fakeAllDST)
      fileTypes.append('ALL.DST')
    return fileTypes

  def getBKProcessingPasses(self, queryDict=None, depth=None):
    """ Return the processing passes below this query's pass, recursively.

        :param queryDict: optional starting query dict (defaults to a copy of
                          this query's dict; the 'ProcessingPass' key of the
                          passed dict IS mutated during recursion)
        :param depth: maximum recursion depth (default: unbounded)
        :return: dict {processing pass: [event types]} — passes with no event
                 types map to []; the '/Real Data' and '/' roots are removed
    """
    if depth is None:
      depth = sys.maxsize
    processingPasses = {}
    if not queryDict:
      queryDict = self.__bkQueryDict.copy()
    initialPP = queryDict.get('ProcessingPass', '/')
    res = self.__bkClient.getProcessingPass(queryDict, initialPP)
    if not res['OK']:
      # 'Empty Directory' simply means no passes below this point: not an error
      if 'Empty Directory' not in res['Message']:
        gLogger.error(
            "ERROR getting processing passes for %s" % queryDict,
            res['Message'])
      return {}
    # res['Value'][0]: sub-pass names; res['Value'][1]: event types at this level
    ppRecords = res['Value'][0]
    if 'Name' in ppRecords['ParameterNames']:
      ind = ppRecords['ParameterNames'].index('Name')
      passes = sorted([
          os.path.join(initialPP, rec[ind])
          for rec in ppRecords['Records']
      ])
    else:
      passes = []
    evtRecords = res['Value'][1]
    if 'EventType' in evtRecords['ParameterNames']:
      ind = evtRecords['ParameterNames'].index('EventType')
      eventTypes = [str(rec[ind]) for rec in evtRecords['Records']]
    else:
      eventTypes = []
    if passes and depth:
      # Recurse one level deeper into each sub-pass
      depth -= 1
      nextProcessingPasses = {}
      for pName in passes:
        processingPasses[pName] = []
        queryDict['ProcessingPass'] = pName
        nextProcessingPasses.update(
            self.getBKProcessingPasses(queryDict, depth=depth))
      processingPasses.update(nextProcessingPasses)
    if eventTypes:
      processingPasses[initialPP] = eventTypes
    # The roots themselves are not real processing passes: drop them
    for pName in ('/Real Data', '/'):
      if pName in processingPasses:
        processingPasses.pop(pName)
    return processingPasses