def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False, blockName=None, locations=True):
    """
    Return block summaries for *dataset* with DBS3 keys remapped to the
    legacy names (block_name -> Name), optionally restricted to a single
    block and optionally annotated with their PhEDEx node locations.

    Raises DBSReaderError when the DBS query fails, or when no blocks match
    and no explicit blockName was given (mimics the dbs2 behaviour).
    """
    self.checkDatasetPath(dataset)
    query = {'dataset': dataset, 'detail': True}
    if blockName:
        query['block_name'] = blockName
    try:
        rawBlocks = self.dbs.listBlocks(**query)
    except Exception as ex:
        msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    remapped = []
    for entry in rawBlocks:
        remapped.append(remapDBS3Keys(entry, stringify=True, block_name='Name'))

    # only raise if blockName not specified - mimic dbs2 error handling
    if not remapped and not blockName:
        msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
        raise DBSReaderError(msg % (dataset, blockName))

    if locations:
        for entry in remapped:
            nodes = self.listFileBlockLocation(entry['Name'])
            entry['PhEDExNodeList'] = [{'Name': node} for node in nodes]

    if onlyClosedBlocks:
        return [entry for entry in remapped if str(entry['OpenForWriting']) != "1"]
    return remapped
def getDBSSummaryInfo(self, dataset=None, block=None):
    """
    Get dataset summary includes # of files, events, blocks and total size

    Queries DBS file summaries for either a block (when given) or a dataset.
    Raises DBSReaderError when the underlying query fails.
    """
    try:
        if block:
            summary = self.dbs.listFileSummaries(block_name=block)
        else:  # dataset case dataset shouldn't be None
            summary = self.dbs.listFileSummaries(dataset=dataset)
    # Fixed py2-only "except X, ex" syntax (a SyntaxError under Python 3);
    # now matches the "as ex" style used by the other methods in this file.
    except DBSReaderError as ex:
        msg = "Error in DBSReader.listDatasetSummary(%s, %s)\n" % (dataset, block)
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `summary` is fetched but never returned in this view of
    # the function -- confirm against the full file whether a remap/return
    # step is missing here.
def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
    """
    _listFileBlockLocation_

    Get origin_site_name of a block

    :param fileBlockNames: one block name (str) or a list of block names
    :param dbsOnly: when True, resolve locations from the DBS block origin
        site instead of PhEDEx replicas
    :return: a list of node names when a single block name was passed,
        otherwise a dict of {block_name: [node, ...]}
    :raises DBSReaderError: on DBS query failure
    """
    singleBlockName = None
    # py3-safe type check: `basestring` does not exist on Python 3; this is
    # the same (str, bytes) check used by the other implementation of this
    # method in the file.
    if isinstance(fileBlockNames, (str, bytes)):
        singleBlockName = fileBlockNames
        fileBlockNames = [fileBlockNames]
    for block in fileBlockNames:
        self.checkBlockName(block)
    locations = {}
    node_filter = set(['UNKNOWN', None])
    if dbsOnly:
        blocksInfo = {}
        try:
            for block in fileBlockNames:
                blocksInfo.setdefault(block, [])
                # there should be only one element with a single origin site string ...
                for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                    blocksInfo[block].append(blockInfo['origin_site_name'])
        except dbsClientException as ex:
            msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
    else:
        try:
            blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(block=fileBlockNames, complete='y')
        except Exception as ex:
            msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
            msg += "%s\n" % str(ex)
            raise Exception(msg)
    for block in fileBlockNames:
        # drop placeholder site names before reporting
        valid_nodes = set(blocksInfo.get(block, [])) - node_filter
        locations[block] = list(valid_nodes)
    # returning single list if a single block is passed
    if singleBlockName:
        return locations[singleBlockName]
    return locations
def getDBSSummaryInfo(self, dataset=None, block=None):
    """
    Get dataset summary includes # of files, events, blocks and total size

    Validates the dataset path (when given) and queries DBS file summaries
    for either a block or a dataset. Raises DBSReaderError on query failure.
    """
    # FIXME: Doesnt raise exceptions on missing data as old api did
    if dataset:
        self.checkDatasetPath(dataset)
    try:
        if block:
            summary = self.dbs.listFileSummaries(block_name=block)
        else:  # dataset case dataset shouldn't be None
            summary = self.dbs.listFileSummaries(dataset=dataset)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in DBSReader.listDatasetSummary(%s, %s)\n" % (dataset, block)
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `summary` is fetched but never returned in this view --
    # confirm against the full file whether a return step is missing.
def getDBSSummaryInfo(self, dataset=None, block=None):
    """
    Get dataset summary includes # of files, events, blocks and total size

    DBS2-style variant using dbs.getSummary. Raises DBSReaderError on
    query failure.
    """
    if dataset:
        self.checkDatasetPath(dataset)
    try:
        if block:
            summary = self.dbs.getSummary(block=block)
        else:  # dataset case dataset shouldn't be None
            summary = self.dbs.getSummary(dataset=dataset)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in DBSReader.listDatasetSummary(%s, %s)\n" % (dataset, block)
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `summary` is fetched but never returned in this view --
    # confirm against the full file whether a return step is missing.
def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False, blockName='*', locations=True):
    """
    DBS2-style block listing for *dataset*; `nosite` suppresses location
    lookup when locations is False. Raises DBSReaderError on query failure.
    """
    self.checkDatasetPath(dataset)
    try:
        blocks = self.dbs.listBlocks(dataset, blockName, nosite=not locations)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `blocks` is fetched but never returned in this view --
    # confirm against the full file whether filtering/return is missing.
def listPrimaryDatasets(self, match=None):
    """
    _listPrimaryDatasets_

    return a list of primary datasets matching the glob expression.
    If no expression is provided, all datasets are returned
    """
    arg = "*"
    # identity comparison with None is the idiomatic form (was `!= None`)
    if match is not None:
        arg = match
    try:
        result = self.dbs.listPrimaryDatasets(arg)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % arg
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `result` is fetched but never returned in this view --
    # confirm against the full file whether a return step is missing.
def listProcessedDatasets(self, primary, dataTier='*'):
    """
    _listProcessedDatasets_

    Return the processed-dataset names available for the given primary
    dataset, optionally filtered by data tier. Raises DBSReaderError when
    the DBS query fails.
    """
    try:
        datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=dataTier)
    except dbsClientException as ex:
        msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
    # dataset path is /<primary>/<processed>/<tier>; keep the middle component
    return [entry['dataset'].split('/')[2] for entry in datasets]
def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False, blockName=None, locations=True):
    """
    DBS3 block listing for *dataset*, optionally restricted to one block.
    Raises DBSReaderError on query failure.
    """
    self.checkDatasetPath(dataset)
    args = {'dataset': dataset, 'detail': True}
    if blockName:
        args['block_name'] = blockName
    try:
        blocks = self.dbs.listBlocks(**args)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except dbsClientException as ex:
        msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `blocks` is fetched but never returned in this view --
    # confirm against the full file whether remap/return is missing.
def listDatasetLocation(self, datasetName, dbsOnly=False):
    """
    _listDatasetLocation_

    List the origin SEs where there is at least a block of the given dataset.
    """
    self.checkDatasetPath(datasetName)

    siteSet = set()
    if dbsOnly:
        try:
            originInfo = self.dbs.listBlockOrigin(dataset=datasetName)
        except dbsClientException as ex:
            msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        if not originInfo:  # no data location from dbs
            return list()
        for entry in originInfo:
            #TODO remove this line when all DBS origin_site_name is converted to PNN
            entry['origin_site_name'] = self.siteDB.checkAndConvertSENameToPNN(entry['origin_site_name'])
            #upto this
            # NOTE(review): set.update() iterates its argument, so this assumes
            # checkAndConvertSENameToPNN returns an iterable of PNN strings --
            # confirm; a bare string here would add single characters.
            siteSet.update(entry['origin_site_name'])
        siteSet.difference_update(['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
    else:
        try:
            replicaInfo = self.phedex.getReplicaPhEDExNodesForBlocks(dataset=[datasetName], complete='y')
        except Exception as ex:
            msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
            msg += "%s\n" % str(ex)
            raise Exception(msg)
        if replicaInfo:
            for nodeList in replicaInfo.values():
                siteSet.update(nodeList)
    return list(siteSet)
def listPrimaryDatasets(self, match='*'):
    """
    _listPrimaryDatasets_

    Return the list of primary dataset names matching *match*.
    Pattern-based matching is no longer supported by DBS3: the full primary
    dataset name must be provided ('*' returns everything).
    """
    try:
        matches = self.dbs.listPrimaryDatasets(primary_ds_name=match)
    except dbsClientException as ex:
        msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
    return [entry['primary_ds_name'] for entry in matches]
def getFileBlocksInfo(self, dataset, onlyClosedBlocks=True, blockName='*', locations=True):
    """Fake block info"""
    matching = []
    for candidate in self.dataBlocks.getBlocks(dataset):
        if candidate['Name'] == blockName or blockName == '*':
            matching.append(candidate)
    if not matching:
        # Weird error handling follows, this is what dbs does:
        # If block specified, return [], else raise DbsBadRequest error
        if blockName != '*':
            return []
        raise DBSReaderError('DbsBadRequest: DBS Server Raised An Error')
    if locations:
        for candidate in matching:
            nodes = self.listFileBlockLocation(candidate['Name'])
            candidate['PhEDExNodeList'] = [{'Role': '', 'Name': node} for node in nodes]
    return matching
def listOpenFileBlocks(self, dataset):
    """
    _listOpenFileBlocks_

    Retrieve a list of open fileblock names for a dataset
    """
    self.checkDatasetPath(dataset)
    try:
        allBlocks = self.dbs.listBlocks(dataset=dataset, detail=True)
    except dbsClientException as ex:
        msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
    openBlocks = []
    for info in allBlocks:
        # DBS reports open_for_writing as 0/1; compare as string for safety
        if str(info['open_for_writing']) == "1":
            openBlocks.append(info['block_name'])
    return openBlocks
def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):
    """
    _getFileListByDataset_

    Given a dataset, retrieves all blocks, lfns and number of events
    (among other not really important info).
    Returns a list of dict.
    """
    try:
        return self.dbs.listFileArray(dataset=dataset, validFileOnly=validFileOnly, detail=detail)
    except dbsClientException as ex:
        msg = "Error in "
        msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
def listProcessedDatasets(self, primary, dataTier=None):
    """
    _listProcessedDatasets_

    return a list of Processed datasets for the primary and optional
    data tier value
    """
    tier = "*"
    # identity comparison with None is the idiomatic form (was `!= None`)
    if dataTier is not None:
        tier = dataTier
    try:
        result = self.dbs.listProcessedDatasets(primary, tier)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `result` is fetched but never returned in this view --
    # confirm against the full file whether a return step is missing.
def blockToDatasetPath(self, blockName):
    """
    _blockToDatasetPath_

    Given a block name, get the dataset Path associated with that Block.

    Returns the dataset path, or None if not found
    """
    self.checkBlockName(blockName)
    try:
        blocks = self.dbs.listBlocks(block_name=blockName, nosite=True)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in "
        msg += "DBSReader.blockToDataset(%s)\n" % blockName
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `blocks` is fetched but the path extraction/return is not
    # present in this view (the docstring promises a path or None) -- confirm
    # against the full file.
def validateInputDatasSetAndParentFlag(arguments):
    """
    Check if the InputDataset value provided corresponds to an actual dataset in DBS.
    If parent flag is provided, then check whether the input dataset has a parent.
    the InputDataset existence in DBS and its parent, if needed.

    :param arguments: request-arguments dictionary
    :raises WMSpecFactoryException: when validation fails
    """
    inputdataset = _getChainKey(arguments, "InputDataset")
    mcpileup = _getChainKey(arguments, "MCPileup")
    datapileup = _getChainKey(arguments, "DataPileup")
    includeParents = _getChainKey(arguments, "IncludeParents")

    # TODO: this replace can be removed in one year from now, thus March 2022
    dbsURL = arguments.get("DbsUrl")
    if dbsURL:
        dbsURL = dbsURL.replace("cmsweb.cern.ch", "cmsweb-prod.cern.ch")

    if includeParents and not inputdataset:
        msg = "IncludeParents flag is True but InputDataset value has not been provided"
        raise WMSpecFactoryException(msg)

    # BUGFIX: the original condition was `dbsURL and inputdataset or mcpileup
    # or datapileup`; by operator precedence that entered this branch (and
    # instantiated DBS3Reader(None)) whenever a pileup dataset was set while
    # DbsUrl was missing. Parenthesized to require a DBS URL in all cases.
    if dbsURL and (inputdataset or mcpileup or datapileup):
        # import DBS3Reader here, since Runtime code import this module and worker
        # node doesn't have dbs3 client
        from WMCore.Services.DBS.DBS3Reader import DBS3Reader
        from WMCore.Services.DBS.DBSErrors import DBSReaderError
        dbsInst = DBS3Reader(dbsURL)

        try:
            _datasetExists(dbsInst, inputdataset)
            _datasetExists(dbsInst, mcpileup)
            _datasetExists(dbsInst, datapileup)
        except DBSReaderError as ex:
            # we need to Wrap the exception to WMSpecFactoryException to be caught in reqmgr validation
            raise WMSpecFactoryException(str(ex))

        if includeParents:
            try:
                result = dbsInst.listDatasetParents(inputdataset)
                if len(result) == 0:
                    msg = "IncludeParents flag is True but the input dataset %s has no parents" % inputdataset
                    raise DBSReaderError(msg)
            except DBSReaderError as ex:
                raise WMSpecFactoryException(str(ex))
    return
def listDatasetLocation(self, datasetName, dbsOnly=False):
    """
    _listDatasetLocation_

    List the origin SEs where there is at least a block of the given dataset.

    PhEDEx is consulted first; when it has no replica information (or
    dbsOnly is requested) the DBS block origin sites are used instead.
    """
    self.checkDatasetPath(datasetName)
    if not dbsOnly:
        try:
            blocksInfo = self.phedex.getReplicaSEForBlocks(dataset=[datasetName], complete='y')
        except Exception as ex:
            msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
            msg += "%s\n" % str(ex)
            raise Exception(msg)

        if not blocksInfo:  # if we couldnt get data location from PhEDEx, try to look into origin site location from dbs
            dbsOnly = True
        else:
            # BUGFIX: dict.values() is a non-indexable view on Python 3, so
            # `.values()[0]` raised TypeError; take the first value via an
            # iterator instead (works on py2 and py3 alike).
            locations = set(next(iter(blocksInfo.values())))
            for blockSites in blocksInfo.values():
                locations.intersection_update(blockSites)
    if dbsOnly:
        try:
            blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
        except dbsClientException as ex:
            msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        if not blocksInfo:  # no data location from dbs
            return list()
        locations = set()
        for blockInfo in blocksInfo:
            locations.update([blockInfo['origin_site_name']])
        locations.difference_update(['UNKNOWN'])  # remove entry when SE name is 'UNKNOWN'
    return list(locations)
def blockExists(self, fileBlockName):
    """
    _blockExists_

    Check to see if block with name provided exists in the DBS Instance.
    Return True if exists, False if not
    """
    self.checkBlockName(fileBlockName)
    try:
        blocks = self.dbs.listBlocks(block_name=fileBlockName, nosite=True)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in "
        msg += "DBSReader.blockExists(%s)\n" % fileBlockName
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): the docstring promises True/False but this view has no
    # return statement -- confirm against the full file.
def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
    """
    _listFileBlocks_

    Retrieve a list of fileblock names for a dataset
    """
    self.checkDatasetPath(dataset)
    args = {'dataset': dataset, 'detail': False}
    if blockName:
        args['block_name'] = blockName
    if onlyClosedBlocks:
        # detail is needed to inspect open_for_writing downstream
        args['detail'] = True
    try:
        blocks = self.dbs.listBlocks(**args)
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except dbsClientException as ex:
        msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `blocks` is fetched but never returned in this view --
    # confirm against the full file whether filtering/return is missing.
def listRunLumis(self, dataset=None, block=None):
    """
    It gets a list of DBSRun objects and returns the number of lumisections per run
    DbsRun (RunNumber, NumberOfEvents, NumberOfLumiSections, TotalLuminosity,
    StoreNumber, StartOfRungetLong, EndOfRun, CreationDate, CreatedBy,
    LastModificationDate, LastModifiedBy)
    """
    # The py2 str->unicode coercion that used to live here was already
    # flagged as "Pointless code in python3", and `unicode` is a NameError
    # under Python 3 -- so it has been removed.
    try:
        if block:
            results = self.dbs.listRuns(block_name=block)
        else:
            results = self.dbs.listRuns(dataset=dataset)
    except dbsClientException as ex:
        msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
    # send runDict format as result, this format is for sync with dbs2 call
    # which has {run_number: num_lumis} but dbs3 call doesn't return num Lumis
    # So it returns {run_number: None}
    # TODO: After DBS2 is completely removed change the return format more sensible one
    runDict = {}
    for x in results:
        for runNumber in x["run_num"]:
            runDict[runNumber] = None
    return runDict
def matchProcessedDatasets(self, primary, tier, process):
    """
    _matchProcessedDatasets_

    return a list of Processed datasets
    """
    try:
        datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=tier, detail=True)
    except dbsClientException as ex:
        msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    matched = []
    for entry in datasets:
        # translate DBS3 keys to the legacy names before comparing
        entry = remapDBS3Keys(entry, processed_ds_name='Name')
        entry['PathList'] = [entry['dataset']]
        if entry['Name'] == process:
            matched.append(entry)
    return matched
def getFileBlockWithParents(self, fileBlockName):
    """
    Retrieve a list of parent files in the block; a flag whether the block is
    still open or not; and it used to resolve the block location via PhEDEx.

    :return: a dictionary in the format of:
        {"PhEDExNodeNames" : [],
         "Files" : { LFN : Events },
         "IsOpen" : True|False}
    """
    fileBlockName = decodeBytesToUnicode(fileBlockName)

    if not self.blockExists(fileBlockName):
        msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
        raise DBSReaderError(msg % fileBlockName)

    parentFiles = self.listFilesInBlockWithParents(fileBlockName)
    openFlag = self.blockIsOpen(fileBlockName)
    return {"PhEDExNodeNames": [],  # FIXME: we better get rid of this line!
            "Files": parentFiles,
            "IsOpen": openFlag}
def lfnsInBlock(self, fileBlockName):
    """
    _lfnsInBlock_

    LFN list only for block, details = False => faster query
    """
    try:
        files = self.dbs.listFiles(
            "",   # path
            "",   # primary
            "",   # processed
            [],   # tier_list
            "",   # analysisDataset
            fileBlockName, details="False")
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in "
        msg += "DBSReader.lfnsInBlock(%s)\n" % fileBlockName
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `files` is fetched but never returned in this view --
    # confirm against the full file whether the LFN extraction is missing.
def listFileBlockLocation(self, fileBlockNames):
    """
    _listFileBlockLocation_

    Get origin_site_name of a block

    Accepts a single block name or a list; returns a list of node names for
    a single block, otherwise a dict keyed by block name.
    """
    singleBlockName = None
    if isinstance(fileBlockNames, (str, bytes)):
        singleBlockName = fileBlockNames
        fileBlockNames = [fileBlockNames]
    for name in fileBlockNames:
        self.checkBlockName(name)

    skipNodes = set(['UNKNOWN', None])
    originsByBlock = {}
    try:
        for name in fileBlockNames:
            originsByBlock.setdefault(name, [])
            # there should be only one element with a single origin site string ...
            for originInfo in self.dbs.listBlockOrigin(block_name=name):
                originsByBlock[name].append(originInfo['origin_site_name'])
    except dbsClientException as ex:
        msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    locations = {name: list(set(originsByBlock.get(name, [])) - skipNodes)
                 for name in fileBlockNames}
    # returning single list if a single block is passed
    if singleBlockName:
        return locations[singleBlockName]
    return locations
def listFilesInBlock(self, fileBlockName):
    """
    _listFilesInBlock_

    Get a list of files in the named fileblock
    """
    try:
        files = self.dbs.listFiles(
            "",   # path
            "",   # primary
            "",   # processed
            [],   # tier_list
            "",   # analysisDataset
            fileBlockName, details="True")
    # Fixed py2-only "except X, ex" syntax to the py3-compatible "as ex" form
    # used elsewhere in this file.
    except DbsException as ex:
        msg = "Error in "
        msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
    # NOTE(review): `files` is fetched but never returned in this view --
    # confirm against the full file whether the result handling is missing.
def getDBSSummaryInfo(self, dataset=None, block=None):
    """Dataset summary

    Mock/fake summary built from self.dataBlocks; mirrors the real reader's
    return shape (all counters stringified). Raises DBSReaderError when
    neither a block nor dataset summary could be built.
    """
    def getLumisectionsInBlock(b):
        # Total lumi sections across all files of block *b*.
        lumis = 0
        for fileInfo in self.dataBlocks.getFiles(b):
            for x in fileInfo['LumiList']:
                # BUGFIX: was `lumis =+ len(...)` (plain assignment of +len),
                # which kept only the last entry instead of accumulating.
                lumis += len(x['LumiSectionNumber'])
        return lumis

    result = {}
    if block:
        result['NumberOfEvents'] = str(sum([x['NumberOfEvents'] for x in self.dataBlocks.getFiles(block)]))
        result['NumberOfFiles'] = str(len(self.dataBlocks.getFiles(block)))
        result['NumberOfLumis'] = str(getLumisectionsInBlock(block))
        result['path'] = dataset
        result['block'] = block
        result['OpenForWriting'] = '1' if self.dataBlocks._openForWriting() else '0'
    if dataset:
        if self.dataBlocks.getBlocks(dataset):
            result['NumberOfEvents'] = str(sum([x['NumberOfEvents'] for x in self.dataBlocks.getBlocks(dataset)]))
            result['NumberOfFiles'] = str(sum([x['NumberOfFiles'] for x in self.dataBlocks.getBlocks(dataset)]))
            lumis = 0
            for b in self.dataBlocks.getBlocks(dataset):
                lumis += b['NumberOfLumis']
            result['NumberOfLumis'] = str(lumis)
            result['path'] = dataset
    # Weird error handling follows, this is what dbs does
    if not result:
        raise DBSReaderError('DbsConnectionError: Database exception,Invalid parameters')
    return result
class DBS2Reader:
    """
    _DBSReader_

    General API for reading data from DBS
    """

    def __init__(self, url, **contact):
        """
        :param url: DBS server endpoint
        :param contact: extra DbsApi configuration overrides
        :raises DBSReaderError: when the DLS API cannot be set up
        """
        args = {"url": url, "level": 'ERROR', "version": ''}
        args.update(contact)
        self.dbs = DbsApi(args)

        # setup DLS api - with either dbs or phedex depending on dbs instance
        if url.count('cmsdbsprod.cern.ch/cms_dbs_prod_global') or \
           self.dbs.getServerInfo()['InstanceName'] == 'GLOBAL':
            dlsType = 'DLS_TYPE_PHEDEX'
            dlsUrl = 'https://cmsweb.cern.ch/phedex/datasvc/xml/prod'
        else:
            dlsType = 'DLS_TYPE_DBS'
            dlsUrl = url
        try:
            self.dls = dlsClient.getDlsApi(dls_type=dlsType, dls_endpoint=dlsUrl, version=args['version'])
        # Fixed py2-only "except X, ex" syntax (both clauses) to the
        # py3-compatible "as ex" form used elsewhere in this file; also
        # removed the long-dead commented-out DbsApi error handling.
        except DlsApiError as ex:
            msg = "Error in DBSReader with DlsApi\n"
            msg += "%s\n" % str(ex)
            raise DBSReaderError(msg)
        except DbsException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx(ex)
            raise DBSReaderError(msg)
def getDBSSummaryInfo(self, dataset=None, block=None):
    """
    Get dataset summary includes # of files, events, blocks and total size

    Only valid files are counted. Returns {} when DBS has no summary
    (missing data or all files invalid); raises DBSReaderError on failure.
    """
    if dataset:
        self.checkDatasetPath(dataset)

    query = {'validFileOnly': 1}
    if block:
        query['block_name'] = block
    else:
        query['dataset'] = dataset
    try:
        summary = self.dbs.listFileSummaries(**query)
    except Exception as ex:
        msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset, block)
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    if not summary:  # missing data or all files invalid
        return {}

    result = remapDBS3Keys(summary[0], stringify=True)
    result['path'] = dataset if dataset else ''
    result['block'] = block if block else ''
    return result
def blockToDatasetPath(self, blockName):
    """
    _blockToDatasetPath_

    Given a block name, get the dataset Path associated with that Block.

    Returns the dataset path, or None if not found
    """
    self.checkBlockName(blockName)
    try:
        matches = self.dbs.listBlocks(block_name=blockName, detail=True)
    except Exception as ex:
        msg = "Error in "
        msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    if matches == []:
        return None
    return matches[-1].get('dataset', None)