def setupJobParameters(self, config, pm):
	config = config.addSections(['dataset']).addTags([self])
	self.dataSplitter = None
	self.dataRefresh = None
	self.dataset = config.get('dataset', '').strip()
	if self.dataset == '':
		return
	config.set('se output pattern', '@NICK@_job_@MY_JOBID@_@X@', override = False)
	config.set('default lookup', 'DATASETNICK', override = False)

	defaultProvider = config.get('dataset provider', 'ListProvider')
	dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
	splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
	splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
	self.dataSplitter = splitterClass(config)
	self.checkSE = config.getBool('dataset storage check', True, onChange = None)

	# Create and register dataset parameter plugin
	paramSource = DataParameterSource(config.getWorkPath(), 'data',
		dataProvider, self.dataSplitter, self.initDataProcessor())
	DataParameterSource.datasetsAvailable['data'] = paramSource

	# Select dataset refresh rate
	self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
	if self.dataRefresh > 0:
		paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
		utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
	else:
		paramSource.resyncSetup(interval = 0)

	def externalRefresh(sig, frame):
		paramSource.resyncSetup(force = True)
	signal.signal(signal.SIGUSR2, externalRefresh)

	if self.dataSplitter.getMaxJobs() == 0:
		raise UserError('There are no events to process')

def getEntries(self, path, metadata, events, seList, objStore):
	datacachePath = os.path.join(objStore.get('GC_WORKDIR', ''), 'datacache.dat')
	source = utils.QM((self.source == '') and os.path.exists(datacachePath), datacachePath, self.source)
	if source and (source not in self.lfnMap):
		pSource = DataProvider.create(createConfigFactory().getConfig(), source, 'ListProvider')
		for (n, fl) in map(lambda b: (b[DataProvider.Dataset], b[DataProvider.FileList]), pSource.getBlocks()):
			self.lfnMap.setdefault(source, {}).update(dict(map(lambda fi: (self.lfnTrans(fi[DataProvider.URL]), n), fl)))
	pList = set()
	for key in filter(lambda k: k in metadata, self.parentKeys):
		pList.update(map(lambda pPath: self.lfnMap.get(source, {}).get(self.lfnTrans(pPath)), metadata[key]))
	metadata['PARENT_PATH'] = filter(lambda x: x, pList)
	yield (path, metadata, events, seList, objStore)

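# A standalone sketch of the parent lookup idea in getEntries above, using a
# plain dict in place of the DataProvider block structures; 'lfnTrans' here is
# a simplified, hypothetical stand-in that strips any protocol prefix before
# comparing paths.
def lfnTrans(path):
	return path.split('//')[-1]

lfnMap = {lfnTrans('file:///store/parent_1.root'): '/PrimDS/Proc/RECO'}
parentPaths = ['/store/parent_1.root', '/store/unknown.root']
resolved = [ds for ds in map(lambda p: lfnMap.get(lfnTrans(p)), parentPaths) if ds]
print(resolved)  # ['/PrimDS/Proc/RECO']
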
def setupJobParameters(self, config, pm):
	config = config.changeView(viewClass = TaggedConfigView, addSections = ['dataset'], addTags = [self])
	self.dataSplitter = None
	self.dataRefresh = None
	self._forceRefresh = config.getState('resync', detail = 'dataset', default = False)
	def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
		if (old_obj == '') and (cur_obj != ''):
			raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
		self._forceRefresh = True
		return cur_obj
	self.dataset = config.get('dataset', '', onChange = userRefresh).strip()
	if self.dataset == '':
		return
	config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
	config.set('default lookup', 'DATASETNICK')

	defaultProvider = config.get('dataset provider', 'ListProvider')
	dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
	splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
	splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
	self.dataSplitter = splitterClass(config)

	# Create and register dataset parameter source
	paramSplitProcessor = config.getCompositePlugin('dataset processor',
		'BasicDataSplitProcessor SECheckSplitProcessor', 'MultiDataSplitProcessor',
		cls = DataSplitProcessor).getInstance(config)
	paramSource = DataParameterSource(config.getWorkPath(), 'data',
		dataProvider, self.dataSplitter, paramSplitProcessor)
	DataParameterSource.datasetsAvailable['data'] = paramSource

	# Select dataset refresh rate
	self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
	if self.dataRefresh > 0:
		paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
		utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
	else:
		paramSource.resyncSetup(interval = 0)
	if self._forceRefresh:
		paramSource.resyncSetup(force = True)

	def externalRefresh(sig, frame):
		paramSource.resyncSetup(force = True)
	signal.signal(signal.SIGUSR2, externalRefresh)

	if self.dataSplitter.getMaxJobs() == 0:
		raise UserError('There are no events to process')

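# Minimal standalone sketch of the SIGUSR2 refresh hook installed above; the
# 'triggerResync' function is a hypothetical stand-in for
# paramSource.resyncSetup(force = True). POSIX only, since SIGUSR2 is used.
import os
import signal

def triggerResync():
	print('dataset resync requested')

def externalRefresh(sig, frame):
	triggerResync()

signal.signal(signal.SIGUSR2, externalRefresh)
os.kill(os.getpid(), signal.SIGUSR2)  # same effect as 'kill -USR2 <pid>' from a shell
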
def __init__(self, config):
	dsPath = config.get('source dataset path')
	self.source = DataProvider.create(config, None, dsPath, 'ListProvider')

def main():
	dataset = args[0].strip()
	cfgSettings = {'dbs blacklist T1': 'False', 'remove empty blocks': 'False',
		'remove empty files': 'False', 'location format': opts.locationfmt,
		'nickname check collision': 'False'}
	if opts.metadata or opts.blockmetadata:
		cfgSettings['lumi filter'] = '-'
		cfgSettings['keep lumi metadata'] = 'True'

	section = 'dataset'
	fillerList = [DefaultFilesConfigFiller()]
	if opts.settings:
		fillerList.append(FileConfigFiller([opts.settings]))
		tmpCfg = Config(fillerList, opts.settings)
		section = tmpCfg.get('global', ['task', 'module'])
	dummyConfig = Config(fillerList + [DictConfigFiller({section: cfgSettings})], opts.settings)
	dummyConfig.opts = opts
	dummyConfig = dummyConfig.addSections(['dataset'])

	if os.path.exists(dataset):
		provider = DataProvider.loadState(dataset, dummyConfig)
	else:
		provider = DataProvider.create(dummyConfig, dataset, opts.provider)
	blocks = provider.getBlocks()
	if len(blocks) == 0:
		raise DatasetError('No blocks!')

	datasets = set(map(lambda x: x[DataProvider.Dataset], blocks))
	if len(datasets) > 1 or opts.info:
		headerbase = [(DataProvider.Dataset, 'Dataset')]
	else:
		print 'Dataset: %s' % blocks[0][DataProvider.Dataset]
		headerbase = []

	if opts.configentry:
		print
		print 'dataset ='
		infos = {}
		order = []
		maxnick = 5
		for block in blocks:
			dsName = block[DataProvider.Dataset]
			if not infos.get(dsName, None):
				order.append(dsName)
				infos[dsName] = dict([(DataProvider.Dataset, dsName)])
				if DataProvider.Nickname not in block and opts.confignick:
					try:
						if '/' in dsName:
							block[DataProvider.Nickname] = dsName.lstrip('/').split('/')[1]
						else:
							block[DataProvider.Nickname] = dsName
					except Exception:
						pass
				if DataProvider.Nickname not in block and opts.confignick:
					block[DataProvider.Nickname] = np.getName(None, dsName, block)
				if DataProvider.Nickname in block:
					nick = block[DataProvider.Nickname]
					infos[dsName][DataProvider.Nickname] = nick
					maxnick = max(maxnick, len(nick))
				if len(block[DataProvider.FileList]):
					infos[dsName][DataProvider.URL] = block[DataProvider.FileList][0][DataProvider.URL]
		for dsID, dsName in enumerate(order):
			info = infos[dsName]
			short = DataProvider.providers.get(provider.__class__.__name__, provider.__class__.__name__)
			print '', info.get(DataProvider.Nickname, 'nick%d' % dsID).rjust(maxnick), ':', short, ':',
			print '%s%s' % (provider._datasetExpr, QM(short == 'list', ' %% %s' % info[DataProvider.Dataset], ''))

	if opts.listdatasets:
		# Add some enums for consistent access to info dicts
		DataProvider.NFiles = -1
		DataProvider.NBlocks = -2

		print
		infos = {}
		order = []
		infosum = {DataProvider.Dataset: 'Sum'}
		for block in blocks:
			dsName = block.get(DataProvider.Dataset, '')
			if not infos.get(dsName, None):
				order.append(dsName)
				infos[dsName] = {DataProvider.Dataset: block[DataProvider.Dataset]}
			def updateInfos(target):
				target[DataProvider.NBlocks] = target.get(DataProvider.NBlocks, 0) + 1
				target[DataProvider.NFiles] = target.get(DataProvider.NFiles, 0) + len(block[DataProvider.FileList])
				target[DataProvider.NEntries] = target.get(DataProvider.NEntries, 0) + block[DataProvider.NEntries]
			updateInfos(infos[dsName])
			updateInfos(infosum)
		head = [(DataProvider.Dataset, 'Dataset'), (DataProvider.NEntries, '#Events'),
			(DataProvider.NBlocks, '#Blocks'), (DataProvider.NFiles, '#Files')]
		utils.printTabular(head, map(lambda x: infos[x], order) + ['=', infosum])

	if opts.listblocks:
		print
		utils.printTabular(headerbase + [(DataProvider.BlockName, 'Block'), (DataProvider.NEntries, 'Events')], blocks)

	if opts.listfiles:
		print
		for block in blocks:
			if len(datasets) > 1:
				print 'Dataset: %s' % block[DataProvider.Dataset]
			print 'Blockname: %s' % block[DataProvider.BlockName]
			utils.printTabular([(DataProvider.URL, 'Filename'), (DataProvider.NEntries, 'Events')], block[DataProvider.FileList])
			print

	def printMetadata(src, maxlen):
		for (mk, mv) in src:
			if len(str(mv)) > 200:
				mv = '<metadata entry size: %s> %s...' % (len(str(mv)), repr(mv)[:200])
			print '\t%s: %s' % (mk.rjust(maxlen), mv)
		if src:
			print

	if opts.metadata and not opts.save:
		print
		for block in blocks:
			if len(datasets) > 1:
				print 'Dataset: %s' % block[DataProvider.Dataset]
			print 'Blockname: %s' % block[DataProvider.BlockName]
			mk_len = max(map(len, block.get(DataProvider.Metadata, [''])))
			for f in block[DataProvider.FileList]:
				print '%s [%d events]' % (f[DataProvider.URL], f[DataProvider.NEntries])
				printMetadata(zip(block.get(DataProvider.Metadata, []), f.get(DataProvider.Metadata, [])), mk_len)
			print

	if opts.blockmetadata and not opts.save:
		for block in blocks:
			if len(datasets) > 1:
				print 'Dataset: %s' % block[DataProvider.Dataset]
			print 'Blockname: %s' % block[DataProvider.BlockName]
			mkdict = lambda x: dict(zip(block[DataProvider.Metadata], x[DataProvider.Metadata]))
			metadata = QM(block[DataProvider.FileList], mkdict(block[DataProvider.FileList][0]), {})
			for fileInfo in block[DataProvider.FileList]:
				utils.intersectDict(metadata, mkdict(fileInfo))
			printMetadata(metadata.items(), max(map(len, metadata.keys())))

	if opts.liststorage:
		print
		infos = {}
		print 'Storage elements:'
		for block in blocks:
			dsName = block[DataProvider.Dataset]
			if len(headerbase) > 0:
				print 'Dataset: %s' % dsName
			if block.get(DataProvider.BlockName, None):
				print 'Blockname: %s' % block[DataProvider.BlockName]
			if block[DataProvider.Locations] is None:
				print '\tNo location constraint specified'
			elif block[DataProvider.Locations] == []:
				print '\tNot located anywhere'
			else:
				for se in block[DataProvider.Locations]:
					print '\t%s' % se
			print

	if opts.info:
		evSum = 0
		for block in blocks:
			print block.get(DataProvider.Dataset, '-'),
			print block.get(DataProvider.BlockName, '-'),
			if block.get(DataProvider.Locations, None):
				print str.join(',', block.get(DataProvider.Locations, '-')),
			else:
				print '-',
			print block.get(DataProvider.NEntries, 0),
			evSum += block.get(DataProvider.NEntries, 0)
			print evSum

	if opts.save:
		print
		blocks = provider.getBlocks()
		if opts.sort:
			blocks.sort(key = lambda b: b[DataProvider.Dataset] + '#' + b[DataProvider.BlockName])
			for b in blocks:
				b[DataProvider.FileList].sort(key = lambda fi: fi[DataProvider.URL])
		provider.saveState(opts.save, blocks)
		print 'Dataset information saved to ./%s' % opts.save

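# Standalone sketch of the per-dataset aggregation behind --listdatasets,
# using string keys instead of the DataProvider enums; the block values are
# made-up sample data.
blocks = [
	{'Dataset': '/A/x/RECO', 'NFiles': 3, 'NEntries': 300},
	{'Dataset': '/A/x/RECO', 'NFiles': 1, 'NEntries': 50},
	{'Dataset': '/B/y/RECO', 'NFiles': 2, 'NEntries': 200},
]
infos = {}
infosum = {'Dataset': 'Sum', 'NBlocks': 0, 'NFiles': 0, 'NEntries': 0}
for block in blocks:
	target = infos.setdefault(block['Dataset'],
		{'Dataset': block['Dataset'], 'NBlocks': 0, 'NFiles': 0, 'NEntries': 0})
	for acc in (target, infosum):
		acc['NBlocks'] += 1
		acc['NFiles'] += block['NFiles']
		acc['NEntries'] += block['NEntries']
print(infos['/A/x/RECO'])  # {'Dataset': '/A/x/RECO', 'NBlocks': 2, 'NFiles': 4, 'NEntries': 350}
print(infosum)             # {'Dataset': 'Sum', 'NBlocks': 3, 'NFiles': 6, 'NEntries': 550}
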
def main():
	dataset = args[0].strip()
	cfgSettings = {'dbs blacklist T1 *': 'False', 'remove empty blocks *': 'False',
		'remove empty files *': 'False', 'location format *': opts.locationfmt,
		'nickname check collision *': 'False'}
	if opts.metadata or opts.blockmetadata:
		cfgSettings['lumi filter *'] = '-'
		cfgSettings['keep lumi metadata *'] = 'True'
	config = getConfig(configFile = opts.settings, configDict = {'dataset': cfgSettings})

	if os.path.exists(dataset):
		provider = DataProvider.getInstance('ListProvider', config, dataset, None)
	else:
		provider = DataProvider.create(config, dataset, opts.provider)
	blocks = provider.getBlocks()
	if len(blocks) == 0:
		raise DatasetError('No blocks!')

	datasets = set(map(lambda x: x[DataProvider.Dataset], blocks))
	if len(datasets) > 1 or opts.info:
		headerbase = [(DataProvider.Dataset, 'Dataset')]
	else:
		print('Dataset: %s' % blocks[0][DataProvider.Dataset])
		headerbase = []

	if opts.configentry:
		print('')
		print('dataset =')
		infos = {}
		order = []
		maxnick = 5
		for block in blocks:
			dsName = block[DataProvider.Dataset]
			if not infos.get(dsName, None):
				order.append(dsName)
				infos[dsName] = dict([(DataProvider.Dataset, dsName)])
				if DataProvider.Nickname not in block and opts.confignick:
					try:
						if '/' in dsName:
							block[DataProvider.Nickname] = dsName.lstrip('/').split('/')[1]
						else:
							block[DataProvider.Nickname] = dsName
					except Exception:
						pass
				if DataProvider.Nickname not in block and opts.confignick:
					block[DataProvider.Nickname] = np.getName(None, dsName, block)
				if DataProvider.Nickname in block:
					nick = block[DataProvider.Nickname]
					infos[dsName][DataProvider.Nickname] = nick
					maxnick = max(maxnick, len(nick))
				if len(block[DataProvider.FileList]):
					infos[dsName][DataProvider.URL] = block[DataProvider.FileList][0][DataProvider.URL]
		for dsID, dsName in enumerate(order):
			info = infos[dsName]
			short = DataProvider.providers.get(provider.__class__.__name__, provider.__class__.__name__)
			nickname = info.get(DataProvider.Nickname, 'nick%d' % dsID).rjust(maxnick)
			filterExpr = utils.QM(short == 'list', ' %% %s' % info[DataProvider.Dataset], '')
			print('\t%s : %s : %s%s' % (nickname, short, provider._datasetExpr, filterExpr))

	if opts.listdatasets:
		# Add some enums for consistent access to info dicts
		DataProvider.NFiles = -1
		DataProvider.NBlocks = -2

		print('')
		infos = {}
		order = []
		infosum = {DataProvider.Dataset: 'Sum'}
		for block in blocks:
			dsName = block.get(DataProvider.Dataset, '')
			if not infos.get(dsName, None):
				order.append(dsName)
				infos[dsName] = {DataProvider.Dataset: block[DataProvider.Dataset]}
			def updateInfos(target):
				target[DataProvider.NBlocks] = target.get(DataProvider.NBlocks, 0) + 1
				target[DataProvider.NFiles] = target.get(DataProvider.NFiles, 0) + len(block[DataProvider.FileList])
				target[DataProvider.NEntries] = target.get(DataProvider.NEntries, 0) + block[DataProvider.NEntries]
			updateInfos(infos[dsName])
			updateInfos(infosum)
		head = [(DataProvider.Dataset, 'Dataset'), (DataProvider.NEntries, '#Events'),
			(DataProvider.NBlocks, '#Blocks'), (DataProvider.NFiles, '#Files')]
		utils.printTabular(head, list(map(lambda x: infos[x], order)) + ['=', infosum])

	if opts.listblocks:
		print('')
		utils.printTabular(headerbase + [(DataProvider.BlockName, 'Block'), (DataProvider.NEntries, 'Events')], blocks)

	if opts.listfiles:
		print('')
		for block in blocks:
			if len(datasets) > 1:
				print('Dataset: %s' % block[DataProvider.Dataset])
			print('Blockname: %s' % block[DataProvider.BlockName])
			utils.printTabular([(DataProvider.URL, 'Filename'), (DataProvider.NEntries, 'Events')], block[DataProvider.FileList])
			print('')

	def printMetadata(src, maxlen):
		for (mk, mv) in src:
			if len(str(mv)) > 200:
				mv = '<metadata entry size: %s> %s...' % (len(str(mv)), repr(mv)[:200])
			print('\t%s: %s' % (mk.rjust(maxlen), mv))
		if src:
			print('')

	if opts.metadata and not opts.save:
		print('')
		for block in blocks:
			if len(datasets) > 1:
				print('Dataset: %s' % block[DataProvider.Dataset])
			print('Blockname: %s' % block[DataProvider.BlockName])
			mk_len = max(map(len, block.get(DataProvider.Metadata, [''])))
			for f in block[DataProvider.FileList]:
				print('%s [%d events]' % (f[DataProvider.URL], f[DataProvider.NEntries]))
				printMetadata(list(zip(block.get(DataProvider.Metadata, []), f.get(DataProvider.Metadata, []))), mk_len)
			print('')

	if opts.blockmetadata and not opts.save:
		for block in blocks:
			if len(datasets) > 1:
				print('Dataset: %s' % block[DataProvider.Dataset])
			print('Blockname: %s' % block[DataProvider.BlockName])
			mkdict = lambda x: dict(zip(block[DataProvider.Metadata], x[DataProvider.Metadata]))
			metadata = utils.QM(block[DataProvider.FileList], mkdict(block[DataProvider.FileList][0]), {})
			for fileInfo in block[DataProvider.FileList]:
				utils.intersectDict(metadata, mkdict(fileInfo))
			printMetadata(metadata.items(), max(map(len, metadata.keys())))

	if opts.liststorage:
		print('')
		infos = {}
		print('Storage elements:')
		for block in blocks:
			dsName = block[DataProvider.Dataset]
			if len(headerbase) > 0:
				print('Dataset: %s' % dsName)
			if block.get(DataProvider.BlockName, None):
				print('Blockname: %s' % block[DataProvider.BlockName])
			if block[DataProvider.Locations] is None:
				print('\tNo location constraint specified')
			elif block[DataProvider.Locations] == []:
				print('\tNot located anywhere')
			else:
				for se in block[DataProvider.Locations]:
					print('\t%s' % se)
			print('')

	if opts.info:
		evSum = 0
		for block in blocks:
			blockId = '%s %s' % (block.get(DataProvider.Dataset, '-'), block.get(DataProvider.BlockName, '-'))
			blockStorage = '-'
			if block.get(DataProvider.Locations, None):
				blockStorage = str.join(',', block.get(DataProvider.Locations, '-'))
			evSum += block.get(DataProvider.NEntries, 0)
			print('%s %s %d %d' % (blockId, blockStorage, block.get(DataProvider.NEntries, 0), evSum))

	if opts.save:
		print('')
		blocks = provider.getBlocks()
		if opts.sort:
			blocks.sort(key = lambda b: b[DataProvider.Dataset] + '#' + b[DataProvider.BlockName])
			for b in blocks:
				b[DataProvider.FileList].sort(key = lambda fi: fi[DataProvider.URL])
		provider.saveState(opts.save, blocks)
		print('Dataset information saved to ./%s' % opts.save)

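# Sketch of the metadata intersection behind --blockmetadata, with a plain-dict
# substitute for utils.intersectDict: only key/value pairs shared by every file
# of a block survive. Sample metadata values are made up.
def intersectDict(base, other):
	for key in list(base):
		if (key not in other) or (base[key] != other[key]):
			base.pop(key)

fileMetadata = [
	{'CMSSW_VERSION': 'CMSSW_5_3_14', 'RUN': 1},
	{'CMSSW_VERSION': 'CMSSW_5_3_14', 'RUN': 2},
]
common = dict(fileMetadata[0])
for md in fileMetadata[1:]:
	intersectDict(common, md)
print(common)  # {'CMSSW_VERSION': 'CMSSW_5_3_14'}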