def _parseFile(self, iterator):
    block = None
    for idx, line in enumerate(iterator):
        try:
            # Found start of block:
            line = line.strip()
            if line.startswith(';'):
                continue
            elif line.startswith('['):
                if block:
                    yield self._finishBlock(block)
                block = self._createBlock(line)
            elif line != '':
                tmp = lmap(str.strip, utils.QM('[' in line, line.split(' = ', 1), rsplit(line, '=', 1)))
                if len(tmp) != 2:
                    raise DatasetError('Malformed entry in dataset file:\n%s' % line)
                key, value = tmp
                handlerInfo = self._handleEntry.get(key.lower(), None)
                if handlerInfo:
                    (prop, parser, msg) = handlerInfo
                    block[prop] = try_apply(value, parser, msg)
                else:
                    block[DataProvider.FileList].append(self._parseEntry(block, key, value))
        except Exception:
            raise DatasetError('Unable to parse %s:%d\n\t%s' % (repr(self._filename), idx, repr(line)))
    if block:
        yield self._finishBlock(block)
def _create_blocks(self, iterable):
    block = None
    for idx, line in enumerate(iterable):
        try:
            # Found start of block:
            line = line.strip()
            if line.startswith(';'):
                continue
            elif line.startswith('['):
                if block:
                    yield self._finish_block(block)
                block = self._create_block(line)
            elif line != '':
                if '[' in line:  # metadata on this line -> enforce whitespace '/path/file = ...'
                    tmp = lmap(str.strip, line.split(' = ', 1))
                else:  # loose whitespace convention (allow: '/path/file_var=B_test=1')
                    tmp = lmap(str.strip, rsplit(line, '=', 1))
                if len(tmp) != 2:
                    raise DatasetError('Malformed entry in dataset file:\n%s' % line)
                self._fill_block(block, *tmp)
        except Exception:
            raise DatasetError('Unable to parse %s:%d\n\t%s' % (repr(self._filename), idx, repr(line)))
    if block:
        yield self._finish_block(block)
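# A hedged sketch of the dataset-file format that _parseFile/_create_blocks expect, derived from
# the parsing rules above; paths, block names, and property keys are illustrative only:
#
#   ; lines starting with ';' are ignored as comments
#   [/PrimaryDS/Processed/TIER#block0]        <- '[' opens a new block
#   nickname = my_sample                      <- keys recognized by the entry handlers become block properties
#   /store/file_0.root = 1000                 <- all other entries are parsed as file entries
#   /store/file_1.root = 2000 [1, "A"]        <- '[' in the line enforces the ' = ' separator
#   /store/file_var=B_2.root=3000             <- otherwise the value is split off at the last '='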
def getBlocks(self, show_stats):
    statsProcessor = NullDataProcessor(config=None, onChange=None)
    if show_stats:
        statsProcessor = self._stats
    if self._cache_block is None:
        ec = ExceptionCollector()

        def getAllBlocks():
            for provider in self._providerList:
                try:
                    for block in provider.getBlocksNormed():
                        yield block
                except Exception:
                    ec.collect()
                if utils.abort():
                    raise DatasetError('Could not retrieve all datasets!')

        try:
            self._cache_block = list(statsProcessor.process(self._datasetProcessor.process(getAllBlocks())))
        except Exception:
            raise DatasetError('Unable to run datasets through processing pipeline!')
        ec.raise_any(DatasetError('Could not retrieve all datasets!'))
    return self._cache_block
def generateDatasetName(self, key, data):
    if self._discovery:
        return GCProvider.generateDatasetName(self, key, data)
    if 'CMSSW_DATATIER' not in data:
        raise DatasetError('Incompatible data tiers in dataset: %s' % data)
    getPathComponents = lambda path: utils.QM(path, tuple(path.strip('/').split('/')), ())
    userPath = getPathComponents(self.nameDS)

    (primary, processed, tier) = (None, None, None)
    # In case of a child dataset, use the parent info to construct the new path
    for parent in data.get('PARENT_PATH', []):
        if len(userPath) == 3:
            (primary, processed, tier) = userPath
        else:
            try:
                (primary, processed, tier) = getPathComponents(parent)
            except Exception:
                pass
    if (primary is None) and (len(userPath) > 0):
        primary = userPath[0]
        userPath = userPath[1:]

    if len(userPath) == 2:
        (processed, tier) = userPath
    elif len(userPath) == 1:
        (processed, tier) = (userPath[0], data['CMSSW_DATATIER'])
    elif len(userPath) == 0:
        (processed, tier) = ('Dataset_%s' % key, data['CMSSW_DATATIER'])

    rawDS = '/%s/%s/%s' % (primary, processed, tier)
    if None in (primary, processed, tier):
        raise DatasetError('Invalid dataset name supplied: %r\nresulting in %s' % (self.nameDS, rawDS))
    return utils.replaceDict(rawDS, data)
def getDatasets(self):
    if self._cache_dataset is None:
        self._cache_dataset = []
        ec = ExceptionCollector()
        for provider in self._providerList:
            try:
                self._cache_dataset.extend(provider.getDatasets())
            except Exception:
                ec.collect()
            if utils.abort():
                raise DatasetError('Could not retrieve all datasets!')
        ec.raise_any(DatasetError('Could not retrieve all datasets!'))
    return self._cache_dataset
def _readBlockFromConfig(self, ds_config, datasetExpr, datasetNick, datasetID):
    metadata_keys = parseJSON(ds_config.get('metadata', '[]', onChange=None))
    common_metadata = parseJSON(ds_config.get('metadata common', '[]', onChange=None))
    if len(common_metadata) > len(metadata_keys):
        raise DatasetError('Unable to set %d common metadata items with %d metadata keys' % (
            len(common_metadata), len(metadata_keys)))
    common_prefix = ds_config.get('prefix', '', onChange=None)
    file_list = []
    has_events = False
    has_se_list = False
    for url in ds_config.getOptions():
        if url == 'se list':
            has_se_list = True
        elif url == 'events':
            has_events = True
        elif url not in ['dataset hash', 'id', 'metadata', 'metadata common', 'nickname', 'prefix']:
            file_list.append(self._readFileFromConfig(ds_config, url, metadata_keys, common_metadata, common_prefix))
    if not file_list:
        raise DatasetError('There are no dataset files specified for dataset %r' % datasetExpr)
    result = {
        DataProvider.Nickname: ds_config.get('nickname', datasetNick, onChange=None),
        DataProvider.DatasetID: ds_config.getInt('id', datasetID, onChange=None),
        DataProvider.Dataset: datasetExpr,
        DataProvider.FileList: sorted(file_list, key=lambda fi: fi[DataProvider.URL]),
    }
    if metadata_keys:
        result[DataProvider.Metadata] = metadata_keys
    if has_events:
        result[DataProvider.NEntries] = ds_config.getInt('events', -1, onChange=None)
    if has_se_list:
        result[DataProvider.Locations] = parseList(ds_config.get('se list', '', onChange=None), ',')
    return result
def processBlock(self, block):
    blockDS = block[DataProvider.Dataset]
    oldNick = block.get(DataProvider.Nickname, '')
    newNick = self.getName(oldNick, blockDS, block)
    # Check if the nickname is used consistently in all blocks of a dataset
    if self._checkConsistency:
        if self._checkConsistencyData.setdefault(blockDS, newNick) != newNick:
            raise DatasetError('Different blocks of dataset "%s" have different nicknames: "%s" != "%s"' % (
                blockDS, self._checkConsistencyData[blockDS], newNick))
    if self._checkCollision:
        if self._checkCollisionData.setdefault(newNick, blockDS) != blockDS:
            raise DatasetError('Multiple datasets use the same nickname "%s": "%s" != "%s"' % (
                newNick, self._checkCollisionData[newNick], blockDS))
    block[DataProvider.Nickname] = newNick
    return block
def check_splitter(self, splitter):
    def _get_proposal(splitter):
        return reduce(lambda prop, prov: prov.check_splitter(prop), self._provider_list, splitter)
    prop_splitter = _get_proposal(splitter)
    if prop_splitter != _get_proposal(prop_splitter):
        raise DatasetError('Dataset providers could not agree on valid dataset splitter!')
    return prop_splitter
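# A minimal, self-contained sketch of the fixed-point check used by check_splitter above: each
# provider may replace the proposed splitter with one it supports, and the proposal is accepted
# only if a second pass through all providers leaves it unchanged. The provider rules and
# splitter names here are hypothetical.
from functools import reduce

def _demo_check_splitter(splitter_name):
    provider_rules = [
        lambda name: {'EventBoundarySplitter': 'FileBoundarySplitter'}.get(name, name),
        lambda name: name,  # this provider accepts any proposal
    ]
    def _get_proposal(name):
        return reduce(lambda prop, rule: rule(prop), provider_rules, name)
    proposal = _get_proposal(splitter_name)
    if proposal != _get_proposal(proposal):
        raise ValueError('Providers could not agree on a splitter!')
    return proposal

assert _demo_check_splitter('EventBoundarySplitter') == 'FileBoundarySplitter'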
def _splitJobs(self, fileList, eventsPerJob, firstEvent):
    nextEvent = firstEvent
    succEvent = nextEvent + eventsPerJob
    curEvent = 0
    lastEvent = 0
    curSkip = 0
    fileListIter = iter(fileList)
    job = {DataSplitter.Skipped: 0, DataSplitter.NEntries: 0, DataSplitter.FileList: []}
    while True:
        if curEvent >= lastEvent:
            try:
                fileObj = next(fileListIter)
            except StopIteration:
                if job[DataSplitter.FileList]:
                    yield job
                break
            nEvents = fileObj[DataProvider.NEntries]
            if nEvents < 0:
                raise DatasetError('EventBoundarySplitter does not support files with a negative number of events!')
            curEvent = lastEvent
            lastEvent = curEvent + nEvents
            curSkip = 0
        if nextEvent >= lastEvent:
            curEvent = lastEvent
            continue
        curSkip += nextEvent - curEvent
        curEvent = nextEvent
        available = lastEvent - curEvent
        if succEvent - nextEvent < available:
            available = succEvent - nextEvent
        if not len(job[DataSplitter.FileList]):
            job[DataSplitter.Skipped] = curSkip
        job[DataSplitter.NEntries] += available
        nextEvent += available
        job[DataSplitter.FileList].append(fileObj[DataProvider.URL])
        if DataProvider.Metadata in fileObj:
            job.setdefault(DataSplitter.Metadata, []).append(fileObj[DataProvider.Metadata])
        if nextEvent >= succEvent:
            succEvent += eventsPerJob
            yield job
            job = {DataSplitter.Skipped: 0, DataSplitter.NEntries: 0, DataSplitter.FileList: []}
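# Worked example (hand-traced from the loop above, not output of a real run): two files with
# 5 and 7 events and eventsPerJob=4, firstEvent=0 yield three partitions:
#   {Skipped: 0, NEntries: 4, FileList: [fileA]}         events 0-3 of fileA
#   {Skipped: 4, NEntries: 4, FileList: [fileA, fileB]}  last event of fileA plus the first 3 of fileB
#   {Skipped: 3, NEntries: 4, FileList: [fileB]}         remaining 4 events of fileB
# Skipped counts the events to skip in the first listed file before reading starts.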
def process_block(self, block):
    # Check uniqueness of URLs
    url_hash_list = []
    if self._check_url != DatasetUniqueMode.ignore:
        block[DataProvider.FileList] = list(self._process_fi_list(url_hash_list, block[DataProvider.FileList]))
        url_hash_list.sort()
    # Check uniqueness of blocks
    if self._check_block != DatasetUniqueMode.ignore:
        block_hash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
            url_hash_list, block[DataProvider.NEntries],
            block[DataProvider.Locations], block.get(DataProvider.Metadata))))
        if block_hash in self._recorded_block:
            msg = 'Multiple occurrences of block: "%s"!' % DataProvider.get_block_id(block)
            msg += ' (This check can be configured with %r)' % 'dataset check unique block'
            if self._check_block == DatasetUniqueMode.warn:
                self._log.warning(msg)
            elif self._check_block == DatasetUniqueMode.abort:
                raise DatasetError(msg)
            elif self._check_block == DatasetUniqueMode.skip:
                return None
        self._recorded_block.add(block_hash)
    return block
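# Note: the block hash above is an md5 over the repr() of the block's identifying fields
# (dataset, block name, sorted file hashes, entry count, locations, metadata), so two blocks
# are only reported as duplicates if all of these fields match.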
def _read_block(self, ds_config, dataset_expr, dataset_nick):
    metadata_name_list = parse_json(ds_config.get('metadata', '[]', on_change=None))
    common_metadata = parse_json(ds_config.get('metadata common', '[]', on_change=None))
    if len(common_metadata) > len(metadata_name_list):
        raise DatasetError('Unable to set %d common metadata items ' % len(common_metadata) +
            'with %d metadata keys' % len(metadata_name_list))
    common_prefix = ds_config.get('prefix', '', on_change=None)
    fn_list = []
    has_events = False
    has_se_list = False
    for url in ds_config.get_option_list():
        if url == 'se list':
            has_se_list = True
        elif url == 'events':
            has_events = True
        elif url not in ['dataset hash', 'metadata', 'metadata common', 'nickname', 'prefix']:
            fi = self._read_fi(ds_config, url, metadata_name_list, common_metadata, common_prefix)
            fn_list.append(fi)
    if not fn_list:
        raise DatasetError('There are no dataset files specified for dataset %r' % dataset_expr)
    result = {
        DataProvider.Nickname: ds_config.get('nickname', dataset_nick or '', on_change=None),
        DataProvider.FileList: sorted(fn_list, key=lambda fi: fi[DataProvider.URL])
    }
    result.update(DataProvider.parse_block_id(dataset_expr))
    if metadata_name_list:
        result[DataProvider.Metadata] = metadata_name_list
    if has_events:
        result[DataProvider.NEntries] = ds_config.get_int('events', -1, on_change=None)
    if has_se_list:
        result[DataProvider.Locations] = parse_list(ds_config.get('se list', '', on_change=None), ',')
    return result
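# A hedged sketch of a config section consumed by _read_block/_read_fi, reconstructed from the
# option names used above; the section name, URLs, and metadata values are illustrative only:
#
#   [dataset my_sample]
#   nickname = my_sample
#   prefix = root://se.example.org//store
#   se list = site_a.example.org, site_b.example.org
#   events = 3000
#   metadata = ["DATATYPE", "RUN"]
#   metadata common = ["mc"]
#   /sample/file_0.root = 1000 [2011]
#   /sample/file_1.root = 2000 [2012]
#
# 'metadata common' supplies values for the leading metadata keys shared by all files; each file
# entry is '<entries> [<json values for the remaining keys>]', and its URL becomes prefix + key.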
def _partition_block(self, fi_list, events_per_job, entry_first):
    event_next = entry_first
    event_succ = event_next + events_per_job
    event_current = 0
    event_prev = 0
    skip_current = 0
    fi_iter = iter(fi_list)
    proto_partition = {DataSplitter.Skipped: 0, DataSplitter.NEntries: 0, DataSplitter.FileList: []}
    while True:
        if event_current >= event_prev:
            fi = next(fi_iter, None)
            if fi is None:
                if proto_partition[DataSplitter.FileList]:
                    yield proto_partition
                break
            event_count = fi[DataProvider.NEntries]
            if event_count < 0:
                raise DatasetError('%s does not support files with a negative number of events!' % self.__class__.__name__)
            event_current = event_prev
            event_prev = event_current + event_count
            skip_current = 0
        if event_next >= event_prev:
            event_current = event_prev
            continue
        skip_current += event_next - event_current
        event_current = event_next
        available = event_prev - event_current
        if event_succ - event_next < available:
            available = event_succ - event_next
        if not proto_partition[DataSplitter.FileList]:
            proto_partition[DataSplitter.Skipped] = skip_current
        proto_partition[DataSplitter.NEntries] += available
        event_next += available
        proto_partition[DataSplitter.FileList].append(fi[DataProvider.URL])
        if DataProvider.Metadata in fi:
            proto_partition.setdefault(DataSplitter.Metadata, []).append(fi[DataProvider.Metadata])
        if event_next >= event_succ:
            event_succ += events_per_job
            yield proto_partition
            proto_partition = {DataSplitter.Skipped: 0, DataSplitter.NEntries: 0, DataSplitter.FileList: []}
def _get_dataset_name(self, metadata_dict, hash_dataset):
    if self._discovery:
        return GCProvider._get_dataset_name(self, metadata_dict, hash_dataset)
    if 'CMSSW_DATATIER' not in metadata_dict:
        raise DatasetError('Incompatible data tiers in dataset: %s' % repr(metadata_dict))

    def _get_path_components(path):
        if path:
            return path.strip('/').split('/')
        return []

    user_dataset_part_list = tuple(_get_path_components(self._dataset_pattern))
    (primary, processed, tier) = (None, None, None)
    # In case of a child dataset, use the parent info to construct the new path
    for parent in metadata_dict.get('PARENT_PATH', []):
        if len(user_dataset_part_list) == 3:
            (primary, processed, tier) = user_dataset_part_list
        else:
            try:
                (primary, processed, tier) = tuple(_get_path_components(parent))
            except Exception:
                clear_current_exception()
    if (primary is None) and (len(user_dataset_part_list) > 0):
        primary = user_dataset_part_list[0]
        user_dataset_part_list = user_dataset_part_list[1:]

    if len(user_dataset_part_list) == 2:
        (processed, tier) = user_dataset_part_list
    elif len(user_dataset_part_list) == 1:
        (processed, tier) = (user_dataset_part_list[0], metadata_dict['CMSSW_DATATIER'])
    elif len(user_dataset_part_list) == 0:
        (processed, tier) = ('Dataset_%s' % hash_dataset, metadata_dict['CMSSW_DATATIER'])

    raw_dataset_name = '/%s/%s/%s' % (primary, processed, tier)
    if None in (primary, processed, tier):
        raise DatasetError('Invalid dataset name supplied: %r\nresulting in %s' % (
            self._dataset_pattern, raw_dataset_name))
    return replace_with_dict(raw_dataset_name, metadata_dict)
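# Worked examples for _get_dataset_name (traced from the branches above, values are hypothetical):
#   _dataset_pattern = '', PARENT_PATH = ['/Prim/Proc/AODSIM'], CMSSW_DATATIER = 'USER',
#   hash_dataset = 'abc123'  ->  '/Prim/Dataset_abc123/USER'
#   _dataset_pattern = '/MyPrimary', no parent path, CMSSW_DATATIER = 'USER',
#   hash_dataset = 'abc123'  ->  '/MyPrimary/Dataset_abc123/USER'
# If neither a parent path nor the dataset pattern supplies the primary name, a DatasetError is raised.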
def getAllBlocks():
    for provider in self._providerList:
        try:
            for block in provider.getBlocks(silent):
                yield block
        except Exception:
            ec.collect()
        if utils.abort():
            raise DatasetError('Could not retrieve all datasets!')
def checkSplitter(self, splitter):
    def getProposal(x):
        return reduce(lambda prop, prov: prov.checkSplitter(prop), self._providerList, x)
    if getProposal(splitter) != getProposal(getProposal(splitter)):
        raise DatasetError('Dataset providers could not agree on valid dataset splitter!')
    return getProposal(splitter)
def get_dataset_name_list(self):
    if self._cache_dataset is None:
        self._cache_dataset = set()
        exc = ExceptionCollector()
        for provider in self._provider_list:
            try:
                self._cache_dataset.update(provider.get_dataset_name_list())
            except Exception:
                exc.collect()
        exc.raise_any(DatasetError('Could not retrieve all datasets!'))
    return list(self._cache_dataset)
def _readBlockFromConfig(self, config, datasetExpr, datasetNick, datasetID):
    common_metadata = parseJSON(config.get('metadata common', '[]', onChange=None))
    common_prefix = config.get('prefix', '', onChange=None)
    file_list = []
    has_events = False
    has_se_list = False
    for url in config.getOptions():
        if url == 'se list':
            has_se_list = True
        elif url == 'events':
            has_events = True
        elif url not in ['dataset hash', 'id', 'metadata', 'metadata common', 'nickname', 'prefix']:
            info = config.get(url, onChange=None)
            tmp = info.split(' ', 1)
            fi = {DataProvider.URL: common_prefix + url, DataProvider.NEntries: int(tmp[0])}
            if common_metadata:
                fi[DataProvider.Metadata] = common_metadata
            if len(tmp) == 2:
                fi[DataProvider.Metadata] = fi.get(DataProvider.Metadata, []) + parseJSON(tmp[1])
            file_list.append(fi)
    if not file_list:
        raise DatasetError('There are no dataset files specified for dataset %r' % datasetExpr)
    result = {
        DataProvider.Nickname: config.get('nickname', datasetNick, onChange=None),
        DataProvider.DatasetID: config.getInt('id', datasetID, onChange=None),
        DataProvider.Dataset: datasetExpr,
        DataProvider.Metadata: parseJSON(config.get('metadata', '[]', onChange=None)),
        DataProvider.FileList: sorted(file_list, key=lambda fi: fi[DataProvider.URL]),
    }
    if has_events:
        result[DataProvider.NEntries] = config.getInt('events', -1, onChange=None)
    if has_se_list:
        result[DataProvider.Locations] = parseList(config.get('se list', '', onChange=None), ',')
    return result
def processFI(fiList):
    for fi in fiList:
        urlHash = md5_hex(repr((fi[DataProvider.URL], fi[DataProvider.NEntries], fi.get(DataProvider.Metadata))))
        if urlHash in self._recordedURL:
            msg = 'Multiple occurrences of URL: %r!' % fi[DataProvider.URL]
            msg += ' (This check can be configured with %r)' % 'dataset check unique url'
            if self._checkURL == DatasetUniqueMode.warn:
                self._log.warning(msg)
            elif self._checkURL == DatasetUniqueMode.abort:
                raise DatasetError(msg)
            elif self._checkURL == DatasetUniqueMode.skip:
                continue
        self._recordedURL.add(urlHash)
        recordedBlockURL.append(urlHash)
        yield fi
def processBlock(self, block):
    # Check uniqueness of URLs
    recordedBlockURL = []
    if self._checkURL != DatasetUniqueMode.ignore:
        def processFI(fiList):
            for fi in fiList:
                urlHash = md5_hex(repr((fi[DataProvider.URL], fi[DataProvider.NEntries], fi.get(DataProvider.Metadata))))
                if urlHash in self._recordedURL:
                    msg = 'Multiple occurrences of URL: %r!' % fi[DataProvider.URL]
                    msg += ' (This check can be configured with %r)' % 'dataset check unique url'
                    if self._checkURL == DatasetUniqueMode.warn:
                        self._log.warning(msg)
                    elif self._checkURL == DatasetUniqueMode.abort:
                        raise DatasetError(msg)
                    elif self._checkURL == DatasetUniqueMode.skip:
                        continue
                self._recordedURL.add(urlHash)
                recordedBlockURL.append(urlHash)
                yield fi
        block[DataProvider.FileList] = list(processFI(block[DataProvider.FileList]))
        recordedBlockURL.sort()
    # Check uniqueness of blocks
    if self._checkBlock != DatasetUniqueMode.ignore:
        blockHash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
            recordedBlockURL, block[DataProvider.NEntries],
            block[DataProvider.Locations], block.get(DataProvider.Metadata))))
        if blockHash in self._recordedBlock:
            msg = 'Multiple occurrences of block: "%s"!' % DataProvider.bName(block)
            msg += ' (This check can be configured with %r)' % 'dataset check unique block'
            if self._checkBlock == DatasetUniqueMode.warn:
                self._log.warning(msg)
            elif self._checkBlock == DatasetUniqueMode.abort:
                raise DatasetError(msg)
            elif self._checkBlock == DatasetUniqueMode.skip:
                return None
        self._recordedBlock.add(blockHash)
    return block
def _iter_blocks_raw(self):
    def _filter_block(block):
        if self._filter:
            return self._filter in '/%s#' % DataProvider.get_block_id(block)
        return True
    try:
        fp = SafeFile(self._filename)
    except Exception:
        raise DatasetError('Unable to open dataset file %s' % repr(self._filename))
    for block in self._create_blocks(fp.iter_close()):
        if _filter_block(block):
            self._raise_on_abort()
            yield block
def _getBlocksInternal(self):
    def _filterBlock(block):
        if self._filter:
            name = '/%s#%s#' % (block[DataProvider.Dataset], block.get(DataProvider.BlockName, ''))
            return self._filter in name
        return True
    try:
        fp = open(self._filename, 'r')
    except Exception:
        raise DatasetError('Unable to open dataset file %s' % repr(self._filename))
    try:
        for block in self._parseFile(fp):
            if _filterBlock(block):
                yield block
        fp.close()
    except Exception:
        fp.close()
        raise
def _readFileFromConfig(self, ds_config, url, metadata_keys, common_metadata, common_prefix):
    info = ds_config.get(url, onChange=None)
    tmp = info.split(' ', 1)
    fi = {DataProvider.URL: common_prefix + url, DataProvider.NEntries: int(tmp[0])}
    if common_metadata:
        fi[DataProvider.Metadata] = common_metadata
    if len(tmp) == 2:
        file_metadata = parseJSON(tmp[1])
        if len(common_metadata) + len(file_metadata) > len(metadata_keys):
            raise DatasetError('Unable to set %d file metadata items with %d metadata keys (%d common metadata items)' % (
                len(file_metadata), len(metadata_keys), len(common_metadata)))
        fi[DataProvider.Metadata] = fi.get(DataProvider.Metadata, []) + file_metadata
    return fi
def getBlocks(self, silent=True):
    if self._cache_block is None:
        ec = ExceptionCollector()

        def getAllBlocks():
            for provider in self._providerList:
                try:
                    for block in provider.getBlocks(silent):
                        yield block
                except Exception:
                    ec.collect()
                if utils.abort():
                    raise DatasetError('Could not retrieve all datasets!')

        self._cache_block = list(self._stats.process(self._datasetProcessor.process(getAllBlocks())))
        ec.raise_any(DatasetError('Could not retrieve all datasets!'))
        logging.getLogger('user').info('Summary: Running over %s distributed over %d blocks.',
            *self._stats.getStats())
    return self._cache_block
def _read_fi(self, ds_config, url, metadata_name_list, common_metadata, common_prefix):
    info = ds_config.get(url, on_change=None)
    tmp = info.split(' ', 1)
    fi = {DataProvider.URL: common_prefix + url, DataProvider.NEntries: int(tmp[0])}
    if common_metadata:
        fi[DataProvider.Metadata] = common_metadata
    if len(tmp) == 2:
        file_metadata = parse_json(tmp[1])
        if len(common_metadata) + len(file_metadata) > len(metadata_name_list):
            raise DatasetError('Unable to set %d file metadata items ' % len(file_metadata) +
                'with %d metadata keys ' % len(metadata_name_list) +
                '(%d common metadata items)' % len(common_metadata))
        fi[DataProvider.Metadata] = fi.get(DataProvider.Metadata, []) + file_metadata
    return fi
def _handleError(self, msg, mode):
    if mode == DatasetCheckMode.warn:
        self._log.warning(msg)
    elif mode == DatasetCheckMode.abort:
        raise DatasetError(msg)
def get_block_list_cached(self, show_stats):
    exc = ExceptionCollector()
    result = self._create_block_cache(show_stats, lambda: self._iter_all_blocks(exc))
    exc.raise_any(DatasetError('Could not retrieve all datasets!'))
    return result
def _try_apply(value, fun, desc):
    try:
        return fun(value)
    except Exception:
        raise DatasetError('Unable to parse %s: %s' % (desc, repr(value)))
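# Usage sketch: _try_apply('1000', int, 'number of events') returns 1000, while
# _try_apply('abc', int, 'number of events') raises
# DatasetError("Unable to parse number of events: 'abc'").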