def dataset_show_diff(options):
    if len(options.args) != 2:
        options.parser.exit_with_usage(options.parser.usage('data'))
    provider_a = DataProvider.load_from_file(options.args[0])
    provider_b = DataProvider.load_from_file(options.args[1])
    block_resync_tuple = DataProvider.resync_blocks(
        provider_a.get_block_list_cached(show_stats=False),
        provider_b.get_block_list_cached(show_stats=False))
    (block_list_added, block_list_missing, block_list_matching) = block_resync_tuple

    def _dataset_iter_matching_blocks():
        for (block_old, block_new, _, _) in block_list_matching:
            def _format_change(old, new):
                if old != new:
                    return '%s -> %s' % (old, new)
                return old
            block_old[DataProvider.NFiles] = _format_change(
                len(block_old.get(DataProvider.FileList, [])),
                len(block_new.get(DataProvider.FileList, [])))
            block_old[DataProvider.NEntries] = _format_change(
                block_old[DataProvider.NEntries], block_new[DataProvider.NEntries])
            yield block_old

    header_list = [(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block'),
        (DataProvider.NFiles, '#Files'), (DataProvider.NEntries, '#Entries')]
    if block_list_added:
        ConsoleTable.create(header_list, dataset_iter_blocks(block_list_added), title='Added blocks')
    if block_list_missing:
        ConsoleTable.create(header_list, dataset_iter_blocks(block_list_missing), title='Removed blocks')
    if block_list_matching:
        ConsoleTable.create(header_list, _dataset_iter_matching_blocks(), title='Matching blocks')

def discover_blocks(options):
    # Get work directory, create dbs dump directory
    if os.path.isdir(options.args[0]):
        work_dn = os.path.abspath(os.path.normpath(options.args[0]))
    else:
        work_dn = gc_create_config(config_file=options.args[0]).get_work_path()
    if not options.opts.tempdir:
        options.opts.tempdir = os.path.join(work_dn, 'dbs')
    if not os.path.exists(options.opts.tempdir):
        os.mkdir(options.opts.tempdir)
    # get provider with dataset information
    config = gc_create_config(config_dict={'dataset': options.config_dict}, load_old_config=False)
    if options.opts.input_file:
        provider = DataProvider.create_instance('ListProvider', config, 'dataset', options.opts.input_file)
    else:
        provider = DataProvider.create_instance('DBSInfoProvider', config, 'dataset', options.args[0])
    blocks = provider.get_block_list_cached(show_stats=False)
    DataProvider.save_to_file(os.path.join(options.opts.tempdir, 'dbs.dat'), blocks)
    if options.opts.discovery:
        sys.exit(os.EX_OK)
    return blocks

def __init__(self, config, datasetExpr, datasetNick = None):
    self._changeTrigger = triggerResync(['datasets', 'parameters'])
    self._lumi_filter = config.getLookup('lumi filter', {}, parser = parseLumiFilter,
        strfun = strLumi, onChange = self._changeTrigger)
    if not self._lumi_filter.empty():
        config.set('dataset processor', 'LumiDataProcessor', '+=')
    DataProvider.__init__(self, config, datasetExpr, datasetNick)
    # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
    self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(),
        onChange = self._changeTrigger)
    config.set('phedex sites matcher mode', 'shell', '?=')
    # PhEDex blacklist: T1_*_Disk nodes allow user jobs - other T1 sites don't!
    self._phedexFilter = config.getFilter('phedex sites', '-* T1_*_Disk T2_* T3_*',
        defaultMatcher = 'blackwhite', defaultFilter = 'strict', onChange = self._changeTrigger)
    self._onlyComplete = config.getBool('only complete sites', True, onChange = self._changeTrigger)
    self._locationFormat = config.getEnum('location format', CMSLocationFormat,
        CMSLocationFormat.hostname, onChange = self._changeTrigger)
    self._pjrc = JSONRestClient(url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
    self._sitedb = SiteDB()
    (self._datasetPath, self._datasetInstance, self._datasetBlock) = optSplit(datasetExpr, '@#')
    instance_default = config.get('dbs instance', '', onChange = self._changeTrigger)
    self._datasetInstance = self._datasetInstance or instance_default
    if not self._datasetInstance:
        self._datasetInstance = 'prod/global'
    elif '/' not in self._datasetInstance:
        self._datasetInstance = 'prod/%s' % self._datasetInstance
    self._datasetBlock = self._datasetBlock or 'all'
    self.onlyValid = config.getBool('only valid', True, onChange = self._changeTrigger)

def discover_blocks(options):
    # Get work directory, create dbs dump directory
    if os.path.isdir(options.args[0]):
        workDir = os.path.abspath(os.path.normpath(options.args[0]))
    else:
        workDir = getConfig(configFile=options.args[0]).getWorkPath()
    if not options.opts.tempdir:
        options.opts.tempdir = os.path.join(workDir, 'dbs')
    if not os.path.exists(options.opts.tempdir):
        os.mkdir(options.opts.tempdir)
    # get provider with dataset information
    if options.opts.input_file:
        provider = DataProvider.createInstance('ListProvider', getConfig(), options.opts.input_file, None)
    else:
        config = getConfig(configDict={'dataset': options.config_dict})
        provider = DataProvider.createInstance('DBSInfoProvider', config, options.args[0], None)
    blocks = provider.getBlocks(show_stats=False)
    DataProvider.saveToFile(os.path.join(options.opts.tempdir, 'dbs.dat'), blocks)
    if options.opts.discovery:
        sys.exit(os.EX_OK)
    return blocks

def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, repository, keepOld = True):
    LimitedResyncParameterSource.__init__(self)
    (self._dn, self._name, self._data_provider, self._data_splitter, self._part_proc, self._keepOld) = \
        (dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld)
    repository['dataset:%s' % srcName] = self
    self.resyncSetup(interval = -1)
    if not dataProvider:  # debug mode - used by scripts - disables resync
        self._maxN = self._data_splitter.getMaxJobs()
        return
    # look for aborted resyncs - and try to restore old state if possible
    if self._existsDataPath('cache.dat.resync') and self._existsDataPath('map.tar.resync'):
        utils.renameFile(self._getDataPath('cache.dat.resync'), self._getDataPath('cache.dat'))
        utils.renameFile(self._getDataPath('map.tar.resync'), self._getDataPath('map.tar'))
    elif self._existsDataPath('cache.dat.resync') or self._existsDataPath('map.tar.resync'):
        raise DatasetError('Found broken resync state')
    if self._existsDataPath('cache.dat') and self._existsDataPath('map.tar'):
        self._data_splitter.importPartitions(self._getDataPath('map.tar'))
    else:
        DataProvider.saveToFile(self._getDataPath('cache.dat'),
            self._data_provider.getBlocks(show_stats = False))
        self._data_splitter.splitDataset(self._getDataPath('map.tar'),
            self._data_provider.getBlocks(show_stats = False))
    self._maxN = self._data_splitter.getMaxJobs()

def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
    dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
    self._lumi_filter = dataset_config.get_lookup(['lumi filter', '%s lumi filter' % datasource_name],
        default={}, parser=parse_lumi_filter, strfun=str_lumi)
    if not self._lumi_filter.empty():
        config.set('%s processor' % datasource_name, 'LumiDataProcessor', '+=')
    DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
    # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
    self._lumi_query = dataset_config.get_bool(
        ['lumi metadata', '%s lumi metadata' % datasource_name],
        default=not self._lumi_filter.empty())
    config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
    # PhEDex blacklist: T1_*_Disk nodes allow user jobs - other T1 sites don't!
    self._phedex_filter = dataset_config.get_filter('phedex sites', '-* T1_*_Disk T2_* T3_*',
        default_matcher='BlackWhiteMatcher', default_filter='StrictListFilter')
    self._only_complete = dataset_config.get_bool('only complete sites', True)
    self._only_valid = dataset_config.get_bool('only valid', True)
    self._location_format = dataset_config.get_enum('location format',
        CMSLocationFormat, CMSLocationFormat.hostname)
    self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
    self._sitedb = SiteDB()
    dataset_expr_parts = split_opt(dataset_expr, '@#')
    (self._dataset_path, self._dataset_instance, self._dataset_block_selector) = dataset_expr_parts
    instance_default = dataset_config.get('dbs instance', '')
    self._dataset_instance = self._dataset_instance or instance_default
    if not self._dataset_instance:
        self._dataset_instance = 'prod/global'
    elif '/' not in self._dataset_instance:
        self._dataset_instance = 'prod/%s' % self._dataset_instance
    self._dataset_block_selector = self._dataset_block_selector or 'all'

def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
    self._lumi_filter = parseLumiFilter(config.get('lumi filter', ''))
    if self._lumi_filter:
        config.set('dataset processor', 'LumiDataProcessor', '+=')
    DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
    # PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1 sites don't!
    self._lumi_query = config.getBool('lumi metadata', self._lumi_filter != [])
    self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
        defaultMatcher='blackwhite', defaultFilter='weak')
    self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
        defaultMatcher='blackwhite', defaultFilter='weak')
    self._phedexT1Mode = config.get('phedex t1 mode', 'disk').lower()
    self.onlyComplete = config.getBool('only complete sites', True)
    self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname)
    (self._datasetPath, self._url, self._datasetBlock) = utils.optSplit(datasetExpr, '@#')
    self._url = self._url or config.get('dbs instance', '')
    self._datasetBlock = self._datasetBlock or 'all'
    self.onlyValid = config.getBool('only valid', True)

def save_dataset(opts, provider):
    print('')
    blocks = provider.getBlocks()
    if opts.ordered:
        sort_inplace(blocks, key = itemgetter(DataProvider.Dataset, DataProvider.BlockName))
        for b in blocks:
            sort_inplace(b[DataProvider.FileList], key = itemgetter(DataProvider.URL))
    DataProvider.saveToFile(opts.save, blocks)
    print('Dataset information saved to ./%s' % opts.save)

def _check_lumi_filter(self, block, idx_runs, idx_lumi):
    lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector=False)
    if not lumi_filter:
        return
    if (self._lumi_strict == LumiMode.strict) and ((idx_runs is None) or (idx_lumi is None)):
        raise DatasetError('Strict lumi filter active but ' +
            'dataset %s does not provide lumi information!' % DataProvider.get_block_id(block))
    elif (self._lumi_strict == LumiMode.weak) and (idx_runs is None):
        raise DatasetError('Weak lumi filter active but ' +
            'dataset %s does not provide run information!' % DataProvider.get_block_id(block))

def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
    dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
    self._lumi_filter = dataset_config.get_lookup(['lumi filter', '%s lumi filter' % datasource_name],
        default={}, parser=parse_lumi_filter, strfun=str_lumi)
    if not self._lumi_filter.empty():
        config.set('%s processor' % datasource_name, 'LumiDataProcessor', '+=')
    DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
    # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
    self._lumi_query = dataset_config.get_bool(
        ['lumi metadata', '%s lumi metadata' % datasource_name],
        default=not self._lumi_filter.empty())
    config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
    # PhEDex blacklist: T1_*_Disk nodes allow user jobs - other T1 sites don't!
    self._phedex_filter = dataset_config.get_filter('phedex sites', '-* T1_*_Disk T2_* T3_*',
        default_matcher='BlackWhiteMatcher', default_filter='StrictListFilter')
    self._only_complete = dataset_config.get_bool('only complete sites', True)
    self._only_valid = dataset_config.get_bool('only valid', True)
    self._allow_phedex = dataset_config.get_bool('allow phedex', True)
    self._location_format = dataset_config.get_enum('location format',
        CMSLocationFormat, CMSLocationFormat.hostname)
    self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
    self._sitedb = SiteDB()
    dataset_expr_parts = split_opt(dataset_expr, '@#')
    (self._dataset_path, self._dataset_instance, self._dataset_block_selector) = dataset_expr_parts
    instance_default = dataset_config.get('dbs instance', '')
    self._dataset_instance = self._dataset_instance or instance_default
    if not self._dataset_instance:
        self._dataset_instance = 'prod/global'
    elif '/' not in self._dataset_instance:
        self._dataset_instance = 'prod/%s' % self._dataset_instance
    self._dataset_block_selector = self._dataset_block_selector or 'all'

def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld = True):
    ParameterSource.__init__(self)
    (self._dataDir, self._srcName, self._dataProvider, self._dataSplitter, self._part_proc) = \
        (dataDir, srcName, dataProvider, dataSplitter, dataProc)
    if not dataProvider:
        pass  # debug mode - used by scripts - disables resync
    elif os.path.exists(self.getDataPath('cache.dat')) and os.path.exists(self.getDataPath('map.tar')):
        self._dataSplitter.importPartitions(self.getDataPath('map.tar'))
    else:
        DataProvider.saveToFile(self.getDataPath('cache.dat'), self._dataProvider.getBlocks(silent = False))
        self._dataSplitter.splitDataset(self.getDataPath('map.tar'), self._dataProvider.getBlocks())
    self._maxN = self._dataSplitter.getMaxJobs()
    self._keepOld = keepOld

def create_dbs3_json_blocks(opts, dataset_blocks):
    dbs3_proto_block_iter = create_dbs3_proto_blocks(opts, dataset_blocks)
    for (block, block_dump, block_size, dataset_type) in dbs3_proto_block_iter:
        dataset = block[DataProvider.Dataset]
        try:
            primary_dataset, processed_dataset, data_tier = dataset[1:].split('/')
        except Exception:
            raise DatasetError('Dataset name %s is not a valid DBS name!' % dataset)
        # add primary dataset information
        block_dump['primds'] = {'primary_ds_type': dataset_type, 'primary_ds_name': primary_dataset}
        # add dataset information
        block_dump['dataset'] = {
            'dataset': dataset,
            'processed_ds_name': processed_dataset,
            'data_tier_name': data_tier,
            'physics_group_name': None,
            'dataset_access_type': 'VALID',
            'xtcrosssection': None,  # TODO: Add to metadata from FrameWorkJobReport, if possible!
        }
        # add block information
        site_db = CRIC()
        try:
            origin_site_name = site_db.se_to_cms_name(block[DataProvider.Locations][0])[0]
        except IndexError:
            clear_current_exception()
            origin_site_name = 'UNKNOWN'
        block_dump['block'] = {'block_name': DataProvider.get_block_id(block), 'block_size': block_size,
            'file_count': len(block[DataProvider.FileList]), 'origin_site_name': origin_site_name}
        if opts.do_close_blocks:
            block_dump['block']['open_for_writing'] = 0
        else:
            block_dump['block']['open_for_writing'] = 1
        # add acquisition_era, CRAB is important because of checks within DBS 3
        block_dump['acquisition_era'] = {'acquisition_era_name': 'CRAB', 'start_date': 0}
        # add processing_era
        block_dump['processing_era'] = {'processing_version': 1, 'description': 'grid-control'}
        yield validate_dbs3_json('blockBulk', block_dump)

def _displaySetup(self, dsPath, head):
    if os.path.exists(dsPath):
        nickNames = set()
        for block in DataProvider.loadFromFile(dsPath).getBlocks():
            nickNames.add(block[DataProvider.Nickname])
        utils.vprint('Mapping between nickname and other settings:\n', -1)
        report = []
        for nick in sorted(nickNames):
            lumi_filter_str = formatLumi(self._nmLumi.lookup(nick, '', is_selector=False))
            if len(lumi_filter_str) > 4:
                nice_lumi_filter = '%s ... %s (%d entries)' % (lumi_filter_str[0],
                    lumi_filter_str[-1], len(lumi_filter_str))
            else:
                nice_lumi_filter = str.join(', ', lumi_filter_str)
            config_files = self._nmCfg.lookup(nick, '', is_selector=False)
            tmp = {0: nick, 1: str.join(', ', imap(os.path.basename, config_files)), 2: nice_lumi_filter}
            lookupvars = {'DATASETNICK': nick}
            for src in self._pm.lookupSources:
                src.fillParameterInfo(None, lookupvars)
            tmp.update(lookupvars)
            report.append(tmp)
        utils.printTabular(head, report, 'cl')
        utils.vprint(level=-1)

def _resync(self):
    if self._data_provider:
        activity = Activity('Performing resync of datasource %r' % self._name)
        # Get old and new dataset information
        ds_old = DataProvider.loadFromFile(self._getDataPath('cache.dat')).getBlocks(show_stats = False)
        self._data_provider.clearCache()
        ds_new = self._data_provider.getBlocks(show_stats = False)
        self._data_provider.saveToFile(self._getDataPath('cache-new.dat'), ds_new)
        # Use old splitting information to synchronize with new dataset infos
        old_maxN = self._data_splitter.getMaxJobs()
        jobChanges = self._data_splitter.resyncMapping(self._getDataPath('map-new.tar'), ds_old, ds_new)
        activity.finish()
        if jobChanges is not None:
            # Move current splitting to backup and use the new splitting from now on
            def backupRename(old, cur, new):
                if self._keepOld:
                    os.rename(self._getDataPath(cur), self._getDataPath(old))
                os.rename(self._getDataPath(new), self._getDataPath(cur))
            backupRename('map-old-%d.tar' % time.time(), 'map.tar', 'map-new.tar')
            backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
            self._data_splitter.importPartitions(self._getDataPath('map.tar'))
            self._maxN = self._data_splitter.getMaxJobs()
            self._log.debug('Dataset resync finished: %d -> %d partitions', old_maxN, self._maxN)
            return (set(jobChanges[0]), set(jobChanges[1]), old_maxN != self._maxN)

def create_dbs3_proto_blocks(opts, dataset_blocks):
    for dataset in dataset_blocks:
        missing_info_blocks = []
        dataset_types = set()
        for block in dataset_blocks[dataset]:
            block_dump = {'dataset_conf_list': [], 'files': [], 'file_conf_list': [], 'file_parent_list': []}
            (block_size, block_dataset_types) = create_dbs3_json_files(opts, block, block_dump)
            if len(block_dataset_types) > 1:
                raise Exception('Data and MC files are mixed in block %s' % DataProvider.bName(block))
            elif len(block_dataset_types) == 1:
                yield (block, block_dump, block_size, block_dataset_types.pop())
            else:
                missing_info_blocks.append((block, block_dump, block_size))
            # collect dataset types in this dataset for blocks with missing type information
            dataset_types.update(block_dataset_types)
        if missing_info_blocks:
            if len(dataset_types) > 1:
                raise Exception('Data and MC files are mixed in dataset %s! ' % dataset +
                    'Unable to determine dataset type for blocks without type info')
            elif len(dataset_types) == 0:
                if not opts.datatype:
                    raise Exception('Please supply dataset type via --datatype!')
                dataset_type = opts.datatype
            else:
                dataset_type = dataset_types.pop()
            for (block, block_dump, block_size) in missing_info_blocks:
                yield (block, block_dump, block_size, dataset_type)

def create_dbs3_proto_blocks(opts, dataset_blocks):
    for dataset in dataset_blocks:
        missing_info_blocks = []
        dataset_types = set()
        for block in dataset_blocks[dataset]:
            block_dump = {'dataset_conf_list': [], 'files': [], 'file_conf_list': [], 'file_parent_list': []}
            (block_size, block_dataset_types) = create_dbs3_json_files(opts, block, block_dump)
            if len(block_dataset_types) > 1:
                raise Exception('Data and MC files are mixed in block %s' % DataProvider.get_block_id(block))
            elif len(block_dataset_types) == 1:
                yield (block, block_dump, block_size, block_dataset_types.pop())
            else:
                missing_info_blocks.append((block, block_dump, block_size))
            # collect dataset types in this dataset for blocks with missing type information
            dataset_types.update(block_dataset_types)
        if missing_info_blocks:
            if len(dataset_types) > 1:
                raise Exception(('Data and MC files are mixed in dataset %s! ' % dataset) +
                    'Unable to determine dataset type for blocks without type info')
            elif len(dataset_types) == 0:
                if not opts.datatype:
                    raise Exception('Please supply dataset type via --datatype!')
                dataset_type = opts.datatype
            else:
                dataset_type = dataset_types.pop()
            for (block, block_dump, block_size) in missing_info_blocks:
                yield (block, block_dump, block_size, dataset_type)

def _resync_psrc(self):
    activity = Activity('Performing resync of datasource %r' % self.get_datasource_name())
    # Get old and new dataset information
    provider_old = DataProvider.load_from_file(self._get_data_path('cache.dat'))
    block_list_old = provider_old.get_block_list_cached(show_stats=False)
    self._provider.clear_cache()
    block_list_new = self._provider.get_block_list_cached(show_stats=False)
    self._provider.save_to_file(self._get_data_path('cache-new.dat'), block_list_new)
    # Use old splitting information to synchronize with new dataset infos
    partition_len_old = self.get_parameter_len()
    partition_changes = self._resync_partitions(
        self._get_data_path('map-new.tar'), block_list_old, block_list_new)
    activity.finish()
    if partition_changes is not None:
        # Move current splitting to backup and use the new splitting from now on
        def _rename_with_backup(new, cur, old):
            if self._keep_old:
                os.rename(self._get_data_path(cur), self._get_data_path(old))
            os.rename(self._get_data_path(new), self._get_data_path(cur))
        _rename_with_backup('map-new.tar', 'map.tar', 'map-old-%d.tar' % time.time())
        _rename_with_backup('cache-new.dat', 'cache.dat', 'cache-old-%d.dat' % time.time())
        self._set_reader(DataSplitter.load_partitions(self._get_data_path('map.tar')))
        self._log.debug('Dataset resync finished: %d -> %d partitions', partition_len_old, self._len)
        (pnum_list_redo, pnum_list_disable) = partition_changes
        return (set(pnum_list_redo), set(pnum_list_disable), partition_len_old != self._len)

def _display_setup(self, dataset_fn, head):
    if os.path.exists(dataset_fn):
        nick_name_set = set()
        for block in DataProvider.load_from_file(dataset_fn).get_block_list_cached(show_stats=False):
            nick_name_set.add(block[DataProvider.Nickname])
        self._log.info('Mapping between nickname and other settings:')
        report = []

        def _get_dataset_lookup_psrc(psrc):
            is_lookup_cls = isinstance(psrc, ParameterSource.get_class('LookupBaseParameterSource'))
            return is_lookup_cls and ('DATASETNICK' in psrc.get_parameter_deps())
        ps_lookup = lfilter(_get_dataset_lookup_psrc, self._source.get_used_psrc_list())
        for nick in sorted(nick_name_set):
            tmp = {'DATASETNICK': nick}
            for src in ps_lookup:
                src.fill_parameter_content(None, tmp)
            tmp[1] = str.join(', ',
                imap(os.path.basename, self._nm_cfg.lookup(nick, '', is_selector=False)))
            tmp[2] = str_lumi_nice(self._nm_lumi.lookup(nick, '', is_selector=False))
            report.append(tmp)
        ConsoleTable.create(head, report, 'cl')

def _displaySetup(self, dsPath, head):
    if os.path.exists(dsPath):
        nickNames = set()
        for block in DataProvider.loadFromFile(dsPath).getBlocks():
            nickNames.add(block[DataProvider.Nickname])
        log = logging.getLogger('user')
        log.info('Mapping between nickname and other settings:')
        report = []
        (ps_basic, ps_nested) = self._pfactory.getLookupSources()
        if ps_nested:
            log.info('This list doesn\'t show "nickname constants" with multiple values!')
        for nick in sorted(nickNames):
            tmp = {'DATASETNICK': nick}
            for src in ps_basic:
                src.fillParameterInfo(None, tmp)
            tmp[1] = str.join(', ',
                imap(os.path.basename, self._nmCfg.lookup(nick, '', is_selector=False)))
            tmp[2] = formatLumiNice(self._nmLumi.lookup(nick, '', is_selector=False))
            report.append(tmp)
        utils.printTabular(head, report, 'cl')

def resync(self):
    (result_redo, result_disable, result_sizeChange) = ParameterSource.resync(self)
    if self.resyncEnabled() and self._dataProvider:
        # Get old and new dataset information
        old = DataProvider.loadFromFile(self.getDataPath('cache.dat')).getBlocks()
        self._dataProvider.clearCache()
        new = self._dataProvider.getBlocks()
        self._dataProvider.saveToFile(self.getDataPath('cache-new.dat'), new)
        # Use old splitting information to synchronize with new dataset infos
        jobChanges = self._dataSplitter.resyncMapping(self.getDataPath('map-new.tar'), old, new)
        if jobChanges:
            # Move current splitting to backup and use the new splitting from now on
            def backupRename(old, cur, new):
                if self._keepOld:
                    os.rename(self.getDataPath(cur), self.getDataPath(old))
                os.rename(self.getDataPath(new), self.getDataPath(cur))
            backupRename('map-old-%d.tar' % time.time(), 'map.tar', 'map-new.tar')
            backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
            old_maxN = self._dataSplitter.getMaxJobs()
            self._dataSplitter.importPartitions(self.getDataPath('map.tar'))
            self._maxN = self._dataSplitter.getMaxJobs()
            result_redo.update(jobChanges[0])
            result_disable.update(jobChanges[1])
            result_sizeChange = result_sizeChange or (old_maxN != self._maxN)
        self.resyncFinished()
    return (result_redo, result_disable, result_sizeChange)

def getEntries(self, path, metadata, events, seList, objStore):
    datacachePath = os.path.join(objStore.get('GC_WORKDIR', ''), 'datacache.dat')
    source = utils.QM((self._source == '') and os.path.exists(datacachePath), datacachePath, self._source)
    if source and (source not in self._lfnMap):
        pSource = DataProvider.createInstance('ListProvider', createConfig(), source)
        for (n, fl) in imap(lambda b: (b[DataProvider.Dataset], b[DataProvider.FileList]), pSource.getBlocks()):
            self._lfnMap.setdefault(source, {}).update(
                dict(imap(lambda fi: (self.lfnTrans(fi[DataProvider.URL]), n), fl)))
    pList = set()
    for key in ifilter(lambda k: k in metadata, self._parentKeys):
        pList.update(imap(lambda pPath: self._lfnMap.get(source, {}).get(self.lfnTrans(pPath)), metadata[key]))
    metadata['PARENT_PATH'] = lfilter(identity, pList)
    yield (path, metadata, events, seList, objStore)

def setupJobParameters(self, config, pm):
    config = config.addSections(['dataset']).addTags([self])
    self.dataSplitter = None
    self.dataRefresh = None
    self.dataset = config.get('dataset', '').strip()
    if self.dataset == '':
        return
    config.set('se output pattern', '@NICK@_job_@MY_JOBID@_@X@', override = False)
    config.set('default lookup', 'DATASETNICK', override = False)
    defaultProvider = config.get('dataset provider', 'ListProvider')
    dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
    splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
    splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
    self.dataSplitter = splitterClass(config)
    self.checkSE = config.getBool('dataset storage check', True, onChange = None)
    # Create and register dataset parameter plugin
    paramSource = DataParameterSource(config.getWorkPath(), 'data',
        dataProvider, self.dataSplitter, self.initDataProcessor())
    DataParameterSource.datasetsAvailable['data'] = paramSource
    # Select dataset refresh rate
    self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
    if self.dataRefresh > 0:
        paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
        utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
    else:
        paramSource.resyncSetup(interval = 0)

    def externalRefresh(sig, frame):
        paramSource.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)
    if self.dataSplitter.getMaxJobs() == 0:
        raise UserError('There are no events to process')

def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
    changeTrigger = triggerResync(['datasets', 'parameters'])
    self._lumi_filter = config.getLookup('lumi filter', {}, parser=parseLumiFilter,
        strfun=strLumi, onChange=changeTrigger)
    if not self._lumi_filter.empty():
        config.set('dataset processor', 'LumiDataProcessor', '+=')
    DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
    # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
    self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange=changeTrigger)
    # PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1 sites don't!
    self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
        defaultMatcher='blackwhite', defaultFilter='weak', onChange=changeTrigger)
    self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
        defaultMatcher='blackwhite', defaultFilter='weak', onChange=changeTrigger)
    self._phedexT1Mode = config.getEnum('phedex t1 mode', PhedexT1Mode, PhedexT1Mode.disk, onChange=changeTrigger)
    self.onlyComplete = config.getBool('only complete sites', True, onChange=changeTrigger)
    self._locationFormat = config.getEnum('location format', CMSLocationFormat,
        CMSLocationFormat.hostname, onChange=changeTrigger)
    self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
    (self._datasetPath, self._url, self._datasetBlock) = optSplit(datasetExpr, '@#')
    self._url = self._url or config.get('dbs instance', '')
    self._datasetBlock = self._datasetBlock or 'all'
    self.onlyValid = config.getBool('only valid', True, onChange=changeTrigger)

def __init__(self, config, datasetExpr, datasetNick=None):
    self._changeTrigger = triggerResync(['datasets', 'parameters'])
    self._lumi_filter = config.getLookup('lumi filter', {}, parser=parseLumiFilter,
        strfun=strLumi, onChange=self._changeTrigger)
    if not self._lumi_filter.empty():
        config.set('dataset processor', 'LumiDataProcessor', '+=')
    DataProvider.__init__(self, config, datasetExpr, datasetNick)
    # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
    self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(),
        onChange=self._changeTrigger)
    config.set('phedex sites matcher mode', 'shell', '?=')
    # PhEDex blacklist: T1_*_Disk nodes allow user jobs - other T1 sites don't!
    self._phedexFilter = config.getFilter('phedex sites', '-* T1_*_Disk T2_* T3_*',
        defaultMatcher='blackwhite', defaultFilter='strict', onChange=self._changeTrigger)
    self._onlyComplete = config.getBool('only complete sites', True, onChange=self._changeTrigger)
    self._locationFormat = config.getEnum('location format', CMSLocationFormat,
        CMSLocationFormat.hostname, onChange=self._changeTrigger)
    self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
    self._sitedb = SiteDB()
    (self._datasetPath, self._datasetInstance, self._datasetBlock) = optSplit(datasetExpr, '@#')
    instance_default = config.get('dbs instance', '', onChange=self._changeTrigger)
    self._datasetInstance = self._datasetInstance or instance_default
    if not self._datasetInstance:
        self._datasetInstance = 'prod/global'
    elif '/' not in self._datasetInstance:
        self._datasetInstance = 'prod/%s' % self._datasetInstance
    self._datasetBlock = self._datasetBlock or 'all'
    self.onlyValid = config.getBool('only valid', True, onChange=self._changeTrigger)

def _read_plfnp_map(self, config, parent_dataset_expr):
    if parent_dataset_expr and (parent_dataset_expr not in self._plfnp2pdn_cache):
        # read parent source and fill lfnMap with parent_lfn_parts -> parent dataset name mapping
        map_plfnp2pdn = self._plfnp2pdn_cache.setdefault(parent_dataset_expr, {})
        for block in DataProvider.iter_blocks_from_expr(self._empty_config, parent_dataset_expr):
            for fi in block[DataProvider.FileList]:
                map_plfnp2pdn[self._get_lfnp(fi[DataProvider.URL])] = block[DataProvider.Dataset]
    return self._plfnp2pdn_cache.get(parent_dataset_expr, {})  # return cached mapping

def __init__(self, config, name):
    head = [(0, "Nickname")]
    # Mapping between nickname and config files:
    cfgList = config.get("nickname config", "")
    self.nmCfg = config.getDict("nickname config", {},
        parser=lambda x: map(str.strip, x.split(",")), str=lambda x: str.join(",", x))[0]
    if cfgList:
        if "config file" in config.getOptions():
            raise ConfigError("Please use 'nickname config' instead of 'config file'")
        allConfigFiles = utils.flatten(self.nmCfg.values())
        config.set("config file", str.join("\n", allConfigFiles))
        head.append((1, "Config file"))
    # Mapping between nickname and constants:
    self.nmCName = map(str.strip, config.get("nickname constants", "").split())
    self.nmConst = {}
    for var in self.nmCName:
        tmp = config.getDict(var, {})[0]
        for (nick, value) in tmp.items():
            if value:
                self.nmConst.setdefault(nick, {})[var] = value
            else:
                self.nmConst.setdefault(nick, {})[var] = ""
        head.append((var, var))
    # Mapping between nickname and lumi filter:
    if "lumi filter" in config.getOptions():
        raise ConfigError("Please use 'nickname lumi filter' instead of 'lumi filter'")
    lumiParse = lambda x: formatLumi(parseLumiFilter(x))
    self.nmLumi = config.getDict("nickname lumi filter", {}, parser=lumiParse)[0]
    if self.nmLumi:
        for dataset in config.get("dataset", "").splitlines():
            (datasetNick, datasetProvider, datasetExpr) = DataProvider.parseDatasetExpr(config, dataset, None)
            config.set("dataset %s" % datasetNick, "lumi filter",
                str.join(",", utils.flatten(fromNM(self.nmLumi, datasetNick, []))))
        config.set("lumi filter", str.join(",", self.nmLumi.get(None, [])))
        head.append((2, "Lumi filter"))
    utils.vprint("Mapping between nickname and other settings:\n", -1)

    def report():
        for nick in sorted(set(self.nmCfg.keys() + self.nmConst.keys() + self.nmLumi.keys())):
            tmp = {
                0: nick,
                1: str.join(", ", map(os.path.basename, self.nmCfg.get(nick, ""))),
                2: self.displayLumi(self.nmLumi.get(nick, "")),
            }
            yield utils.mergeDicts([tmp, self.nmConst.get(nick, {})])
    utils.printTabular(head, report(), "cl")
    utils.vprint(level=-1)
    CMSSW.__init__(self, config, name)

def dataset_show_diff(options):
    if len(options.args) != 2:
        options.parser.exit_with_usage(options.parser.usage('data'))
    provider_a = DataProvider.load_from_file(options.args[0])
    provider_b = DataProvider.load_from_file(options.args[1])
    block_resync_tuple = DataProvider.resync_blocks(
        provider_a.get_block_list_cached(show_stats=False),
        provider_b.get_block_list_cached(show_stats=False))
    (block_list_added, block_list_missing, block_list_matching) = block_resync_tuple

    def _dataset_iter_matching_blocks():
        for (block_old, block_new, _, _) in block_list_matching:
            def _format_change(old, new):
                if old != new:
                    return '%s -> %s' % (old, new)
                return old
            block_old[DataProvider.NFiles] = _format_change(
                len(block_old.get(DataProvider.FileList, [])),
                len(block_new.get(DataProvider.FileList, [])))
            block_old[DataProvider.NEntries] = _format_change(
                block_old[DataProvider.NEntries], block_new[DataProvider.NEntries])
            yield block_old

    header_list = [(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block'),
        (DataProvider.NFiles, '#Files'), (DataProvider.NEntries, '#Entries')]
    if block_list_added:
        ConsoleTable.create(header_list, dataset_iter_blocks(block_list_added), title='Added blocks')
    if block_list_missing:
        ConsoleTable.create(header_list, dataset_iter_blocks(block_list_missing), title='Removed blocks')
    if block_list_matching:
        ConsoleTable.create(header_list, _dataset_iter_matching_blocks(), title='Matching blocks')

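# A minimal usage sketch of the same diff primitives, without the command line plumbing.
# Assumptions: DataProvider is imported as in the snippets above, and 'old.dat' / 'new.dat'
# are hypothetical dataset dump files previously written via DataProvider.save_to_file.
def sketch_count_block_changes(fn_old='old.dat', fn_new='new.dat'):
    provider_old = DataProvider.load_from_file(fn_old)
    provider_new = DataProvider.load_from_file(fn_new)
    (added, missing, matching) = DataProvider.resync_blocks(
        provider_old.get_block_list_cached(show_stats=False),
        provider_new.get_block_list_cached(show_stats=False))
    # return the number of added, removed and still matching blocks between the two dumps
    return (len(added), len(missing), len(matching))
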
def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
    self._lumi_filter = parseLumiFilter(config.get('lumi filter', ''))
    if self._lumi_filter:
        config.set('dataset processor', 'LumiDataProcessor', '+=')
    DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
    # PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1 sites don't!
    self._lumi_query = config.getBool('lumi metadata', self._lumi_filter != [])
    self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
        defaultMatcher = 'blackwhite', defaultFilter = 'weak')
    self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
        defaultMatcher = 'blackwhite', defaultFilter = 'weak')
    self._phedexT1Mode = config.get('phedex t1 mode', 'disk').lower()
    self.onlyComplete = config.getBool('only complete sites', True)
    self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname)
    (self._datasetPath, self._url, self._datasetBlock) = utils.optSplit(datasetExpr, '@#')
    self._url = self._url or config.get('dbs instance', '')
    self._datasetBlock = self._datasetBlock or 'all'
    self.onlyValid = config.getBool('only valid', True)

def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, repository, keepOld=True):
    LimitedResyncParameterSource.__init__(self)
    (self._dn, self._name, self._data_provider, self._data_splitter, self._part_proc, self._keepOld) = \
        (dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld)
    repository['dataset:%s' % srcName] = self
    self.resyncSetup(interval=-1)
    if not dataProvider:  # debug mode - used by scripts - disables resync
        self._maxN = self._data_splitter.getMaxJobs()
        return
    # look for aborted resyncs - and try to restore old state if possible
    if self._existsDataPath('cache.dat.resync') and self._existsDataPath('map.tar.resync'):
        utils.renameFile(self._getDataPath('cache.dat.resync'), self._getDataPath('cache.dat'))
        utils.renameFile(self._getDataPath('map.tar.resync'), self._getDataPath('map.tar'))
    elif self._existsDataPath('cache.dat.resync') or self._existsDataPath('map.tar.resync'):
        raise DatasetError('Found broken resync state')
    if self._existsDataPath('cache.dat') and self._existsDataPath('map.tar'):
        self._data_splitter.importPartitions(self._getDataPath('map.tar'))
    else:
        DataProvider.saveToFile(self._getDataPath('cache.dat'),
            self._data_provider.getBlocks(show_stats=False))
        self._data_splitter.splitDataset(self._getDataPath('map.tar'),
            self._data_provider.getBlocks(show_stats=False))
    self._maxN = self._data_splitter.getMaxJobs()

def getEntries(self, path, metadata, events, seList, objStore):
    datacachePath = os.path.join(objStore.get('GC_WORKDIR', ''), 'datacache.dat')
    source = utils.QM((self.source == '') and os.path.exists(datacachePath), datacachePath, self.source)
    if source and (source not in self.lfnMap):
        pSource = DataProvider.create(createConfigFactory().getConfig(), source, 'ListProvider')
        for (n, fl) in map(lambda b: (b[DataProvider.Dataset], b[DataProvider.FileList]), pSource.getBlocks()):
            self.lfnMap.setdefault(source, {}).update(
                dict(map(lambda fi: (self.lfnTrans(fi[DataProvider.URL]), n), fl)))
    pList = set()
    for key in filter(lambda k: k in metadata, self.parentKeys):
        pList.update(map(lambda pPath: self.lfnMap.get(source, {}).get(self.lfnTrans(pPath)), metadata[key]))
    metadata['PARENT_PATH'] = filter(lambda x: x, pList)
    yield (path, metadata, events, seList, objStore)

def dataset_show_removed(options):
    if len(options.args) < 2:
        options.parser.exit_with_usage(options.parser.usage('data'))
    block_list_missing = []
    provider_old = DataProvider.load_from_file(options.args[0])
    for dataset_fn in options.args[1:]:
        provider_new = DataProvider.load_from_file(dataset_fn)
        block_resync_tuple = DataProvider.resync_blocks(
            provider_old.get_block_list_cached(show_stats=False),
            provider_new.get_block_list_cached(show_stats=False))
        for block in block_resync_tuple[1]:  # iterate missing block list
            tmp = dict(block)
            tmp[DataProvider.RemovedIn] = dataset_fn
            block_list_missing.append(tmp)
        provider_old = provider_new
    if block_list_missing:
        ConsoleTable.create([(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block'),
            (DataProvider.NFiles, '#Files'), (DataProvider.NEntries, '#Entries'),
            (DataProvider.RemovedIn, 'Removed in file')],
            dataset_iter_blocks(block_list_missing), title='Removed blocks')

def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
    changeTrigger = triggerResync(['datasets', 'parameters'])
    self._lumi_filter = config.getLookup('lumi filter', {}, parser = parseLumiFilter,
        strfun = strLumi, onChange = changeTrigger)
    if not self._lumi_filter.empty():
        config.set('dataset processor', 'LumiDataProcessor', '+=')
    DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
    # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
    self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange = changeTrigger)
    # PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1 sites don't!
    self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
        defaultMatcher = 'blackwhite', defaultFilter = 'weak', onChange = changeTrigger)
    self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
        defaultMatcher = 'blackwhite', defaultFilter = 'weak', onChange = changeTrigger)
    self._phedexT1Mode = config.getEnum('phedex t1 mode', PhedexT1Mode, PhedexT1Mode.disk, onChange = changeTrigger)
    self.onlyComplete = config.getBool('only complete sites', True, onChange = changeTrigger)
    self._locationFormat = config.getEnum('location format', CMSLocationFormat,
        CMSLocationFormat.hostname, onChange = changeTrigger)
    self._pjrc = JSONRestClient(url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
    (self._datasetPath, self._url, self._datasetBlock) = optSplit(datasetExpr, '@#')
    self._url = self._url or config.get('dbs instance', '')
    self._datasetBlock = self._datasetBlock or 'all'
    self.onlyValid = config.getBool('only valid', True, onChange = changeTrigger)

def _read_plfnp_map(self, config, parent_dataset_expr):
    if parent_dataset_expr and (parent_dataset_expr not in self._plfnp2pdn_cache):
        # read parent source and fill lfnMap with parent_lfn_parts -> parent dataset name mapping
        map_plfnp2pdn = self._plfnp2pdn_cache.setdefault(parent_dataset_expr, {})
        for block in DataProvider.iter_blocks_from_expr(self._empty_config, parent_dataset_expr):
            for fi in block[DataProvider.FileList]:
                map_plfnp2pdn[self._get_lfnp(fi[DataProvider.URL])] = block[DataProvider.Dataset]
    return self._plfnp2pdn_cache.get(parent_dataset_expr, {})  # return cached mapping

def discover_blocks(options):
    # Get work directory, create dbs dump directory
    if os.path.isdir(options.args[0]):
        workDir = os.path.abspath(os.path.normpath(options.args[0]))
    else:
        workDir = getConfig(configFile = options.args[0]).getWorkPath()
    if not options.opts.tempdir:
        options.opts.tempdir = os.path.join(workDir, 'dbs')
    if not os.path.exists(options.opts.tempdir):
        os.mkdir(options.opts.tempdir)
    # get provider with dataset information
    if options.opts.input_file:
        provider = DataProvider.createInstance('ListProvider', getConfig(), options.opts.input_file, None)
    else:
        config = getConfig(configDict = {'dataset': options.config_dict})
        provider = DataProvider.createInstance('DBSInfoProvider', config, options.args[0], None)
    blocks = provider.getBlocks(show_stats = False)
    DataProvider.saveToFile(os.path.join(options.opts.tempdir, 'dbs.dat'), blocks)
    if options.opts.discovery:
        sys.exit(os.EX_OK)
    return blocks

def _init_reader(self):
    # look for aborted inits / resyncs - and try to restore old state if possible
    if self._exists_data_path('map.tar.resync') and self._exists_data_path('cache.dat.resync'):
        rename_file(self._get_data_path('cache.dat.resync'), self._get_data_path('cache.dat'))
        rename_file(self._get_data_path('map.tar.resync'), self._get_data_path('map.tar'))
    elif self._exists_data_path('map.tar.resync') or self._exists_data_path('cache.dat.resync'):
        raise DatasetError('Found broken dataset partition resync state in work directory')
    if self._exists_data_path('map.tar') and not self._exists_data_path('cache.dat'):
        raise DatasetError('Found broken dataset partition in work directory')
    elif not self._exists_data_path('map.tar'):
        # create initial partition map file
        if not self._exists_data_path('cache.dat'):
            provider = self._provider
        else:
            provider = DataProvider.load_from_file(self._get_data_path('cache.dat'))
        block_iter = DataProvider.save_to_file_iter(self._get_data_path('cache.dat.init'),
            provider.get_block_list_cached(show_stats=True))
        partition_iter = self._splitter.split_partitions(block_iter)
        DataSplitter.save_partitions(self._get_data_path('map.tar.init'), partition_iter)
        rename_file(self._get_data_path('cache.dat.init'), self._get_data_path('cache.dat'))
        rename_file(self._get_data_path('map.tar.init'), self._get_data_path('map.tar'))
    return DataSplitter.load_partitions(self._get_data_path('map.tar'))

def _init_reader(self):
    # look for aborted inits / resyncs - and try to restore old state if possible
    if self._exists_data_path('map.tar.resync') and self._exists_data_path('cache.dat.resync'):
        rename_file(self._get_data_path('cache.dat.resync'), self._get_data_path('cache.dat'))
        rename_file(self._get_data_path('map.tar.resync'), self._get_data_path('map.tar'))
    elif self._exists_data_path('map.tar.resync') or self._exists_data_path('cache.dat.resync'):
        raise DatasetError('Found broken dataset partition resync state in work directory')
    if self._exists_data_path('map.tar') and not self._exists_data_path('cache.dat'):
        raise DatasetError('Found broken dataset partition in work directory')
    elif not self._exists_data_path('map.tar'):
        # create initial partition map file
        if not self._exists_data_path('cache.dat'):
            provider = self._provider
        else:
            provider = DataProvider.load_from_file(self._get_data_path('cache.dat'))
        block_iter = DataProvider.save_to_file_iter(self._get_data_path('cache.dat.init'),
            provider.get_block_list_cached(show_stats=True))
        partition_iter = self._splitter.split_partitions(block_iter)
        DataSplitter.save_partitions(self._get_data_path('map.tar.init'), partition_iter)
        rename_file(self._get_data_path('cache.dat.init'), self._get_data_path('cache.dat'))
        rename_file(self._get_data_path('map.tar.init'), self._get_data_path('map.tar'))
    return DataSplitter.load_partitions(self._get_data_path('map.tar'))

def _readParents(self, config, source):
    # read parent source and fill lfnMap with parent_lfn_refs -> parent dataset name mapping
    if source and (source not in self._lfnMapCache):
        block_iter = DataProvider.getBlocksFromExpr(config, source)
        for (dsName, fl) in imap(lambda b: (b[DataProvider.Dataset], b[DataProvider.FileList]), block_iter):
            self._lfnMapCache.setdefault(source, {}).update(
                dict(imap(lambda fi: (self._lfnTrans(fi[DataProvider.URL]), dsName), fl)))
    return self._lfnMapCache.get(source, {})

def list_config_entries(dataset_list, block_list, opts, provider):
    dataset_config_str_list = []
    max_nick_len = 0
    for ds_info in merge_blocks(block_list):
        max_nick_len = max(max_nick_len, len(ds_info.get(DataProvider.Nickname, '')))
    for ds_info in merge_blocks(block_list):
        nickname = ds_info.get(DataProvider.Nickname, '').rjust(max_nick_len)
        provider_name_list = DataProvider.get_class(ds_info[DataProvider.Provider]).get_class_name_list()
        provider_name = sorted(provider_name_list, key=len)[0]
        query = ds_info[DataProvider.Query]
        if provider_name == 'list':
            query += ' %% %s' % ds_info[DataProvider.Dataset]
        dataset_config_str = '\t%s : %s : %s' % (nickname, provider_name, query)
        dataset_config_str_list.append(dataset_config_str)
    logging.getLogger('script').info('\ndataset =\n' + str.join('\n', dataset_config_str_list) + '\n')

def __init__(self, config, datasetExpr, datasetNick, datasetID = 0):
    DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
    # PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1 sites don't!
    self.phedexBL = config.getList('phedex sites', ['-T3_US_FNALLPC'])
    self.phedexWL = config.getList('phedex t1 accept', ['T1_DE_KIT', 'T1_US_FNAL'])
    self.phedexT1 = config.get('phedex t1 mode', 'disk').lower()
    self.onlyComplete = config.getBool('only complete sites', True)
    self.locationFormat = config.get('location format', 'hostname').lower()  # hostname or sitedb
    if self.locationFormat not in ['hostname', 'sitedb', 'both']:
        raise ConfigError('Invalid location format: %s' % self.locationFormat)
    (self.datasetPath, self.url, self.datasetBlock) = utils.optSplit(datasetExpr, '@#')
    self.url = QM(self.url, self.url, config.get('dbs instance', ''))
    self.datasetBlock = QM(self.datasetBlock, self.datasetBlock, 'all')
    self.includeLumi = config.getBool('keep lumi metadata', False)
    self.onlyValid = config.getBool('only valid', True)
    self.checkUnique = config.getBool('check unique', True)
    # This works in tandem with active task module (cmssw.py supports only [section] lumi filter!)
    self.selectedLumis = parseLumiFilter(config.get('lumi filter', ''))
    if self.selectedLumis:
        utils.vprint('Runs/lumi section filter enabled! (%d entries)' % len(self.selectedLumis), -1, once = True)
        utils.vprint('\tThe following runs and lumi sections are selected:', 1, once = True)
        utils.vprint('\t' + utils.wrapList(formatLumi(self.selectedLumis), 65, ',\n\t'), 1, once = True)

def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld=True):
    ParameterSource.__init__(self)
    (self._dataDir, self._srcName, self._dataProvider, self._dataSplitter, self._part_proc) = \
        (dataDir, srcName, dataProvider, dataSplitter, dataProc)
    if not dataProvider:
        pass  # debug mode - used by scripts - disables resync
    elif os.path.exists(self.getDataPath('cache.dat')) and os.path.exists(self.getDataPath('map.tar')):
        self._dataSplitter.importPartitions(self.getDataPath('map.tar'))
    else:
        DataProvider.saveToFile(self.getDataPath('cache.dat'), self._dataProvider.getBlocks(silent=False))
        self._dataSplitter.splitDataset(self.getDataPath('map.tar'), self._dataProvider.getBlocks())
    self._maxN = self._dataSplitter.getMaxJobs()
    self._keepOld = keepOld

def setupJobParameters(self, config, pm):
    config = config.changeView(viewClass = TaggedConfigView, addSections = ['dataset'], addTags = [self])
    self.dataSplitter = None
    self.dataRefresh = None
    self._forceRefresh = config.getState('resync', detail = 'dataset', default = False)

    def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
        if (old_obj == '') and (cur_obj != ''):
            raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
        self._forceRefresh = True
        return cur_obj
    self.dataset = config.get('dataset', '', onChange = userRefresh).strip()
    if self.dataset == '':
        return
    config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
    config.set('default lookup', 'DATASETNICK')
    defaultProvider = config.get('dataset provider', 'ListProvider')
    dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
    splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
    splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
    self.dataSplitter = splitterClass(config)
    # Create and register dataset parameter source
    paramSplitProcessor = config.getCompositePlugin('dataset processor',
        'BasicDataSplitProcessor SECheckSplitProcessor', 'MultiDataSplitProcessor',
        cls = DataSplitProcessor).getInstance(config)
    paramSource = DataParameterSource(config.getWorkPath(), 'data',
        dataProvider, self.dataSplitter, paramSplitProcessor)
    DataParameterSource.datasetsAvailable['data'] = paramSource
    # Select dataset refresh rate
    self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
    if self.dataRefresh > 0:
        paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
        utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
    else:
        paramSource.resyncSetup(interval = 0)
    if self._forceRefresh:
        paramSource.resyncSetup(force = True)

    def externalRefresh(sig, frame):
        paramSource.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)
    if self.dataSplitter.getMaxJobs() == 0:
        raise UserError('There are no events to process')

def _displaySetup(self, dsPath, head):
    if os.path.exists(dsPath):
        nickNames = set()
        for block in DataProvider.loadFromFile(dsPath).getBlocks(show_stats = False):
            nickNames.add(block[DataProvider.Nickname])
        log = logging.getLogger('user')
        log.info('Mapping between nickname and other settings:')
        report = []
        (ps_basic, ps_nested) = self._pfactory.getLookupSources()
        if ps_nested:
            log.info('This list doesn\'t show "nickname constants" with multiple values!')
        for nick in sorted(nickNames):
            tmp = {'DATASETNICK': nick}
            for src in ps_basic:
                src.fillParameterInfo(None, tmp)
            tmp[1] = str.join(', ',
                imap(os.path.basename, self._nmCfg.lookup(nick, '', is_selector = False)))
            tmp[2] = formatLumiNice(self._nmLumi.lookup(nick, '', is_selector = False))
            report.append(tmp)
        utils.printTabular(head, report, 'cl')

def list_config_entries(dataset_list, block_list, opts, provider):
    dataset_config_str_list = []
    max_nick_len = 0
    for ds_info in merge_blocks(block_list):
        max_nick_len = max(max_nick_len, len(ds_info.get(DataProvider.Nickname, '')))
    for ds_info in merge_blocks(block_list):
        nickname = ds_info.get(DataProvider.Nickname, '').rjust(max_nick_len)
        provider_name_list = DataProvider.get_class(ds_info[DataProvider.Provider]).get_class_name_list()
        provider_name = sorted(provider_name_list, key=len)[0]
        query = ds_info[DataProvider.Query]
        if provider_name == 'list':
            query += ' %% %s' % ds_info[DataProvider.Dataset]
        dataset_config_str = '\t%s : %s : %s' % (nickname, provider_name, query)
        dataset_config_str_list.append(dataset_config_str)
    logging.getLogger('script').info('\ndataset =\n' + str.join('\n', dataset_config_str_list) + '\n')

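# For illustration only: the snippet above emits one "nick : provider : query" entry per
# merged dataset, with nicknames right-justified and a "% <dataset>" suffix appended for
# 'list' providers. All names below are made up; the provider aliases depend on the
# registered DataProvider classes and are not guaranteed to match these exactly.
#
#   dataset =
#        mc_ttbar : list : /path/to/datacache.dat % /TTbar/Sample/USER
#       data_muon : list : /path/to/datacache.dat % /SingleMuon/Run/RECO
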
def create_dbs3_json_blocks(opts, dataset_blocks):
    dbs3_proto_block_iter = create_dbs3_proto_blocks(opts, dataset_blocks)
    for (block, block_dump, block_size, dataset_type) in dbs3_proto_block_iter:
        dataset = block[DataProvider.Dataset]
        try:
            primary_dataset, processed_dataset, data_tier = dataset[1:].split('/')
        except Exception:
            raise DatasetError('Dataset name %s is not a valid DBS name!' % dataset)
        # add primary dataset information
        block_dump['primds'] = {'primary_ds_type': dataset_type, 'primary_ds_name': primary_dataset}
        # add dataset information
        block_dump['dataset'] = {
            'dataset': dataset,
            'processed_ds_name': processed_dataset,
            'data_tier_name': data_tier,
            'physics_group_name': None,
            'dataset_access_type': 'VALID',
            'xtcrosssection': None,  # TODO: Add to metadata from FrameWorkJobReport, if possible!
        }
        # add block information
        site_db = SiteDB()
        try:
            origin_site_name = site_db.se_to_cms_name(block[DataProvider.Locations][0])[0]
        except IndexError:
            clear_current_exception()
            origin_site_name = 'UNKNOWN'
        block_dump['block'] = {'block_name': DataProvider.get_block_id(block), 'block_size': block_size,
            'file_count': len(block[DataProvider.FileList]), 'origin_site_name': origin_site_name}
        if opts.do_close_blocks:
            block_dump['block']['open_for_writing'] = 0
        else:
            block_dump['block']['open_for_writing'] = 1
        # add acquisition_era, CRAB is important because of checks within DBS 3
        block_dump['acquisition_era'] = {'acquisition_era_name': 'CRAB', 'start_date': 0}
        # add processing_era
        block_dump['processing_era'] = {'processing_version': 1, 'description': 'grid-control'}
        yield validate_dbs3_json('blockBulk', block_dump)

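# Sketch of how the generator above might be driven. Assumptions: 'dataset_blocks' is a
# mapping of dataset path -> list of blocks (as implied by create_dbs3_proto_blocks),
# 'opts' carries the 'datatype' and 'do_close_blocks' attributes used above, and
# 'submit_to_dbs' is a hypothetical callable standing in for the actual DBS 3 insertion step.
def sketch_upload_blocks(opts, dataset_blocks, submit_to_dbs):
    uploaded = 0
    for block_dump in create_dbs3_json_blocks(opts, dataset_blocks):
        submit_to_dbs(block_dump)  # each item is an already validated 'blockBulk' dictionary
        uploaded += 1
    return uploaded
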
def _displaySetup(self, dsPath, head):
    if os.path.exists(dsPath):
        nickNames = set()
        for block in DataProvider.loadFromFile(dsPath).getBlocks():
            nickNames.add(block[DataProvider.Nickname])
        utils.vprint('Mapping between nickname and other settings:\n', -1)
        report = []
        for nick in sorted(nickNames):
            lumi_filter_str = formatLumi(self._nmLumi.lookup(nick, '', is_selector = False))
            if len(lumi_filter_str) > 4:
                nice_lumi_filter = '%s ... %s (%d entries)' % (lumi_filter_str[0],
                    lumi_filter_str[-1], len(lumi_filter_str))
            else:
                nice_lumi_filter = str.join(', ', lumi_filter_str)
            config_files = self._nmCfg.lookup(nick, '', is_selector = False)
            tmp = {0: nick, 1: str.join(', ', imap(os.path.basename, config_files)), 2: nice_lumi_filter}
            lookupvars = {'DATASETNICK': nick}
            for src in self._pm.lookupSources:
                src.fillParameterInfo(None, lookupvars)
            tmp.update(lookupvars)
            report.append(tmp)
        utils.printTabular(head, report, 'cl')
        utils.vprint(level = -1)

def _resync(self):
    if self._data_provider:
        activity = Activity('Performing resync of datasource %r' % self._name)
        # Get old and new dataset information
        ds_old = DataProvider.loadFromFile(self._getDataPath('cache.dat')).getBlocks(show_stats=False)
        self._data_provider.clearCache()
        ds_new = self._data_provider.getBlocks(show_stats=False)
        self._data_provider.saveToFile(self._getDataPath('cache-new.dat'), ds_new)
        # Use old splitting information to synchronize with new dataset infos
        old_maxN = self._data_splitter.getMaxJobs()
        jobChanges = self._data_splitter.resyncMapping(self._getDataPath('map-new.tar'), ds_old, ds_new)
        activity.finish()
        if jobChanges is not None:
            # Move current splitting to backup and use the new splitting from now on
            def backupRename(old, cur, new):
                if self._keepOld:
                    os.rename(self._getDataPath(cur), self._getDataPath(old))
                os.rename(self._getDataPath(new), self._getDataPath(cur))
            backupRename('map-old-%d.tar' % time.time(), 'map.tar', 'map-new.tar')
            backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
            self._data_splitter.importPartitions(self._getDataPath('map.tar'))
            self._maxN = self._data_splitter.getMaxJobs()
            self._log.debug('Dataset resync finished: %d -> %d partitions', old_maxN, self._maxN)
            return (set(jobChanges[0]), set(jobChanges[1]), old_maxN != self._maxN)

def resync(self):
    (result_redo, result_disable, result_sizeChange) = ParameterSource.resync(self)
    if self.resyncEnabled() and self._dataProvider:
        # Get old and new dataset information
        old = DataProvider.loadFromFile(self.getDataPath('cache.dat')).getBlocks()
        self._dataProvider.clearCache()
        new = self._dataProvider.getBlocks()
        self._dataProvider.saveToFile(self.getDataPath('cache-new.dat'), new)
        # Use old splitting information to synchronize with new dataset infos
        jobChanges = self._dataSplitter.resyncMapping(self.getDataPath('map-new.tar'), old, new)
        if jobChanges:
            # Move current splitting to backup and use the new splitting from now on
            def backupRename(old, cur, new):
                if self._keepOld:
                    os.rename(self.getDataPath(cur), self.getDataPath(old))
                os.rename(self.getDataPath(new), self.getDataPath(cur))
            backupRename('map-old-%d.tar' % time.time(), 'map.tar', 'map-new.tar')
            backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
            old_maxN = self._dataSplitter.getMaxJobs()
            self._dataSplitter.importPartitions(self.getDataPath('map.tar'))
            self._maxN = self._dataSplitter.getMaxJobs()
            result_redo.update(jobChanges[0])
            result_disable.update(jobChanges[1])
            result_sizeChange = result_sizeChange or (old_maxN != self._maxN)
        self.resyncFinished()
    return (result_redo, result_disable, result_sizeChange)

def _resync_psrc(self):
    activity = Activity('Performing resync of datasource %r' % self.get_datasource_name())
    # Get old and new dataset information
    provider_old = DataProvider.load_from_file(self._get_data_path('cache.dat'))
    block_list_old = provider_old.get_block_list_cached(show_stats=False)
    self._provider.clear_cache()
    block_list_new = self._provider.get_block_list_cached(show_stats=False)
    self._provider.save_to_file(self._get_data_path('cache-new.dat'), block_list_new)
    # Use old splitting information to synchronize with new dataset infos
    partition_len_old = self.get_parameter_len()
    partition_changes = self._resync_partitions(
        self._get_data_path('map-new.tar'), block_list_old, block_list_new)
    activity.finish()
    if partition_changes is not None:
        # Move current splitting to backup and use the new splitting from now on
        def _rename_with_backup(new, cur, old):
            if self._keep_old:
                os.rename(self._get_data_path(cur), self._get_data_path(old))
            os.rename(self._get_data_path(new), self._get_data_path(cur))
        _rename_with_backup('map-new.tar', 'map.tar', 'map-old-%d.tar' % time.time())
        _rename_with_backup('cache-new.dat', 'cache.dat', 'cache-old-%d.dat' % time.time())
        self._set_reader(DataSplitter.load_partitions(self._get_data_path('map.tar')))
        self._log.debug('Dataset resync finished: %d -> %d partitions', partition_len_old, self._len)
        (pnum_list_redo, pnum_list_disable) = partition_changes
        return (set(pnum_list_redo), set(pnum_list_disable), partition_len_old != self._len)

def save_dataset(fn, block_list):
    DataProvider.save_to_file(fn, block_list)
    logging.getLogger('script').info('Dataset information saved to %r', fn)

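# Round-trip sketch (assumption: 'dbs.dat' is an existing dump as written by discover_blocks
# above, and 'copy.dat' is a hypothetical target file name). Blocks are read back through the
# cached block list and written out again via save_dataset from this file.
def sketch_copy_dataset_file(src_fn='dbs.dat', dst_fn='copy.dat'):
    block_list = DataProvider.load_from_file(src_fn).get_block_list_cached(show_stats=False)
    save_dataset(dst_fn, block_list)
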
selected = JobSelector.create(opts.job_selector, task = task)
logging.info('Matching jobs: ' + str.join(' ', imap(str, jobDB.getJobsIter(selected))))
if opts.job_reset_attempts:
    jobs_reset_attempts(jobDB, selected)
if opts.job_force_state:
    jobs_force_state(opts, jobDB, selected)
if opts.job_show_jdl:
    jobs_show_jdl(jobDB, selected)

########################################################
# DATASET INFOS
if opts.dataset_show_diff:
    if len(args) != 2:
        utils.exitWithUsage('%s <dataset source 1> <dataset source 2>' % sys.argv[0])
    a = DataProvider.createInstance('ListProvider', config, args[0], None)
    b = DataProvider.createInstance('ListProvider', config, args[1], None)
    (blocksAdded, blocksMissing, blocksChanged) = DataProvider.resyncSources(
        a.getBlocks(show_stats = False), b.getBlocks(show_stats = False))
    utils.printTabular([(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block')], blocksMissing)

if opts.dataset_show_removed:
    if len(args) < 2:
        utils.exitWithUsage('%s <dataset source 1> <dataset source 2> ... <dataset source N>' % sys.argv[0])
    removed = []
    oldDP = DataProvider.createInstance('ListProvider', config, args[0], None)
    for new in args[1:]:
        newDP = DataProvider.createInstance('ListProvider', config, new, None)
        (blocksAdded, blocksMissing, blocksChanged) = DataProvider.resyncSources(
            oldDP.getBlocks(show_stats = False), newDP.getBlocks(show_stats = False))
        for block in blocksMissing:
            tmp = dict(block)
            tmp[-1] = new

def __init__(self, config):
    InfoScanner.__init__(self, config)
    dsPath = config.get('source dataset path')
    self._source = DataProvider.createInstance('ListProvider', config, dsPath)

def _get_fi_class(self, fi, block):
    run_range = self._run_range.lookup(DataProvider.get_block_id(block))
    metadata_idx = block[DataProvider.Metadata].index('Runs')
    return tuple(imap(lambda r: int(r / run_range), fi[DataProvider.Metadata][metadata_idx]))

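# Worked example with made-up numbers: if the 'run range' lookup for the block yields 1000
# and the file's 'Runs' metadata column contains [273302, 273425], the class key becomes
# (273, 273), since int(273302 / 1000) == int(273425 / 1000) == 273. Files whose runs fall
# into the same 1000-run wide bin therefore share a class key.
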
def __init__(self, config):
    dsPath = config.get('source dataset path')
    self.source = DataProvider.create(config, None, dsPath, 'ListProvider')

def processBlock(self, block):
    if self._lumi_filter.empty() and ((self._lumi_keep == LumiKeep.RunLumi) or (DataProvider.Metadata not in block)):
        return block

    def getMetadataIdx(key):
        if key in block.get(DataProvider.Metadata, []):
            return block[DataProvider.Metadata].index(key)
    idxRuns = getMetadataIdx('Runs')
    idxLumi = getMetadataIdx('Lumi')
    if not self._lumi_filter.empty():
        lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector = False)
        if lumi_filter and (self._lumi_strict == LumiMode.strict) and ((idxRuns is None) or (idxLumi is None)):
            raise DatasetError('Strict lumi filter active but dataset %s does not provide lumi information!'
                % DataProvider.bName(block))
        elif lumi_filter and (self._lumi_strict == LumiMode.weak) and (idxRuns is None):
            raise DatasetError('Weak lumi filter active but dataset %s does not provide run information!'
                % DataProvider.bName(block))
    block[DataProvider.FileList] = list(self._processFI(block, idxRuns, idxLumi))
    if not block[DataProvider.FileList]:
        return
    block[DataProvider.NEntries] = sum(imap(lambda fi: fi[DataProvider.NEntries], block[DataProvider.FileList]))
    # Prune metadata
    if self._lumi_keep == LumiKeep.RunLumi:
        return block
    elif self._lumi_keep == LumiKeep.Run:
        idxRuns = None
    removeRunLumi(block[DataProvider.Metadata], idxRuns, idxLumi)
    return block
