def _lfn2pfn(node, lfn, prot='srmv2'):
    return JSONRestClient().get(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/lfn2pfn',
        params={'node': node, 'protocol': prot, 'lfn': lfn})['phedex']['mapping']
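
# A hedged usage sketch for _lfn2pfn, not part of the original module. The site name
# and LFN below are hypothetical examples; the assumption that each mapping entry
# carries a 'pfn' key follows the documented PhEDEx datasvc lfn2pfn response, but
# should be verified against the live service.
def _example_lfn2pfn_usage():
    mapping = _lfn2pfn(node='T2_DE_DESY',  # hypothetical site name
        lfn='/store/data/Run2012A/DoubleMu/AOD/example.root')  # hypothetical LFN
    for entry in mapping:
        print(entry.get('pfn'))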
class DBS3LiteClient(object):
    def __init__(self, url):
        self._reader_url = '%s/%s' % (url, 'DBSReader')
        self._writer_url = '%s/%s' % (url, 'DBSWriter')
        self._migrate_url = '%s/%s' % (url, 'DBSMigrate')
        self._proxy_path = os.environ.get('X509_USER_PROXY', '')
        if not os.path.exists(self._proxy_path):
            raise UserError('VOMS proxy needed to query DBS3! ' +
                'Environment variable X509_USER_PROXY is "%s"' % self._proxy_path)
        self._jrc = JSONRestClient(cert=self._proxy_path)

    def listBlocks(self, **kwargs):
        return self._jrc.get(url=self._reader_url, api='blocks', params=kwargs)

    def listFiles(self, **kwargs):
        return self._jrc.get(url=self._reader_url, api='files', params=kwargs)

    def listFileParents(self, **kwargs):
        return self._jrc.get(url=self._reader_url, api='fileparents', params=kwargs)

    def insertBulkBlock(self, data):
        return self._jrc.post(url=self._writer_url, api='bulkblocks', data=data)

    def migrateSubmit(self, data):
        return self._jrc.post(url=self._migrate_url, api='submit', data=data)

    def migrateStatus(self, **kwargs):
        return self._jrc.get(url=self._migrate_url, api='status', params=kwargs)
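
# A minimal usage sketch for DBS3LiteClient, assuming a valid VOMS proxy is available
# via X509_USER_PROXY. The instance URL and dataset name are illustrative; the DBS3
# 'blocks' API accepts a 'dataset' parameter and returns entries with 'block_name',
# but treat the exact response shape as an assumption.
def _example_dbs3_usage():
    client = DBS3LiteClient('https://cmsweb.cern.ch/dbs/prod/global')
    for block in client.listBlocks(dataset='/SingleMu/Run2012A-v1/RAW'):  # hypothetical dataset
        print(block.get('block_name'))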
class CMSBaseProvider(DataProvider):
    def __init__(self, config, datasetExpr, datasetNick=None):
        self._changeTrigger = triggerResync(['datasets', 'parameters'])
        self._lumi_filter = config.getLookup('lumi filter', {}, parser=parseLumiFilter,
            strfun=strLumi, onChange=self._changeTrigger)
        if not self._lumi_filter.empty():
            config.set('dataset processor', 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasetExpr, datasetNick)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(),
            onChange=self._changeTrigger)
        config.set('phedex sites matcher mode', 'shell', '?=')
        # PhEDex blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
        self._phedexFilter = config.getFilter('phedex sites', '-* T1_*_Disk T2_* T3_*',
            defaultMatcher='blackwhite', defaultFilter='strict', onChange=self._changeTrigger)
        self._onlyComplete = config.getBool('only complete sites', True, onChange=self._changeTrigger)
        self._locationFormat = config.getEnum('location format', CMSLocationFormat,
            CMSLocationFormat.hostname, onChange=self._changeTrigger)
        self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
        self._sitedb = SiteDB()
        (self._datasetPath, self._datasetInstance, self._datasetBlock) = optSplit(datasetExpr, '@#')
        instance_default = config.get('dbs instance', '', onChange=self._changeTrigger)
        self._datasetInstance = self._datasetInstance or instance_default
        if not self._datasetInstance:
            self._datasetInstance = 'prod/global'
        elif '/' not in self._datasetInstance:
            self._datasetInstance = 'prod/%s' % self._datasetInstance
        self._datasetBlock = self._datasetBlock or 'all'
        self.onlyValid = config.getBool('only valid', True, onChange=self._changeTrigger)

    # Define how often the dataprovider can be queried automatically
    def queryLimit(self):
        return 2 * 60 * 60  # 2 hour delay minimum

    # Check if splitterClass is valid
    def checkSplitter(self, splitterClass):
        if (DataSplitter.Skipped in splitterClass.neededEnums()) and not self._lumi_filter.empty():
            self._log.debug('Selected splitter %s is not compatible with active lumi filter!',
                splitterClass.__name__)
            self._log.warning('Active lumi section filter forced selection of HybridSplitter')
            return HybridSplitter
        return splitterClass

    def _replicaLocation(self, replica_info):
        (name_node, name_hostname, _) = replica_info
        if self._locationFormat == CMSLocationFormat.siteDB:
            yield name_node
        else:
            if name_hostname is not None:
                name_hostnames = [name_hostname]
            else:
                name_hostnames = self._sitedb.cms_name_to_se(name_node)
            for name_hostname in name_hostnames:
                if self._locationFormat == CMSLocationFormat.hostname:
                    yield name_hostname
                else:
                    yield '%s/%s' % (name_node, name_hostname)

    def _fmtLocations(self, replica_infos):
        for replica_info in replica_infos:
            (_, _, completed) = replica_info
            if completed:
                for entry in self._replicaLocation(replica_info):
                    yield entry
            else:
                for entry in self._replicaLocation(replica_info):
                    yield '(%s)' % entry

    def _processReplicas(self, blockPath, replica_infos):
        def empty_with_warning(*args):
            self._log.warning(*args)
            return []

        def expanded_replica_locations(replica_infos):
            for replica_info in replica_infos:
                for entry in self._replicaLocation(replica_info):
                    yield entry

        if not replica_infos:
            return empty_with_warning('Dataset block %r has no replica information!', blockPath)
        replica_infos_selected = self._phedexFilter.filterList(replica_infos, key=itemgetter(0))
        if not replica_infos_selected:
            return empty_with_warning('Dataset block %r is not available at the selected locations!' +
                '\nAvailable locations: %s', blockPath, str.join(', ', self._fmtLocations(replica_infos)))
        if not self._onlyComplete:
            return list(expanded_replica_locations(replica_infos_selected))
        replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2], replica_infos_selected)
        if not replica_infos_complete:
            return empty_with_warning('Dataset block %r is not completely available at the selected locations!' +
                '\nAvailable locations: %s', blockPath, str.join(', ', self._fmtLocations(replica_infos)))
        return list(expanded_replica_locations(replica_infos_complete))

    # Get dataset se list from PhEDex (perhaps concurrent with listFiles)
    def _getPhedexReplicas(self, blockPath, dictReplicas):
        dictReplicas[blockPath] = []
        for phedexBlock in self._pjrc.get(params={'block': blockPath})['phedex']['block']:
            for replica in phedexBlock['replica']:
                dictReplicas[blockPath].append((replica['node'], replica.get('se'), replica['complete'] == 'y'))

    def getDatasets(self):
        if self._cache_dataset is None:
            self._cache_dataset = [self._datasetPath]
            if '*' in self._datasetPath:
                self._cache_dataset = list(self._getCMSDatasets(self._datasetPath))
                if not self._cache_dataset:
                    raise DatasetError('No datasets selected by DBS wildcard %s !' % self._datasetPath)
        return self._cache_dataset

    def _getCMSBlocks(self, datasetPath, getSites):
        iter_blockname_selist = self._getCMSBlocksImpl(datasetPath, getSites)
        n_blocks = 0
        selected_blocks = False
        for (blockname, selist) in iter_blockname_selist:
            n_blocks += 1
            if (self._datasetBlock != 'all') and (str.split(blockname, '#')[1] != self._datasetBlock):
                continue
            selected_blocks = True
            yield (blockname, selist)
        if (n_blocks > 0) and not selected_blocks:
            raise DatasetError('Dataset %r contains %d blocks, but none were selected by %r' %
                (datasetPath, n_blocks, self._datasetBlock))

    def _fillCMSFiles(self, block, blockPath):
        lumi_used = False
        lumiDict = {}
        if self._lumi_query:  # central lumi query
            lumiDict = self._getCMSLumisImpl(blockPath)
        fileList = []
        for (fileInfo, listLumi) in self._getCMSFilesImpl(blockPath, self.onlyValid, self._lumi_query):
            if lumiDict and not listLumi:
                listLumi = lumiDict.get(fileInfo[DataProvider.URL], [])
            if listLumi:
                (listLumiExt_Run, listLumiExt_Lumi) = ([], [])
                for (run, lumi_list) in sorted(listLumi):
                    listLumiExt_Run.extend([run] * len(lumi_list))
                    listLumiExt_Lumi.extend(lumi_list)
                fileInfo[DataProvider.Metadata] = [listLumiExt_Run, listLumiExt_Lumi]
                lumi_used = True
            fileList.append(fileInfo)
        if lumi_used:
            block.setdefault(DataProvider.Metadata, []).extend(['Runs', 'Lumi'])
        block[DataProvider.FileList] = fileList

    def _getCMSLumisImpl(self, blockPath):
        return None

    def _getGCBlocks(self, usePhedex):
        for datasetPath in self.getDatasets():
            counter = 0
            for (blockPath, replica_infos) in self._getCMSBlocks(datasetPath, getSites=not usePhedex):
                result = {}
                result[DataProvider.Dataset] = blockPath.split('#')[0]
                result[DataProvider.BlockName] = blockPath.split('#')[1]
                if usePhedex:  # Start parallel phedex query
                    dictReplicas = {}
                    tPhedex = start_thread('Query phedex site info for %s' % blockPath,
                        self._getPhedexReplicas, blockPath, dictReplicas)
                    self._fillCMSFiles(result, blockPath)
                    tPhedex.join()
                    replica_infos = dictReplicas.get(blockPath)
                else:
                    self._fillCMSFiles(result, blockPath)
                result[DataProvider.Locations] = self._processReplicas(blockPath, replica_infos)
                if len(result[DataProvider.FileList]):
                    counter += 1
                    yield result
            if counter == 0:
                raise DatasetError('Dataset %s does not contain any valid blocks!' % datasetPath)
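
# The dataset expression handled above has the form <dataset path>[@<instance>][#<block>],
# split by the project's optSplit helper. The standalone function below is only a sketch
# of the same splitting rule, added here to make the parsing explicit; it is not the
# grid-control implementation.
def _split_dataset_expr(expr):
    (path, instance, block) = (expr, '', '')
    if '#' in path:  # block selector comes last
        (path, block) = path.split('#', 1)
    if '@' in path:  # DBS instance may itself contain '/'
        (path, instance) = path.split('@', 1)
    return (path, instance, block)

# _split_dataset_expr('/A/B/C@prod/global#1234') == ('/A/B/C', 'prod/global', '1234')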
class CMSBaseProvider(DataProvider):
    # required format: <dataset path>[@<instance>][#<block>]
    def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
        dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
        self._lumi_filter = dataset_config.get_lookup(['lumi filter', '%s lumi filter' % datasource_name],
            default={}, parser=parse_lumi_filter, strfun=str_lumi)
        if not self._lumi_filter.empty():
            config.set('%s processor' % datasource_name, 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = dataset_config.get_bool(['lumi metadata', '%s lumi metadata' % datasource_name],
            default=not self._lumi_filter.empty())
        config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
        # PhEDex blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
        self._phedex_filter = dataset_config.get_filter('phedex sites', '-* T1_*_Disk T2_* T3_*',
            default_matcher='BlackWhiteMatcher', default_filter='StrictListFilter')
        self._only_complete = dataset_config.get_bool('only complete sites', True)
        self._only_valid = dataset_config.get_bool('only valid', True)
        self._allow_phedex = dataset_config.get_bool('allow phedex', True)
        self._location_format = dataset_config.get_enum('location format',
            CMSLocationFormat, CMSLocationFormat.hostname)
        self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
        self._sitedb = SiteDB()
        dataset_expr_parts = split_opt(dataset_expr, '@#')
        (self._dataset_path, self._dataset_instance, self._dataset_block_selector) = dataset_expr_parts
        instance_default = dataset_config.get('dbs instance', '')
        self._dataset_instance = self._dataset_instance or instance_default
        if not self._dataset_instance:
            self._dataset_instance = 'prod/global'
        elif '/' not in self._dataset_instance:
            self._dataset_instance = 'prod/%s' % self._dataset_instance
        self._dataset_block_selector = self._dataset_block_selector or 'all'

    def check_splitter(self, splitter):
        # Check if splitter is valid
        if (DataSplitter.Skipped in splitter.get_needed_enums()) and not self._lumi_filter.empty():
            self._log.debug('Selected splitter %s is not compatible with active lumi filter!',
                splitter.__name__)
            self._log.warning('Active lumi section filter forced selection of HybridSplitter')
            return HybridSplitter
        return splitter

    def get_dataset_name_list(self):
        if self._cache_dataset is None:
            self._cache_dataset = [self._dataset_path]
            if '*' in self._dataset_path:
                activity = Activity('Getting dataset list for %s' % self._dataset_path)
                self._cache_dataset = list(self._get_cms_dataset_list(self._dataset_path))
                if not self._cache_dataset:
                    raise DatasetError('No datasets selected by DBS wildcard %s !' % self._dataset_path)
                activity.finish()
        return self._cache_dataset

    def get_query_interval(self):
        # Define how often the dataprovider can be queried automatically
        return 2 * 60 * 60  # 2 hour delay minimum

    def _fill_cms_fi_list(self, block, block_path):
        activity_fi = Activity('Getting file information')
        lumi_used = False
        lumi_info_dict = {}
        if self._lumi_query:  # central lumi query
            lumi_info_dict = self._get_cms_lumi_dict(block_path)
        fi_list = []
        for (fi, lumi_info_list) in self._iter_cms_files(block_path, self._only_valid, self._lumi_query):
            self._raise_on_abort()
            if lumi_info_dict and not lumi_info_list:
                lumi_info_list = lumi_info_dict.get(fi[DataProvider.URL], [])
            if lumi_info_list:
                (run_list_result, lumi_list_result) = ([], [])
                for (run, lumi_list) in sorted(lumi_info_list):
                    run_list_result.extend([run] * len(lumi_list))
                    lumi_list_result.extend(lumi_list)
                assert len(run_list_result) == len(lumi_list_result)
                fi[DataProvider.Metadata] = [run_list_result, lumi_list_result]
                lumi_used = True
            fi_list.append(fi)
        if lumi_used:
            block.setdefault(DataProvider.Metadata, []).extend(['Runs', 'Lumi'])
        block[DataProvider.FileList] = fi_list
        activity_fi.finish()

    def _filter_cms_blockinfo_list(self, dataset_path, do_query_sites):
        iter_dataset_block_name_selist = self._iter_cms_blocks(dataset_path, do_query_sites)
        n_blocks = 0
        selected_blocks = False
        for (dataset_block_name, selist) in iter_dataset_block_name_selist:
            n_blocks += 1
            block_name = str.split(dataset_block_name, '#')[1]
            if (self._dataset_block_selector != 'all') and (block_name != self._dataset_block_selector):
                continue
            selected_blocks = True
            yield (dataset_block_name, selist)
        if (n_blocks > 0) and not selected_blocks:
            raise DatasetError('Dataset %r contains %d blocks, but none were selected by %r' % (
                dataset_path, n_blocks, self._dataset_block_selector))

    def _get_cms_dataset_list(self, dataset_path):
        raise AbstractError

    def _get_cms_lumi_dict(self, block_path):
        return None

    def _get_gc_block_list(self, use_phedex):
        dataset_name_list = self.get_dataset_name_list()
        progress_ds = ProgressActivity('Getting dataset', len(dataset_name_list))
        for dataset_idx, dataset_path in enumerate(dataset_name_list):
            progress_ds.update_progress(dataset_idx, msg='Getting dataset %s' % dataset_path)
            counter = 0
            blockinfo_list = list(self._filter_cms_blockinfo_list(dataset_path, not use_phedex))
            progress_block = ProgressActivity('Getting block information', len(blockinfo_list))
            for (block_path, replica_infos) in blockinfo_list:
                result = {}
                result[DataProvider.Dataset] = block_path.split('#')[0]
                result[DataProvider.BlockName] = block_path.split('#')[1]
                progress_block.update_progress(counter,
                    msg='Getting block information for ' + result[DataProvider.BlockName])
                if use_phedex and self._allow_phedex:  # Start parallel phedex query
                    replicas_dict = {}
                    phedex_thread = start_thread('Query phedex site info for %s' % block_path,
                        self._get_phedex_replica_list, block_path, replicas_dict)
                    self._fill_cms_fi_list(result, block_path)
                    phedex_thread.join()
                    replica_infos = replicas_dict.get(block_path)
                else:
                    self._fill_cms_fi_list(result, block_path)
                result[DataProvider.Locations] = self._process_replica_list(block_path, replica_infos)
                if len(result[DataProvider.FileList]):
                    counter += 1
                    yield result
            progress_block.finish()
            if counter == 0:
                raise DatasetError('Dataset %s does not contain any valid blocks!' % dataset_path)
        progress_ds.finish()

    def _get_phedex_replica_list(self, block_path, replicas_dict):
        activity_fi = Activity('Getting file replica information from PhEDex')
        # Get dataset se list from PhEDex (perhaps concurrent with get_dbs_file_list)
        replicas_dict[block_path] = []
        for phedex_block in self._pjrc.get(params={'block': block_path})['phedex']['block']:
            for replica in phedex_block['replica']:
                replica_info = (replica['node'], replica.get('se'), replica['complete'] == 'y')
                replicas_dict[block_path].append(replica_info)
        activity_fi.finish()

    def _iter_cms_blocks(self, dataset_path, do_query_sites):
        raise AbstractError

    def _iter_cms_files(self, block_path, query_only_valid, query_lumi):
        raise AbstractError

    def _iter_formatted_locations(self, replica_infos):
        for replica_info in replica_infos:
            (_, _, completed) = replica_info
            if completed:
                for entry in self._iter_replica_locations(replica_info):
                    yield entry
            else:
                for entry in self._iter_replica_locations(replica_info):
                    yield '(%s)' % entry

    def _iter_replica_locations(self, replica_info):
        (name_node, name_hostname, _) = replica_info
        if self._location_format == CMSLocationFormat.siteDB:
            yield name_node
        else:
            if name_hostname is not None:
                name_hostnames = [name_hostname]
            else:
                name_hostnames = self._sitedb.cms_name_to_se(name_node)
            for name_hostname in name_hostnames:
                if self._location_format == CMSLocationFormat.hostname:
                    yield name_hostname
                else:
                    yield '%s/%s' % (name_node, name_hostname)

    def _process_replica_list(self, block_path, replica_infos):
        def _empty_with_warning(error_msg, *args):
            self._log.warning('Dataset block %r ' + error_msg, block_path, *args)
            return []

        def _expanded_replica_locations(replica_infos):
            for replica_info in replica_infos:
                for entry in self._iter_replica_locations(replica_info):
                    yield entry

        if not replica_infos:
            return _empty_with_warning('has no replica information!')
        replica_infos_selected = self._phedex_filter.filter_list(replica_infos, key=itemgetter(0))
        if not replica_infos_selected:
            return _empty_with_warning('is not available at the selected locations!\n' +
                'Available locations: %s', str.join(', ', self._iter_formatted_locations(replica_infos)))
        if not self._only_complete:
            return list(_expanded_replica_locations(replica_infos_selected))
        replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2], replica_infos_selected)
        if not replica_infos_complete:
            return _empty_with_warning('is not completely available at the selected locations!\n' +
                'Available locations: %s', str.join(', ', self._iter_formatted_locations(replica_infos)))
        return list(_expanded_replica_locations(replica_infos_complete))
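
# Standalone sketch of the location formatting rule implemented by _iter_replica_locations
# above, for a single replica tuple (node name, hostname, complete flag). The SiteDB
# lookup for missing hostnames is omitted and the sample values are hypothetical.
def _format_location(replica_info, location_format):
    (name_node, name_hostname, _) = replica_info
    if location_format == 'siteDB':
        return name_node  # e.g. 'T2_DE_DESY'
    elif location_format == 'hostname':
        return name_hostname  # e.g. 'srm.desy.de'
    return '%s/%s' % (name_node, name_hostname)  # 'both': node/hostname

# _format_location(('T2_DE_DESY', 'srm.desy.de', True), 'both') == 'T2_DE_DESY/srm.desy.de'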
class GeoResolver(object):
    def __init__(self):
        from grid_control.utils.webservice import JSONRestClient
        self._jrc = JSONRestClient(url='http://maps.googleapis.com/maps/api/geocode/json')

    def run(self):
        # output of lcg-infosites ce | while read X X X X X CE; do echo $CE; done
        #   | cut -d "/" -f 1 | cut -d ":" -f 1 | sort | uniq
        # (one whitespace-separated token per CE hostname)
        ce_list_str = """
            alcyone-cms.grid.helsinki.fi alice23.spbu.ru arc-ce01.gridpp.rl.ac.uk arc-ce02.gridpp.rl.ac.uk arc-ce03.gridpp.rl.ac.uk
            argoce01.na.infn.it atlasce1.lnf.infn.it atlasce2.lnf.infn.it atlasce3.lnf.infn.it atlas-cream01.na.infn.it
            atlas-cream02.na.infn.it boce.bo.infn.it bonner-grid.rice.edu brux3.hep.brown.edu cale.uniandes.edu.co
            carter-osg.rcac.purdue.edu cccreamceli07.in2p3.fr cccreamceli08.in2p3.fr cce.ihep.ac.cn ce0002.m45.ihep.su
            ce0004.m45.ihep.su ce01.cmsaf.mit.edu ce01.jinr-t1.ru ce01-lcg.cr.cnaf.infn.it ce-01.roma3.infn.it
            ce01.tier2.hep.manchester.ac.uk ce02.cmsaf.mit.edu ce02.jinr-t1.ru ce02.ngcc.acad.bg ce02.tier2.hep.manchester.ac.uk
            ce04-lcg.cr.cnaf.infn.it ce05.esc.qmul.ac.uk ce05-lcg.cr.cnaf.infn.it ce05.ncg.ingrid.pt ce06.esc.qmul.ac.uk
            ce06-lcg.cr.cnaf.infn.it ce07.esc.qmul.ac.uk ce07-lcg.cr.cnaf.infn.it ce08-lcg.cr.cnaf.infn.it ce101.grid.ucy.ac.cy
            ce1.accre.vanderbilt.edu ce1.dur.scotgrid.ac.uk ce1.grid.lebedev.ru ce1.ts.infn.it ce201.cern.ch
            ce202.cern.ch ce203.cern.ch ce204.cern.ch ce205.cern.ch ce206.cern.ch ce207.cern.ch ce208.cern.ch
            ce2.accre.vanderbilt.edu ce2.particles.ipm.ac.ir ce301.cern.ch ce302.cern.ch ce3.ppgrid1.rhul.ac.uk
            ce401.cern.ch ce402.cern.ch ce403.cern.ch ce404.cern.ch ce405.cern.ch ce406.cern.ch ce407.cern.ch ce408.cern.ch
            ce64.ipb.ac.rs ce6.grid.icm.edu.pl ce7.glite.ecdf.ed.ac.uk ce9.grid.icm.edu.pl cebo-t3-01.cr.cnaf.infn.it
            cebo-t3-02.cr.cnaf.infn.it ce.cis.gov.pl cecream.ca.infn.it ce.fesb.egi.cro-ngi.hr ce.grid.unesp.br
            ce.irb.egi.cro-ngi.hr ceprod05.grid.hep.ph.ic.ac.uk ceprod06.grid.hep.ph.ic.ac.uk ceprod07.grid.hep.ph.ic.ac.uk
            ceprod08.grid.hep.ph.ic.ac.uk cert-37.pd.infn.it ce.scope.unina.it ce.srce.egi.cro-ngi.hr cetest01.grid.hep.ph.ic.ac.uk
            cetest02.grid.hep.ph.ic.ac.uk ce.ulakbim.gov.tr cit-gatekeeper2.ultralight.org cit-gatekeeper.ultralight.org
            cluster118.knu.ac.kr cluster50.knu.ac.kr cms-0.mps.ohio-state.edu cmsce01.na.infn.it cmsgrid01.hep.wisc.edu
            cmsgrid02.hep.wisc.edu cms-grid0.hep.uprm.edu cmsosgce2.fnal.gov cmsosgce4.fnal.gov cmsosgce.fnal.gov
            cmsrm-cream01.roma1.infn.it cmsrm-cream02.roma1.infn.it cmsrm-cream03.roma1.infn.it cmstest1.rcac.purdue.edu
            cms.tier3.ucdavis.edu conte-osg.rcac.purdue.edu cox01.grid.metu.edu.tr cr1.ipp.acad.bg cream01.grid.auth.gr
            cream01.grid.sinica.edu.tw cream01.grid.uoi.gr cream01.kallisto.hellasgrid.gr cream01.lcg.cscs.ch
            cream02.grid.cyf-kr.edu.pl cream02.iihe.ac.be cream02.lcg.cscs.ch cream03.lcg.cscs.ch cream04.grid.sinica.edu.tw
            cream04.lcg.cscs.ch cream05.grid.sinica.edu.tw cream2.ppgrid1.rhul.ac.uk cream3.hep.kbfi.ee cream4.hep.kbfi.ee
            cream.afroditi.hellasgrid.gr cream-ce01.ariagni.hellasgrid.gr cream-ce01.indiacms.res.in cream-ce01.marie.hellasgrid.gr
            cream-ce02.cat.cbpf.br creamce02.ciemat.es cream-ce02.marie.hellasgrid.gr creamce03.ciemat.es creamce1.itep.ru
            cream-ce-2.ba.infn.it cream-ce-4.ba.infn.it cream-ce.cat.cbpf.br cream-ce.grid.atomki.hu creamce.hephy.oeaw.ac.at
            creamce.inula.man.poznan.pl cream-ce.kipt.kharkov.ua cream-ce.pg.infn.it creamce.reef.man.poznan.pl
            cream.grid.cyf-kr.edu.pl cream.ipb.ac.rs dc2-grid-66.brunel.ac.uk dc2-grid-68.brunel.ac.uk dc2-grid-70.brunel.ac.uk
            dwarf.wcss.wroc.pl earth.crc.nd.edu epgr02.ph.bham.ac.uk
            erbium.lsr.nectec.or.th f-cream01.grid.sinica.edu.tw f-cream04.grid.sinica.edu.tw fiupg.hep.fiu.edu foam.grid.kiae.ru
            fornax-ce2.itwm.fhg.de fornax-ce.itwm.fhg.de grcreamce01.inr.troitsk.ru grid001.ics.forth.gr grid002.jet.efda.org
            grid012.ct.infn.it grid01.physics.uoi.gr grid0.fe.infn.it grid106.kfki.hu grid107.kfki.hu grid109.kfki.hu
            grid129.sinp.msu.ru grid36.lal.in2p3.fr grid72.phy.ncu.edu.tw gridce01.ifca.es gridce03.ifca.es
            gridce0.pi.infn.it gridce1.pi.infn.it grid-ce2.physik.rwth-aachen.de gridce2.pi.infn.it gridce3.pi.infn.it
            gridce4.pi.infn.it gridce.ilc.cnr.it grid-ce.physik.rwth-aachen.de grid-cr0.desy.de grid-cr1.desy.de
            grid-cr2.desy.de grid-cr3.desy.de grid-cr4.desy.de gridgk01.racf.bnl.gov gridgk02.racf.bnl.gov gridgk03.racf.bnl.gov
            gridgk04.racf.bnl.gov gridgk05.racf.bnl.gov gridgk06.racf.bnl.gov gridgk08.racf.bnl.gov gridtest02.racf.bnl.gov
            gridvm03.roma2.infn.it grisuce.scope.unina.it gt3.pnpi.nw.ru hansen-osg.rcac.purdue.edu hepcms-0.umd.edu
            hepgrid10.ph.liv.ac.uk hepgrid5.ph.liv.ac.uk hepgrid6.ph.liv.ac.uk hepgrid97.ph.liv.ac.uk hephygr.oeaw.ac.at
            heposg01.colorado.edu hurr.tamu.edu ingrid.cism.ucl.ac.be jade-cms.hip.fi juk.nikhef.nl kalkan1.ulakbim.gov.tr
            khaldun.biruni.upm.my klomp.nikhef.nl kodiak-ce.baylor.edu lcg18.sinp.msu.ru lcg52.sinp.msu.ru
            lcgce01.phy.bris.ac.uk lcgce03.phy.bris.ac.uk lcgce04.phy.bris.ac.uk lcgce12.jinr.ru lcgce1.shef.ac.uk
            lcgce21.jinr.ru lcgce2.shef.ac.uk lcg-cream.ifh.de llrcream.in2p3.fr lpnhe-cream.in2p3.fr lyogrid07.in2p3.fr
            magic.cse.buffalo.edu mwt2-gk.campuscluster.illinois.edu ndcms.crc.nd.edu node01-03.usm.renam.md
            node01-04.grid.renam.md node05-02.imi.renam.md node74.datagrid.cea.fr nodeslab-0002.nlab.tb.hiit.fi
            ntugrid2.phys.ntu.edu.tw ntugrid5.phys.ntu.edu.tw nys1.cac.cornell.edu osgce.hepgrid.uerj.br osg-ce.sprace.org.br
            osg-gk.mwt2.org osg-gw-6.t2.ucsd.edu osg-gw-7.t2.ucsd.edu osg.hpc.ufl.edu osg-nemo-ce.phys.uwm.edu
            osg.rcac.purdue.edu osgserv01.slac.stanford.edu osgserv02.slac.stanford.edu ouhep0.nhn.ou.edu pamelace01.na.infn.it
            pcncp04.ncp.edu.pk pcncp05.ncp.edu.pk pre7230.datagrid.cea.fr prod-ce-01.pd.infn.it razi.biruni.upm.my
            recasce01.na.infn.it red-gw1.unl.edu red-gw2.unl.edu red.unl.edu rossmann-osg.rcac.purdue.edu sbgce2.in2p3.fr
            snf-189278.vm.okeanos.grnet.gr snf-458754.vm.okeanos.grnet.gr spacina-ce.scope.unina.it svr009.gla.scotgrid.ac.uk
            svr010.gla.scotgrid.ac.uk svr011.gla.scotgrid.ac.uk svr014.gla.scotgrid.ac.uk t2arc01.physics.ox.ac.uk
            t2-ce-01.lnl.infn.it t2-ce-01.to.infn.it t2-ce-02.lnl.infn.it t2ce02.physics.ox.ac.uk t2-ce-03.lnl.infn.it
            t2-ce-04.lnl.infn.it t2-ce-04.mi.infn.it t2ce04.physics.ox.ac.uk t2-ce-05.mi.infn.it t2-ce-06.lnl.infn.it
            t2ce06.physics.ox.ac.uk t3serv007.mit.edu tau-cream.hep.tau.ac.il tech-crm.hep.technion.ac.il top.ucr.edu
            umiss001.hep.olemiss.edu uosaf0008.sscc.uos.ac.kr uscms1.fltech-grid3.fit.edu v6ce00.grid.hep.ph.ic.ac.uk
            vserv13.hep.phy.cam.ac.uk wipp-crm.weizmann.ac.il
        """
        import sys, time
        from python_compat import set, imap, lfilter, sorted
        counter = 0
        used = set()
        for line in imap(str.strip, ce_list_str.split()):
            time.sleep(0.2)
            match = get_geo_match(line)
            if not match:
                counter += 1
                sys.stderr.write('\t%r: %r\n' % (line, self._geocode(line)))
            else:
                used.add(match)
        sys.stderr.write('%s unmatched entries\n' % counter)
        sys.stderr.write('unused entries:\n%s\n' % repr(lfilter(lambda x: x not in used, _GEO_DICT)))
        sys.stdout.write('_GEO_DICT = {\n')
        geo_dict_key_list = sorted(_GEO_DICT.keys(), key=lambda x: str.join('.', reversed(x.split('.'))))
        for entry in geo_dict_key_list:
            sys.stdout.write('\t%r: (%.6f, %.6f),\n' % (entry, _GEO_DICT[entry][0], _GEO_DICT[entry][1]))
        sys.stdout.write('}\n')

    def _geocode(self, loc):
        result = self._jrc.get(params={'address': str.join('.', loc.split('.')[2:]), 'sensor': 'false'})
        if 'Placemark' in result:  # unfold placemark entries
            place_list = []
            for entry in result['Placemark']:
                place_list.append((entry['address'], tuple(reversed(entry['Point']['coordinates'][:2]))))
            return place_list
        return result
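
# A standalone sketch of the HTTP request that _geocode issues above, using only the
# Python standard library instead of JSONRestClient. Endpoint and parameters are taken
# from this file; the 'Placemark' layout parsed above stems from an older revision of
# the geocode API (newer responses carry a 'results' list), so treat any parsing of
# the returned dict as an assumption.
import json
try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode
    from urllib2 import urlopen

def _geocode_stdlib(domain):
    # Query the geocode service for a domain fragment, e.g. 'helsinki.fi'
    query = urlencode({'address': domain, 'sensor': 'false'})
    url = 'http://maps.googleapis.com/maps/api/geocode/json?' + query
    return json.loads(urlopen(url).read().decode('utf-8'))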
class CMSBaseProvider(DataProvider):
	def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
		changeTrigger = triggerResync(['datasets', 'parameters'])
		self._lumi_filter = config.getLookup('lumi filter', {},
			parser=parseLumiFilter, strfun=strLumi, onChange=changeTrigger)
		if not self._lumi_filter.empty():
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange=changeTrigger)
		# PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1s don't!
		self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
			defaultMatcher='blackwhite', defaultFilter='weak', onChange=changeTrigger)
		self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
			defaultMatcher='blackwhite', defaultFilter='weak', onChange=changeTrigger)
		self._phedexT1Mode = config.getEnum('phedex t1 mode', PhedexT1Mode, PhedexT1Mode.disk, onChange=changeTrigger)
		self.onlyComplete = config.getBool('only complete sites', True, onChange=changeTrigger)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname, onChange=changeTrigger)
		self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
		(self._datasetPath, self._url, self._datasetBlock) = optSplit(datasetExpr, '@#')
		self._url = self._url or config.get('dbs instance', '')
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True, onChange=changeTrigger)

	# Define how often the dataprovider can be queried automatically
	def queryLimit(self):
		return 2 * 60 * 60  # 2 hour delay minimum

	# Check if splitterClass is valid
	def checkSplitter(self, splitterClass):
		if (DataSplitter.Skipped in splitterClass.neededEnums()) and not self._lumi_filter.empty():
			self._log.debug('Selected splitter %s is not compatible with active lumi filter!', splitterClass.__name__)
			self._log.warning('Active lumi section filter forced selection of HybridSplitter')
			return HybridSplitter
		return splitterClass

	def _nodeFilter(self, nameSiteDB, complete):
		# Remove T0 and T1 by default
		result = not (nameSiteDB.startswith('T0_') or nameSiteDB.startswith('T1_'))
		# check if listed on the accepted list
		if self._phedexT1Mode in [PhedexT1Mode.disk, PhedexT1Mode.accept]:
			result = result or (self._phedexT1Filter.filterList([nameSiteDB]) == [nameSiteDB])
		if self._phedexT1Mode == PhedexT1Mode.disk:
			result = result or nameSiteDB.lower().endswith('_disk')
		# apply phedex blacklist
		result = result and (self._phedexFilter.filterList([nameSiteDB]) == [nameSiteDB])
		# check for completeness at the site
		result = result and (complete or not self.onlyComplete)
		return result

	# Get dataset se list from PhEDex (perhaps concurrent with listFiles)
	def _getPhedexSEList(self, blockPath, dictSE):
		dictSE[blockPath] = []
		for phedexBlock in self._pjrc.get(params={'block': blockPath})['phedex']['block']:
			for replica in phedexBlock['replica']:
				if self._nodeFilter(replica['node'], replica['complete'] == 'y'):
					location = None
					if self._locationFormat == CMSLocationFormat.hostname:
						location = replica.get('se')
					elif self._locationFormat == CMSLocationFormat.siteDB:
						location = replica.get('node')
					elif (self._locationFormat == CMSLocationFormat.both) and (replica.get('node') or replica.get('se')):
						location = '%s/%s' % (replica.get('node'), replica.get('se'))
					if location:
						dictSE[blockPath].append(location)
					else:
						self._log.warning('Dataset block %s replica at %s / %s is skipped!',
							blockPath, replica.get('node'), replica.get('se'))

	def getDatasets(self):
		if self._cache_dataset is None:
			self._cache_dataset = [self._datasetPath]
			if '*' in self._datasetPath:
				self._cache_dataset = list(self.getCMSDatasets(self._datasetPath))
				if not self._cache_dataset:
					raise DatasetError('No datasets selected by DBS wildcard %s !' % self._datasetPath)
		return self._cache_dataset

	def getCMSBlocks(self, datasetPath, getSites):
		iter_blockname_selist = self.getCMSBlocksImpl(datasetPath, getSites)
		n_blocks = 0
		selected_blocks = False
		for (blockname, selist) in iter_blockname_selist:
			n_blocks += 1
			if (self._datasetBlock != 'all') and (str.split(blockname, '#')[1] != self._datasetBlock):
				continue
			selected_blocks = True
			yield (blockname, selist)
		if (n_blocks > 0) and not selected_blocks:
			raise DatasetError('Dataset %r contains %d blocks, but none were selected by %r' % (datasetPath, n_blocks, self._datasetBlock))

	def fillCMSFiles(self, block, blockPath):
		lumi_used = False
		lumiDict = {}
		if self._lumi_query:  # central lumi query
			lumiDict = self.getCMSLumisImpl(blockPath)
		fileList = []
		for (fileInfo, listLumi) in self.getCMSFilesImpl(blockPath, self.onlyValid, self._lumi_query):
			if lumiDict and not listLumi:
				listLumi = lumiDict.get(fileInfo[DataProvider.URL], [])
			if listLumi:
				(listLumiExt_Run, listLumiExt_Lumi) = ([], [])
				for (run, lumi_list) in sorted(listLumi):
					listLumiExt_Run.extend([run] * len(lumi_list))
					listLumiExt_Lumi.extend(lumi_list)
				fileInfo[DataProvider.Metadata] = [listLumiExt_Run, listLumiExt_Lumi]
				lumi_used = True
			fileList.append(fileInfo)
		if lumi_used:
			block.setdefault(DataProvider.Metadata, []).extend(['Runs', 'Lumi'])
		block[DataProvider.FileList] = fileList

	def getCMSLumisImpl(self, blockPath):
		return None

	def getGCBlocks(self, usePhedex):
		for datasetPath in self.getDatasets():
			counter = 0
			for (blockPath, listSE) in self.getCMSBlocks(datasetPath, getSites=not usePhedex):
				result = {}
				result[DataProvider.Dataset] = blockPath.split('#')[0]
				result[DataProvider.BlockName] = blockPath.split('#')[1]
				if usePhedex:  # Start parallel phedex query
					dictSE = {}
					tPhedex = start_thread('Query phedex site info for %s' % blockPath, self._getPhedexSEList, blockPath, dictSE)
					self.fillCMSFiles(result, blockPath)
					tPhedex.join()
					listSE = dictSE.get(blockPath)
				else:
					self.fillCMSFiles(result, blockPath)
				result[DataProvider.Locations] = listSE
				if len(result[DataProvider.FileList]):
					counter += 1
					yield result
			if counter == 0:
				raise DatasetError('Dataset %s does not contain any valid blocks!' % datasetPath)
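# The run/lumi expansion inside fillCMSFiles above flattens a per-file list of
# (run, [lumi, ...]) pairs into two parallel metadata columns ('Runs', 'Lumi').
# A standalone sketch of that transformation; the function name is illustrative
# and not part of the class:
def expand_lumis(list_lumi):
	(runs, lumis) = ([], [])
	for (run, lumi_list) in sorted(list_lumi):
		runs.extend([run] * len(lumi_list))  # repeat run number once per lumi section
		lumis.extend(lumi_list)
	return (runs, lumis)

# e.g. expand_lumis([(2, [7]), (1, [4, 5])]) == ([1, 1, 2], [4, 5, 7])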
class CMSBaseProvider(DataProvider):
	def __init__(self, config, datasetExpr, datasetNick=None):
		self._changeTrigger = triggerResync(['datasets', 'parameters'])
		self._lumi_filter = config.getLookup('lumi filter', {},
			parser=parseLumiFilter, strfun=strLumi, onChange=self._changeTrigger)
		if not self._lumi_filter.empty():
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange=self._changeTrigger)
		config.set('phedex sites matcher mode', 'shell', '?=')
		# PhEDex blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
		self._phedexFilter = config.getFilter('phedex sites', '-* T1_*_Disk T2_* T3_*',
			defaultMatcher='blackwhite', defaultFilter='strict', onChange=self._changeTrigger)
		self._onlyComplete = config.getBool('only complete sites', True, onChange=self._changeTrigger)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname, onChange=self._changeTrigger)
		self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
		self._sitedb = SiteDB()
		(self._datasetPath, self._datasetInstance, self._datasetBlock) = optSplit(datasetExpr, '@#')
		instance_default = config.get('dbs instance', '', onChange=self._changeTrigger)
		self._datasetInstance = self._datasetInstance or instance_default
		if not self._datasetInstance:
			self._datasetInstance = 'prod/global'
		elif '/' not in self._datasetInstance:
			self._datasetInstance = 'prod/%s' % self._datasetInstance
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True, onChange=self._changeTrigger)

	# Define how often the dataprovider can be queried automatically
	def queryLimit(self):
		return 2 * 60 * 60  # 2 hour delay minimum

	# Check if splitterClass is valid
	def checkSplitter(self, splitterClass):
		if (DataSplitter.Skipped in splitterClass.neededEnums()) and not self._lumi_filter.empty():
			self._log.debug('Selected splitter %s is not compatible with active lumi filter!', splitterClass.__name__)
			self._log.warning('Active lumi section filter forced selection of HybridSplitter')
			return HybridSplitter
		return splitterClass

	def _replicaLocation(self, replica_info):
		(name_node, name_hostname, _) = replica_info
		if self._locationFormat == CMSLocationFormat.siteDB:
			yield name_node
		else:
			if name_hostname is not None:
				name_hostnames = [name_hostname]
			else:
				name_hostnames = self._sitedb.cms_name_to_se(name_node)
			for name_hostname in name_hostnames:
				if self._locationFormat == CMSLocationFormat.hostname:
					yield name_hostname
				else:
					yield '%s/%s' % (name_node, name_hostname)

	def _fmtLocations(self, replica_infos):
		for replica_info in replica_infos:
			(_, _, completed) = replica_info
			if completed:
				for entry in self._replicaLocation(replica_info):
					yield entry
			else:
				for entry in self._replicaLocation(replica_info):
					yield '(%s)' % entry

	def _processReplicas(self, blockPath, replica_infos):
		def empty_with_warning(*args):
			self._log.warning(*args)
			return []

		def expanded_replica_locations(replica_infos):
			for replica_info in replica_infos:
				for entry in self._replicaLocation(replica_info):
					yield entry

		if not replica_infos:
			return empty_with_warning('Dataset block %r has no replica information!', blockPath)
		replica_infos_selected = self._phedexFilter.filterList(replica_infos, key=itemgetter(0))
		if not replica_infos_selected:
			return empty_with_warning('Dataset block %r is not available at the selected locations!\nAvailable locations: %s',
				blockPath, str.join(', ', self._fmtLocations(replica_infos)))
		if not self._onlyComplete:
			return list(expanded_replica_locations(replica_infos_selected))
		replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2], replica_infos_selected)
		if not replica_infos_complete:
			return empty_with_warning('Dataset block %r is not completely available at the selected locations!\nAvailable locations: %s',
				blockPath, str.join(', ', self._fmtLocations(replica_infos)))
		return list(expanded_replica_locations(replica_infos_complete))

	# Get dataset se list from PhEDex (perhaps concurrent with listFiles)
	def _getPhedexReplicas(self, blockPath, dictReplicas):
		dictReplicas[blockPath] = []
		for phedexBlock in self._pjrc.get(params={'block': blockPath})['phedex']['block']:
			for replica in phedexBlock['replica']:
				dictReplicas[blockPath].append((replica['node'], replica.get('se'), replica['complete'] == 'y'))

	def getDatasets(self):
		if self._cache_dataset is None:
			self._cache_dataset = [self._datasetPath]
			if '*' in self._datasetPath:
				self._cache_dataset = list(self._getCMSDatasets(self._datasetPath))
				if not self._cache_dataset:
					raise DatasetError('No datasets selected by DBS wildcard %s !' % self._datasetPath)
		return self._cache_dataset

	def _getCMSBlocks(self, datasetPath, getSites):
		iter_blockname_selist = self._getCMSBlocksImpl(datasetPath, getSites)
		n_blocks = 0
		selected_blocks = False
		for (blockname, selist) in iter_blockname_selist:
			n_blocks += 1
			if (self._datasetBlock != 'all') and (str.split(blockname, '#')[1] != self._datasetBlock):
				continue
			selected_blocks = True
			yield (blockname, selist)
		if (n_blocks > 0) and not selected_blocks:
			raise DatasetError('Dataset %r contains %d blocks, but none were selected by %r' % (datasetPath, n_blocks, self._datasetBlock))

	def _fillCMSFiles(self, block, blockPath):
		lumi_used = False
		lumiDict = {}
		if self._lumi_query:  # central lumi query
			lumiDict = self._getCMSLumisImpl(blockPath)
		fileList = []
		for (fileInfo, listLumi) in self._getCMSFilesImpl(blockPath, self.onlyValid, self._lumi_query):
			if lumiDict and not listLumi:
				listLumi = lumiDict.get(fileInfo[DataProvider.URL], [])
			if listLumi:
				(listLumiExt_Run, listLumiExt_Lumi) = ([], [])
				for (run, lumi_list) in sorted(listLumi):
					listLumiExt_Run.extend([run] * len(lumi_list))
					listLumiExt_Lumi.extend(lumi_list)
				fileInfo[DataProvider.Metadata] = [listLumiExt_Run, listLumiExt_Lumi]
				lumi_used = True
			fileList.append(fileInfo)
		if lumi_used:
			block.setdefault(DataProvider.Metadata, []).extend(['Runs', 'Lumi'])
		block[DataProvider.FileList] = fileList

	def _getCMSLumisImpl(self, blockPath):
		return None

	def _getGCBlocks(self, usePhedex):
		for datasetPath in self.getDatasets():
			counter = 0
			for (blockPath, replica_infos) in self._getCMSBlocks(datasetPath, getSites=not usePhedex):
				result = {}
				result[DataProvider.Dataset] = blockPath.split('#')[0]
				result[DataProvider.BlockName] = blockPath.split('#')[1]
				if usePhedex:  # Start parallel phedex query
					dictReplicas = {}
					tPhedex = start_thread('Query phedex site info for %s' % blockPath, self._getPhedexReplicas, blockPath, dictReplicas)
					self._fillCMSFiles(result, blockPath)
					tPhedex.join()
					replica_infos = dictReplicas.get(blockPath)
				else:
					self._fillCMSFiles(result, blockPath)
				result[DataProvider.Locations] = self._processReplicas(blockPath, replica_infos)
				if len(result[DataProvider.FileList]):
					counter += 1
					yield result
			if counter == 0:
				raise DatasetError('Dataset %s does not contain any valid blocks!' % datasetPath)
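# The replica tuples handled by _processReplicas above have the shape
# (cms_node_name, hostname, complete). A minimal standalone rewrite of the
# _fmtLocations behaviour for the CMSLocationFormat.hostname case, to make the
# parenthesised marking of incomplete replicas concrete; the site and hostname
# values in the usage example are purely illustrative:
def fmt_locations_hostname(replica_infos):
	for (name_node, name_hostname, completed) in replica_infos:
		# complete replicas are listed plainly, incomplete ones in '(...)'
		yield name_hostname if completed else '(%s)' % name_hostname

# list(fmt_locations_hostname([('T2_AA_X', 'se.x.example', True),
# 	('T2_BB_Y', 'se.y.example', False)])) == ['se.x.example', '(se.y.example)']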