def getEntries(self, path, metadata, events, seList, objStore):
    """Yield one entry per staged-out file listed in the job information.

    Every item of ``objStore['JOBINFO']`` whose key starts with ``file`` is
    parsed as four space-separated fields (MD5 hash, local name,
    destination name, SE path) after stripping surrounding double quotes.
    For each file the ``metadata`` dict is updated in place with the
    ``SE_OUTPUT_*`` keys and the tuple
    ``(<SE path>/<destination name>, metadata, events, seList, objStore)``
    is yielded.

    Raises DatasetError when ``JOBINFO`` is missing from ``objStore`` or
    when reading the stageout information fails.  A KeyboardInterrupt
    terminates the process with exit code ``os.EX_TEMPFAIL``.
    """
    if 'JOBINFO' not in objStore:
        raise DatasetError(
            'Job information is not filled! Ensure that "JobInfoFromOutputDir" is scheduled!')

    def isFileItem(key_value):
        return key_value[0].startswith('file')

    def splitFileItem(key_value):
        return tuple(key_value[1].strip('"').split(' '))

    try:
        stageoutItems = ifilter(isFileItem, objStore['JOBINFO'].items())
        for (md5sum, localName, destName, sePath) in imap(splitFileItem, stageoutItems):
            stageoutInfo = {
                'SE_OUTPUT_HASH_MD5': md5sum,
                'SE_OUTPUT_FILE': localName,
                'SE_OUTPUT_BASE': os.path.splitext(localName)[0],
                'SE_OUTPUT_PATH': sePath,
            }
            # The same metadata dict is reused (and overwritten) for every file
            metadata.update(stageoutInfo)
            yield (os.path.join(sePath, destName), metadata, events, seList, objStore)
    except KeyboardInterrupt:
        sys.exit(os.EX_TEMPFAIL)
    except Exception:
        raise DatasetError('Unable to read file stageout information!')
def processBlock(self, block):
    """Apply the lumi filter to a block and prune its run/lumi metadata.

    The block is returned untouched when no lumi filter is configured and
    either run+lumi metadata should be kept or the block has no metadata.
    Otherwise the file list is replaced by the result of ``_processFI``,
    the entry count is recomputed from the remaining files and the
    'Runs' / 'Lumi' metadata columns are removed according to
    ``self._lumi_keep``.

    Returns the (modified) block, or None when no file survives.
    Raises DatasetError when an active strict/weak filter applies to a
    block that lacks the required lumi/run metadata.
    """
    noFilter = self._lumi_filter.empty()
    if noFilter and ((self._lumi_keep == LumiKeep.RunLumi) or (DataProvider.Metadata not in block)):
        return block

    def findMetadataIdx(key):
        # Position of the metadata column, None if the block does not have it
        names = block.get(DataProvider.Metadata, [])
        if key not in names:
            return None
        return names.index(key)

    idxRuns = findMetadataIdx('Runs')
    idxLumi = findMetadataIdx('Lumi')

    if not noFilter:
        lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector = False)
        if lumi_filter and (self._lumi_strict == LumiMode.strict) and ((idxRuns is None) or (idxLumi is None)):
            raise DatasetError('Strict lumi filter active but dataset %s does not provide lumi information!' % DataProvider.bName(block))
        elif lumi_filter and (self._lumi_strict == LumiMode.weak) and (idxRuns is None):
            raise DatasetError('Weak lumi filter active but dataset %s does not provide run information!' % DataProvider.bName(block))

    keptFiles = list(self._processFI(block, idxRuns, idxLumi))
    block[DataProvider.FileList] = keptFiles
    if not keptFiles:
        return None

    def getEntries(fi):
        return fi[DataProvider.NEntries]
    block[DataProvider.NEntries] = sum(imap(getEntries, keptFiles))

    # Prune metadata
    if not (self._lumi_keep == LumiKeep.RunLumi):
        if self._lumi_keep == LumiKeep.Run:
            idxRuns = None  # keep the run column, only the lumi column is removed
        removeRunLumi(block[DataProvider.Metadata], idxRuns, idxLumi)
    return block
def get_dataset_info(opts, args, query_blocks=True):
    """Set up the dataset provider and collect the dataset / block lists.

    ``opts`` and ``args`` are forwarded to ``get_dataset_config``.  When
    ``opts.threads`` is set, it is stored as the 'dataprovider thread max'
    setting (at least 1).

    ``query_blocks`` gates the block query: blocks are retrieved only when
    it is true AND one of the enabled options needs them (any ``list_*``
    option except ``list_dataset_names``, or ``save``).  Passing False
    skips the block query altogether.  (Previously the argument was
    overwritten before use and therefore silently ignored.)

    Returns ``(provider, dataset_list, block_list)``; ``block_list`` is
    None when no blocks were queried.  With ``opts.ordered`` the blocks
    and the files inside each block are sorted in place.

    Raises DatasetError when no dataset or (if queried) no block matched.
    """
    config = get_dataset_config(opts, args)
    if opts.threads is not None:
        config.set_int('dataprovider thread max', int(opts.threads) or 1)
    provider = config.get_composited_plugin(
        'dataset', cls=DataProvider,
        bind_kwargs={'provider_name_default': config.get('dataset provider')},
        default_compositor=':ThreadedMultiDatasetProvider:')
    dataset_list = sorted(provider.get_dataset_name_list())
    if len(dataset_list) == 0:
        raise DatasetError('No datasets matched!')

    # Query blocks only if the caller allows it and an enabled option needs them
    blocks_needed = False
    if query_blocks:
        for option in opts.__dict__:
            needs_blocks = (option == 'save') or (
                option.startswith('list_') and (option != 'list_dataset_names'))
            if needs_blocks and getattr(opts, option):
                blocks_needed = True
                break

    block_list = None
    if blocks_needed:
        block_list = provider.get_block_list_cached(show_stats=False)
        if len(block_list) == 0:
            raise DatasetError('No blocks matched!')
        if opts.ordered:
            sort_inplace(block_list, key=itemgetter(DataProvider.Dataset, DataProvider.BlockName))
            for block in block_list:
                sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
    return (provider, dataset_list, block_list)
def _iter_datasource_items(self, item, metadata_dict, entries, location_list, obj_dict):
    """Yield one datasource item per staged-out file listed in the job info.

    Every item of ``obj_dict['JOBINFO']`` whose key starts with ``file`` is
    parsed as four space-separated fields (hash, local name, destination
    name, SE path) after stripping surrounding double quotes.  For each
    file ``metadata_dict`` is updated in place with the ``SE_OUTPUT_*``
    keys and ``(<SE path>/<destination name>, metadata_dict, entries,
    location_list, obj_dict)`` is yielded.

    Raises DatasetError when ``JOBINFO`` is missing from ``obj_dict`` or
    when reading the stageout information fails.
    """
    if 'JOBINFO' not in obj_dict:
        raise DatasetError(
            'Job infos not available! Ensure that "JobInfoFromOutputDir" is selected!')

    def _is_file_item(key_value):
        return key_value[0].startswith('file')

    def _split_file_item(key_value):
        return tuple(key_value[1].strip('"').split(' '))

    try:
        stageout_item_iter = ifilter(_is_file_item, obj_dict['JOBINFO'].items())
        for (md5sum, local_name, dest_name, storage_path) in imap(_split_file_item, stageout_item_iter):
            stageout_info = {
                'SE_OUTPUT_HASH_MD5': md5sum,
                'SE_OUTPUT_FILE': local_name,
                'SE_OUTPUT_BASE': os.path.splitext(local_name)[0],
                'SE_OUTPUT_PATH': storage_path,
            }
            # The same metadata dict is reused (and overwritten) for every file
            metadata_dict.update(stageout_info)
            yield (os.path.join(storage_path, dest_name), metadata_dict,
                entries, location_list, obj_dict)
    except Exception:
        raise DatasetError('Unable to read file stageout information!')
def _check_lumi_filter(self, block, idx_runs, idx_lumi):
    """Verify that the block provides the metadata the lumi filter needs.

    Nothing is checked when the filter lookup for the block's nickname
    yields nothing.  In strict mode both the run and the lumi metadata
    index must be present; in weak mode only the run index is required.

    Raises DatasetError when the required metadata is missing.
    """
    selected_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector=False)
    if not selected_filter:
        return
    no_runs = idx_runs is None
    no_lumi = idx_lumi is None
    if (self._lumi_strict == LumiMode.strict) and (no_runs or no_lumi):
        raise DatasetError(
            'Strict lumi filter active but dataset %s does not provide lumi information!' %
            DataProvider.get_block_id(block))
    elif (self._lumi_strict == LumiMode.weak) and no_runs:
        raise DatasetError(
            'Weak lumi filter active but dataset %s does not provide run information!' %
            DataProvider.get_block_id(block))
def _iter_datasource_items(self, item, metadata_dict, entries, location_list, obj_dict):
    """Collect CMSSW file and config information from a job's log archive.

    If ``<item>/cmssw.dbs.tar.gz`` exists, its ``files`` member is parsed
    (three whitespace-separated columns: CRC32 hash, size, file name) and
    ``_process_steps`` is run on the archive.  The resulting maps are
    stored in ``obj_dict`` under ``CMSSW_FILES`` / ``CMSSW_CONFIG`` and the
    config hashes are appended to ``metadata_dict['CMSSW_CONFIG_JOBHASH']``.
    The input tuple is yielded exactly once, whether or not the archive
    exists.

    The archive is closed on every path, including when an error is
    raised while reading it.

    Raises DatasetError when the ``files`` member cannot be read; errors
    from ``_process_steps`` propagate unchanged.
    """
    jobnum = metadata_dict['GC_JOBNUM']
    cms_log_fn = os.path.join(item, 'cmssw.dbs.tar.gz')
    if os.path.exists(cms_log_fn):
        tar = tarfile.open(cms_log_fn, 'r')
        try:
            # Collect infos about transferred files
            file_summary_map = {}
            try:
                file_info_str_list = tar.extractfile('files').readlines()
                for rawdata in imap(lambda value: bytes2str(value).split(), file_info_str_list):
                    file_summary_map[rawdata[2]] = {
                        'SE_OUTPUT_HASH_CRC32': rawdata[0],
                        'SE_OUTPUT_SIZE': int(rawdata[1])
                    }
                obj_dict['CMSSW_FILES'] = file_summary_map
            except Exception:
                raise DatasetError('Could not read CMSSW file infos for job %d!' % jobnum)
            # Collect infos about CMSSW processing steps
            config_summary_map = {}
            self._process_steps(jobnum, tar, config_summary_map, file_summary_map)
            for cfg in config_summary_map:
                job_hash_list = metadata_dict.setdefault('CMSSW_CONFIG_JOBHASH', [])
                job_hash_list.append(config_summary_map[cfg]['CMSSW_CONFIG_HASH'])
            obj_dict.update({'CMSSW_CONFIG': config_summary_map, 'CMSSW_FILES': file_summary_map})
        finally:
            # Release the archive even if parsing failed (the yield stays outside this block)
            tar.close()
    yield (item, metadata_dict, entries, location_list, obj_dict)
def create_dbs3_json_blocks(opts, dataset_blocks):
    """Yield validated DBS3 'blockBulk' dumps for the given dataset blocks.

    Each proto block from ``create_dbs3_proto_blocks`` is completed with
    the primary dataset, dataset, block, acquisition era and processing
    era sections and then passed through ``validate_dbs3_json``.

    Raises DatasetError when a dataset name does not split into the three
    parts ``/primary/processed/tier``.
    """
    for proto_block in create_dbs3_proto_blocks(opts, dataset_blocks):
        (block, block_dump, block_size, dataset_type) = proto_block
        dataset = block[DataProvider.Dataset]
        try:
            name_parts = dataset[1:].split('/')
            (primary_dataset, processed_dataset, data_tier) = name_parts
        except Exception:
            raise DatasetError('Dataset name %s is not a valid DBS name!' % dataset)

        # add primary dataset information
        block_dump['primds'] = {
            'primary_ds_type': dataset_type,
            'primary_ds_name': primary_dataset,
        }

        # add dataset information
        block_dump['dataset'] = {
            'dataset': dataset,
            'processed_ds_name': processed_dataset,
            'data_tier_name': data_tier,
            'physics_group_name': None,
            'dataset_access_type': 'VALID',
            'xtcrosssection': None,  # TODO: Add to metadata from FrameWorkJobReport, if possible!
        }

        # add block information
        site_db = CRIC()
        try:
            first_location = block[DataProvider.Locations][0]
            origin_site_name = site_db.se_to_cms_name(first_location)[0]
        except IndexError:
            # no location / no matching site name available
            clear_current_exception()
            origin_site_name = 'UNKNOWN'

        block_info = {
            'block_name': DataProvider.get_block_id(block),
            'block_size': block_size,
            'file_count': len(block[DataProvider.FileList]),
            'origin_site_name': origin_site_name,
        }
        if opts.do_close_blocks:
            block_info['open_for_writing'] = 0
        else:
            block_info['open_for_writing'] = 1
        block_dump['block'] = block_info

        # add acquisition_era, CRAB is important because of checks within DBS 3
        block_dump['acquisition_era'] = {'acquisition_era_name': 'CRAB', 'start_date': 0}
        # add processing_era
        block_dump['processing_era'] = {'processing_version': 1, 'description': 'grid-control'}

        yield validate_dbs3_json('blockBulk', block_dump)
def getEntries(self, path, metadata, events, seList, objStore):
    """Collect CMSSW file and config information from a job's log archive.

    Opens ``<path>/cmssw.dbs.tar.gz``, parses its ``files`` member (three
    whitespace-separated columns: CRC32 hash, size, file name) and runs
    ``_processSteps`` on the archive.  The resulting maps are stored in
    ``objStore`` under ``CMSSW_FILES`` / ``CMSSW_CONFIG`` and the config
    hashes are appended to ``metadata['CMSSW_CONFIG_JOBHASH']``.  The input
    tuple is then yielded exactly once.

    The archive is closed on every path, including errors.  Lines read
    from the archive are decoded when they are not already ``str`` (the
    archive hands out bytes on Python 3).

    Raises DatasetError when the ``files`` member cannot be read; errors
    from opening the archive or from ``_processSteps`` propagate unchanged.
    """
    jobNum = metadata['GC_JOBNUM']
    tar = tarfile.open(os.path.join(path, 'cmssw.dbs.tar.gz'), 'r')
    try:
        # Collect infos about transferred files
        fileSummaryMap = {}
        try:
            for line in tar.extractfile('files').readlines():
                if not isinstance(line, str):
                    line = line.decode()
                rawdata = line.split()
                fileSummaryMap[rawdata[2]] = {
                    'SE_OUTPUT_HASH_CRC32': rawdata[0],
                    'SE_OUTPUT_SIZE': int(rawdata[1])
                }
            objStore['CMSSW_FILES'] = fileSummaryMap
        except Exception:
            raise DatasetError('Could not read CMSSW file infos for job %d!' % jobNum)
        # Collect infos about CMSSW processing steps
        cfgSummaryMap = {}
        self._processSteps(jobNum, tar, cfgSummaryMap, fileSummaryMap)
        for cfg in cfgSummaryMap:
            metadata.setdefault('CMSSW_CONFIG_JOBHASH', []).append(
                cfgSummaryMap[cfg]['CMSSW_CONFIG_HASH'])
        objStore.update({'CMSSW_CONFIG': cfgSummaryMap, 'CMSSW_FILES': fileSummaryMap})
    finally:
        # Release the archive even if parsing failed (the yield stays outside this block)
        tar.close()
    yield (path, metadata, events, seList, objStore)
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None):  # pylint:disable=super-init-not-called
    """Always fail: this provider can no longer be used.

    Raises DatasetError unconditionally, pointing the user to DBS3Provider.
    """
    deprecation_msg = 'CMS deprecated all DBS2 Services in April 2014! Please use DBS3Provider instead.'
    raise DatasetError(deprecation_msg)
def main(opts, args):
    """Load the dataset blocks and run every listing / save action selected in opts.

    The dataset name is printed up front when there is a single dataset
    and ``opts.info`` is not set; otherwise a 'Dataset' column is handed
    to the block and storage listings.  Metadata listings are skipped
    when ``opts.save`` is set.

    Raises DatasetError when the provider returns no blocks.
    """
    provider = get_dataset_config(opts, args).getPlugin('dataset', cls = DataProvider)
    blocks = provider.getBlocks()
    if len(blocks) == 0:
        raise DatasetError('No blocks!')

    datasets = set(imap(itemgetter(DataProvider.Dataset), blocks))
    headerbase = []
    if (len(datasets) > 1) or opts.info:
        headerbase.append((DataProvider.Dataset, 'Dataset'))
    else:
        print('Dataset: %s' % blocks[0][DataProvider.Dataset])

    if opts.list_datasets:
        list_datasets(blocks)
    if opts.list_blocks:
        list_blocks(blocks, headerbase)
    if opts.list_files:
        list_files(datasets, blocks)
    if opts.list_storage:
        list_storage(blocks, headerbase)
    # metadata listings are suppressed when the dataset is being saved
    if opts.metadata and not opts.save:
        list_metadata(datasets, blocks)
    if opts.block_metadata and not opts.save:
        list_block_metadata(datasets, blocks)
    if opts.config_entry:
        list_config_entries(opts, blocks, provider)
    if opts.info:
        list_infos(blocks)
    if opts.save:
        save_dataset(opts, provider)
def getGCBlocks(self, usePhedex):
    """Yield a block dictionary for every non-empty block of every dataset.

    The block path is split at '#' into dataset and block name, the file
    list is filled by ``fillCMSFiles`` and the locations come either from
    the block query itself or - when ``usePhedex`` is set - from a phedex
    query that runs in a thread while the file list is being filled.
    Blocks with an empty file list are dropped.

    Raises DatasetError when a dataset yields no block with files.
    """
    for datasetPath in self.getDatasets():
        validBlocks = 0
        for (blockPath, listSE) in self.getCMSBlocks(datasetPath, getSites=not usePhedex):
            nameParts = blockPath.split('#')
            result = {}
            result[DataProvider.Dataset] = nameParts[0]
            result[DataProvider.BlockName] = nameParts[1]

            if not usePhedex:
                self.fillCMSFiles(result, blockPath)
            else:
                # Start parallel phedex query
                dictSE = {}
                phedexThread = start_thread('Query phedex site info for %s' % blockPath,
                    self._getPhedexSEList, blockPath, dictSE)
                self.fillCMSFiles(result, blockPath)
                phedexThread.join()
                listSE = dictSE.get(blockPath)
            result[DataProvider.Locations] = listSE

            if len(result[DataProvider.FileList]):
                validBlocks += 1
                yield result

        if validBlocks == 0:
            raise DatasetError('Dataset %s does not contain any valid blocks!' % datasetPath)
def _process(self, key, setup, path, metadata): if setup is not None: (delim, ds, de, mod) = setup value = str.join(delim, os.path.basename(path).split(delim)[ds:de]) try: metadata[key] = str(mod(value)) except Exception: raise DatasetError('Unable to modifiy %s: %r' % (key, value))
def _process(self, item, metadata_dict, key, delim, delim_start, delim_end, modifier_fun): value = str.join( delim, os.path.basename(item).split(delim)[delim_start:delim_end]) try: metadata_dict[key] = str(modifier_fun(value)) except Exception: raise DatasetError('Unable to modifiy %s: %r' % (key, value))
def __init__(self, config, datasource_name):
    """Set up the scanner for the output directory of an external task.

    Reads the 'source directory' and 'source job selector' options; the
    directory must contain an ``output`` sub-directory.

    Raises DatasetError when the task output directory does not exist.
    """
    InfoScanner.__init__(self, config, datasource_name)
    work_dn = config.get_dn('source directory')
    self._ext_work_dn = work_dn
    output_dn = os.path.join(work_dn, 'output')
    self._ext_output_dir = output_dn
    if not os.path.isdir(output_dn):
        raise DatasetError('Unable to find task output directory %s' % repr(output_dn))
    selector_expr = config.get('source job selector', '')
    self._selector = JobSelector.create(selector_expr)
def __init__(self, config):
    """Set up the scanner for the output directory of an external task.

    Reads the 'source directory' and 'source job selector' options (both
    registered with ``triggerDataResync`` as change handler); the
    directory must contain an ``output`` sub-directory.

    Raises DatasetError when the task output directory does not exist.
    """
    InfoScanner.__init__(self, config)
    workDir = config.getPath('source directory', onChange=triggerDataResync)
    self._extWorkDir = workDir
    outputDir = os.path.join(workDir, 'output')
    self._extOutputDir = outputDir
    if not os.path.isdir(outputDir):
        raise DatasetError('Unable to find task output directory %s' % repr(outputDir))
    selectorExpr = config.get('source job selector', '', onChange=triggerDataResync)
    self._selector = JobSelector.create(selectorExpr)
def getDatasets(self):
    """Return the (cached) list of dataset paths selected by this provider.

    A dataset path without '*' is returned as a single-element list.  A
    path containing '*' is expanded through ``getCMSDatasets``.

    The cache is only filled after the lookup succeeded and produced at
    least one dataset.  Previously the cache was assigned up front, so a
    failed or empty wildcard lookup left an unexpanded / empty list behind
    that later calls returned without any error.

    Raises DatasetError when the wildcard does not match any dataset.
    """
    if self._cache_dataset is None:
        if '*' in self._datasetPath:
            datasets = list(self.getCMSDatasets(self._datasetPath))
            if not datasets:
                raise DatasetError(
                    'No datasets selected by DBS wildcard %s !' % self._datasetPath)
        else:
            datasets = [self._datasetPath]
        # Publish only a validated result
        self._cache_dataset = datasets
    return self._cache_dataset
def __init__(self, config):
    """Read the 'source directory' and 'source recurse' options.

    Both options are registered with ``triggerDataResync`` as change
    handler.  A path without '://' is normalized with
    ``utils.cleanPath``; a path containing '://' is kept as given.

    Raises DatasetError when recursion is requested for a path that
    contains '://'.
    """
    InfoScanner.__init__(self, config)
    self._path = config.get('source directory', '.', onChange=triggerDataResync)
    self._recurse = config.getBool('source recurse', False, onChange=triggerDataResync)
    isURL = '://' in self._path
    if not isURL:
        self._path = utils.cleanPath(self._path)
    elif self._recurse:
        raise DatasetError('Recursion is not supported for URL: %s' % repr(self._path))
def get_dataset_name_list(self):
    """Return the (cached) list of dataset paths selected by this provider.

    A dataset path without '*' is returned as a single-element list.  A
    path containing '*' is expanded through ``_get_cms_dataset_list``
    while an ``Activity`` is shown.

    The activity is finished on every path (previously it stayed open
    when the lookup failed or matched nothing) and the cache is only
    filled with a validated, non-empty result, so a failed lookup is
    retried instead of returning a stale / empty list.

    Raises DatasetError when the wildcard does not match any dataset.
    """
    if self._cache_dataset is None:
        if '*' in self._dataset_path:
            activity = Activity('Getting dataset list for %s' % self._dataset_path)
            try:
                dataset_list = list(self._get_cms_dataset_list(self._dataset_path))
            finally:
                activity.finish()
            if not dataset_list:
                raise DatasetError(
                    'No datasets selected by DBS wildcard %s !' % self._dataset_path)
        else:
            dataset_list = [self._dataset_path]
        # Publish only a validated result
        self._cache_dataset = dataset_list
    return self._cache_dataset
def _init_reader(self):
    """Return the partition reader, creating the partition map if needed.

    Left-over ``*.resync`` files from an aborted resync are moved back in
    place when both exist.  If ``map.tar`` is missing, the block list
    (from the provider, or from an existing ``cache.dat``) is written to
    ``cache.dat.init``, split into partitions saved as ``map.tar.init``
    and both files are then renamed to their final names.

    Raises DatasetError when only one of the resync files exists or when
    ``map.tar`` exists without ``cache.dat``.
    """
    # look for aborted inits / resyncs - and try to restore old state if possible
    found_map_resync = self._exists_data_path('map.tar.resync')
    found_cache_resync = self._exists_data_path('cache.dat.resync')
    if found_map_resync and found_cache_resync:
        rename_file(self._get_data_path('cache.dat.resync'), self._get_data_path('cache.dat'))
        rename_file(self._get_data_path('map.tar.resync'), self._get_data_path('map.tar'))
    elif found_map_resync or found_cache_resync:
        raise DatasetError('Found broken dataset partition resync state in work directory')

    # the final files are checked only after a possible restore
    found_map = self._exists_data_path('map.tar')
    if found_map and not self._exists_data_path('cache.dat'):
        raise DatasetError('Found broken dataset partition in work directory')

    if not found_map:
        # create initial partition map file
        if self._exists_data_path('cache.dat'):
            provider = DataProvider.load_from_file(self._get_data_path('cache.dat'))
        else:
            provider = self._provider
        block_iter = DataProvider.save_to_file_iter(
            self._get_data_path('cache.dat.init'),
            provider.get_block_list_cached(show_stats=True))
        partition_iter = self._splitter.split_partitions(block_iter)
        DataSplitter.save_partitions(self._get_data_path('map.tar.init'), partition_iter)
        for data_fn in ('cache.dat', 'map.tar'):
            rename_file(self._get_data_path(data_fn + '.init'), self._get_data_path(data_fn))

    return DataSplitter.load_partitions(self._get_data_path('map.tar'))
def _process_steps(self, jobnum, tar, config_summary_map, file_summary_map):
    """Fill the config and file summary maps from the members of ``tar``.

    The ``version`` member provides the CMSSW version.  Every other
    top-level member (no '/' in the name, and not ``version`` / ``files``)
    is handed to ``_process_config``; its summary is stored in
    ``config_summary_map`` and each 'File' element of its report is
    merged into ``file_summary_map`` under the file's pfn.

    Raises DatasetError when a config member cannot be processed.
    """
    version_str = bytes2str(tar.extractfile('version').read()).strip()

    def _is_config_member(name):
        return ('/' not in name) and (name not in ['version', 'files'])

    for cfg in ifilter(_is_config_member, tar.getnames()):
        try:
            (summary, report, events_read) = self._process_config(tar, cfg)
            summary['CMSSW_VERSION'] = version_str
            config_summary_map[cfg] = summary
        except Exception:
            raise DatasetError('Could not read config infos about %s in job %d' % (cfg, jobnum))

        for file_node in report.getElementsByTagName('File'):
            (file_summary, pfn) = self._process_output_file(report, file_node)
            file_summary['CMSSW_EVENTS_READ'] = events_read
            file_summary['CMSSW_CONFIG_FILE'] = cfg
            file_summary_map.setdefault(pfn, {}).update(file_summary)
def getCMSBlocks(self, datasetPath, getSites):
    """Yield the ``(blockname, selist)`` pairs selected by the block setting.

    All blocks from ``getCMSBlocksImpl`` pass when ``self._datasetBlock``
    is 'all'; otherwise only blocks whose name part after '#' equals it.

    Raises DatasetError after the iteration when the dataset has blocks
    but none of them was selected.
    """
    blocksSeen = 0
    anySelected = False
    for blockInfo in self.getCMSBlocksImpl(datasetPath, getSites):
        (blockname, selist) = blockInfo
        blocksSeen += 1
        if self._datasetBlock != 'all':
            if str.split(blockname, '#')[1] != self._datasetBlock:
                continue
        anySelected = True
        yield (blockname, selist)

    if anySelected or (blocksSeen == 0):
        return
    raise DatasetError('Dataset %r contains %d blocks, but none were selected by %r' % (
        datasetPath, blocksSeen, self._datasetBlock))
def _get_gc_block_list(self, use_phedex):
    """Yield a block dictionary for every non-empty block of every dataset.

    For each dataset the selected block infos are collected first, then
    each block path is split at '#' into dataset and block name and its
    file list is filled by ``_fill_cms_fi_list``.  When ``use_phedex``
    and ``self._allow_phedex`` are both set, the replica information is
    taken from a phedex query that runs in a thread while the file list
    is being filled.  Blocks with an empty file list are dropped.
    Progress is reported per dataset and per block.

    Raises DatasetError when a dataset yields no block with files.
    """
    dataset_name_list = self.get_dataset_name_list()
    progress_ds = ProgressActivity('Getting dataset', len(dataset_name_list))
    for (dataset_idx, dataset_path) in enumerate(dataset_name_list):
        progress_ds.update_progress(dataset_idx, msg='Getting dataset %s' % dataset_path)
        valid_blocks = 0
        blockinfo_list = list(self._filter_cms_blockinfo_list(dataset_path, not use_phedex))
        progress_block = ProgressActivity('Getting block information', len(blockinfo_list))

        for (block_path, replica_infos) in blockinfo_list:
            name_parts = block_path.split('#')
            result = {}
            result[DataProvider.Dataset] = name_parts[0]
            result[DataProvider.BlockName] = name_parts[1]
            # progress counts the blocks that were yielded so far
            progress_block.update_progress(valid_blocks,
                msg='Getting block information for ' + result[DataProvider.BlockName])

            if not (use_phedex and self._allow_phedex):
                self._fill_cms_fi_list(result, block_path)
            else:
                # Start parallel phedex query
                replicas_dict = {}
                phedex_thread = start_thread('Query phedex site info for %s' % block_path,
                    self._get_phedex_replica_list, block_path, replicas_dict)
                self._fill_cms_fi_list(result, block_path)
                phedex_thread.join()
                replica_infos = replicas_dict.get(block_path)
            result[DataProvider.Locations] = self._process_replica_list(block_path, replica_infos)

            if len(result[DataProvider.FileList]):
                valid_blocks += 1
                yield result

        progress_block.finish()
        if valid_blocks == 0:
            raise DatasetError('Dataset %s does not contain any valid blocks!' % dataset_path)
    progress_ds.finish()
def _filter_cms_blockinfo_list(self, dataset_path, do_query_sites): iter_dataset_block_name_selist = self._iter_cms_blocks( dataset_path, do_query_sites) n_blocks = 0 selected_blocks = False for (dataset_block_name, selist) in iter_dataset_block_name_selist: n_blocks += 1 block_name = str.split(dataset_block_name, '#')[1] if (self._dataset_block_selector != 'all') and (block_name != self._dataset_block_selector): continue selected_blocks = True yield (dataset_block_name, selist) if (n_blocks > 0) and not selected_blocks: raise DatasetError( 'Dataset %r contains %d blocks, but none were selected by %r' % (dataset_path, n_blocks, self._dataset_block_selector))
def _resync_partitions(self, path, block_list_old, block_list_new):
    """Resync the partitions against the new block list and store them at ``path``.

    The resynced partitions are first written to ``<path>.tmp`` and then
    renamed to ``path``.

    Returns the tuple ``(pnum_list_redo, pnum_list_disable)`` of the
    resync result.
    Raises DatasetError when the resync or the writing of the partitions
    fails.
    """
    resync_handler = self._splitter.get_resync_handler()
    progress = ProgressActivity(progress_max=self.get_parameter_len(),
        msg='Writing resyncronized dataset partitions (progress is estimated)')
    path_tmp = path + '.tmp'
    try:
        result = resync_handler.resync(self._splitter, self._reader,
            block_list_old, block_list_new)
        DataSplitter.save_partitions(path_tmp, result.partition_iter, progress)
    except Exception:
        raise DatasetError('Unable to resync %r' % self.get_datasource_name())
    # replace the old partition file only after writing succeeded
    os.rename(path_tmp, path)
    return (result.pnum_list_redo, result.pnum_list_disable)
def _processSteps(self, jobNum, tar, cfgSummaryMap, fileSummaryMap):
    """Fill the config and file summary maps from the members of ``tar``.

    The ``version`` member provides the CMSSW version.  Every other
    top-level member (no '/' in the name, and not ``version`` / ``files``)
    is handed to ``_processCfg``; its summary is stored in
    ``cfgSummaryMap`` and each 'File' element of its report is merged
    into ``fileSummaryMap`` under the file's pfn.

    Raises DatasetError when a config member cannot be processed.
    """
    versionStr = tar.extractfile('version').read().strip()

    def isConfigMember(name):
        return ('/' not in name) and (name not in ['version', 'files'])

    for cfg in ifilter(isConfigMember, tar.getnames()):
        try:
            (summary, report, eventsRead) = self._processCfg(tar, cfg)
            summary['CMSSW_VERSION'] = versionStr
            cfgSummaryMap[cfg] = summary
        except Exception:
            raise DatasetError('Could not read config infos about %s in job %d' % (cfg, jobNum))

        for fileNode in report.getElementsByTagName('File'):
            (fileSummary, pfn) = self._processOutputFile(report, fileNode)
            fileSummary['CMSSW_EVENTS_READ'] = eventsRead
            fileSummary['CMSSW_CONFIG_FILE'] = cfg
            fileSummaryMap.setdefault(pfn, {}).update(fileSummary)
def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, repository, keepOld=True):
    """Register the data source and load or create its partition files.

    The instance is stored in ``repository`` under ``dataset:<srcName>``.
    Without a data provider nothing is read from or written to disk.
    Otherwise left-over ``*.resync`` files are moved back in place when
    both exist, and the partitions are imported from ``map.tar`` if it
    exists together with ``cache.dat`` - or both files are created from
    the provider's blocks.

    Raises DatasetError when only one of the resync files exists.
    """
    LimitedResyncParameterSource.__init__(self)
    self._dn = dataDir
    self._name = srcName
    self._data_provider = dataProvider
    self._data_splitter = dataSplitter
    self._part_proc = dataProc
    self._keepOld = keepOld
    repository['dataset:%s' % srcName] = self
    self.resyncSetup(interval=-1)

    # no provider: debug mode - used by scripts - disables resync
    if dataProvider:
        # look for aborted resyncs - and try to restore old state if possible
        foundCacheResync = self._existsDataPath('cache.dat.resync')
        foundMapResync = self._existsDataPath('map.tar.resync')
        if foundCacheResync and foundMapResync:
            utils.renameFile(self._getDataPath('cache.dat.resync'), self._getDataPath('cache.dat'))
            utils.renameFile(self._getDataPath('map.tar.resync'), self._getDataPath('map.tar'))
        elif foundCacheResync or foundMapResync:
            raise DatasetError('Found broken resync state')

        if not (self._existsDataPath('cache.dat') and self._existsDataPath('map.tar')):
            DataProvider.saveToFile(self._getDataPath('cache.dat'),
                self._data_provider.getBlocks(show_stats=False))
            self._data_splitter.splitDataset(self._getDataPath('map.tar'),
                self._data_provider.getBlocks(show_stats=False))
        else:
            self._data_splitter.importPartitions(self._getDataPath('map.tar'))

    self._maxN = self._data_splitter.getMaxJobs()
def processBlock(self, block):
    """Apply the lumi filter to a block and prune its run/lumi metadata.

    The block is returned untouched when no lumi filter is configured and
    either run+lumi metadata should be kept or the block has no metadata.
    Otherwise the file list is replaced by the result of ``_processFI``,
    the entry count is recomputed from the remaining files and the
    'Runs' / 'Lumi' metadata columns are removed according to
    ``self._lumi_keep``.

    Returns the (modified) block, or None when no file survives.
    Raises DatasetError when a strict lumi filter applies to a block
    without run or lumi metadata.
    """
    if self._lumi_filter.empty() and (
            (self._lumi_keep == LumiKeep.RunLumi) or (DataProvider.Metadata not in block)):
        return block

    def findMetadataIdx(key):
        # Position of the metadata column, None if the block does not have it
        names = block[DataProvider.Metadata]
        if key not in names:
            return None
        return names.index(key)

    idxRuns = findMetadataIdx('Runs')
    idxLumi = findMetadataIdx('Lumi')

    if not self._lumi_filter.empty():
        lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector=False)
        infoMissing = (idxRuns is None) or (idxLumi is None)
        if lumi_filter and infoMissing and self._lumi_strict:
            # the block name '0' is not appended to the dataset name
            fqName = block[DataProvider.Dataset]
            if block[DataProvider.BlockName] != '0':
                fqName += '#' + block[DataProvider.BlockName]
            raise DatasetError(
                'Strict lumi filter active but dataset %s does not provide lumi information!' % fqName)

    keptFiles = list(self._processFI(block, idxRuns, idxLumi))
    block[DataProvider.FileList] = keptFiles
    if not keptFiles:
        return None

    def getEntries(fi):
        return fi[DataProvider.NEntries]
    block[DataProvider.NEntries] = sum(imap(getEntries, keptFiles))

    if self._lumi_keep == LumiKeep.RunLumi:
        return block
    if self._lumi_keep == LumiKeep.Run:
        # keep the run column, drop only the lumi column
        if idxLumi is not None:
            block[DataProvider.Metadata].pop(idxLumi)
        return block
    removeRunLumi(block[DataProvider.Metadata], idxRuns, idxLumi)
    return block
def __init__(self, config, datasetExpr, datasetNick, datasetID=0):
    """Always fail: this provider can no longer be used.

    Raises DatasetError unconditionally, pointing the user to DBS3Provider.
    """
    deprecationMsg = 'CMS deprecated all DBS2 Services in April 2014! Please use DBS3Provider instead.'
    raise DatasetError(deprecationMsg)