def main():
    # args = sys.argv[1:]
    # data = args[0]
    sample_group = 'signal'  # signal, background, data, all
    sample_list = get_sample_list(sample_group)
    sample_list.sort()

    url = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    api = DbsApi(url=url)

    for samp in sample_list:
        outputDataSets = ''
        # print('Checking {0}'.format(samp.DAS))
        outputDataSets = api.listDatasets(dataset=samp.DAS, detail=True,
                                          dataset_access_type='VALID')
        if outputDataSets:
            for ds in outputDataSets:
                # print('{0}'.format(ds['dataset']))
                # print('{0}'.format(ds['primary_ds_name']))
                # print('{0}'.format(ds['xtcrosssection']))
                nevents = api.listBlockSummaries(dataset=ds['dataset'])
                # print(nevents[0]['num_event'])
                # create a table row for the paper with the dataset name and number of events
                print('verb@ {0} @ & {1:.2e} & XX \\\\ '.format(ds['primary_ds_name'],
                                                                nevents[0]['num_event']))
    sys.exit(0)
def getFilenames(txtFile):
    # dasgoclient python API
    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    global_director = "root://cmsxrootd.fnal.gov/"

    # Read out input files containing data set names
    with open(txtFile) as f:
        datasets = [dataset for dataset in f.read().splitlines() if dataset != ""]

    # Fill file names in using dasgoclient API
    filelist = {}
    for setname in datasets:
        if "mc" in setname:
            filelist[setname.split("/")[1]] = [
                global_director + filename['logical_file_name']
                for filename in dbs.listFiles(dataset=setname, detail=1)
            ]
        elif "user" in setname:
            filelist[setname.split("/")[5]] = [global_director + setname]
        else:
            filelist[setname.split("/")[1] + "-" + setname.split("/")[2]] = [
                global_director + filename['logical_file_name']
                for filename in dbs.listFiles(dataset=setname, detail=1)
            ]

    # print(filelist)
    return filelist
def getDatasetSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the block summaries aggregated over the whole dataset
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['file_size']
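A minimal usage sketch for getDatasetSize. It assumes dbs3_url is defined elsewhere in the module (pointing at a DBS reader instance) and that a valid grid proxy is available; the dataset name is only a placeholder.

if __name__ == '__main__':
    dataset = '/MyPrimary/MyProcessed-v1/MINIAODSIM'  # placeholder dataset name
    sizeBytes = getDatasetSize(dataset)
    # file_size is reported in bytes by DBS
    print('%s is %.2f TB according to DBS' % (dataset, sizeBytes / 1e12))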
def dbs3_get_data(dataset, timestamps=1):
    #q = "/afs/cern.ch/user/s/spinoso/public/dbs3wrapper.sh /afs/cern.ch/user/c/cmst2/mc/scripts/datasetinfo.py --dataset %s --json" % dataset
    #output = os.popen(q).read()
    #s = json.loads(output)
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    try:
        reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
        #print(reply)
        if len(reply):
            status = reply[0]['dataset_access_type']
            reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
            cnt = 0
            for block in reply:
                cnt += int(block['num_event'])
            return [cnt, status, int(cnt / 100.)]
        else:
            print(dataset, "not existing")
            return [0, '', 0]
    except Exception:
        print("crash dbs3")
        return [0, '', 0]
def getLumiListInValidFiles(dataset, dbsurl='phys03'):
    """
    Get the runs/lumis in the valid files of a given dataset.

    dataset: the dataset name as published in DBS
    dbsurl: the DBS URL or DBS prod instance

    Returns a LumiList object.
    """
    dbsurl = DBSURLS['reader'].get(dbsurl, dbsurl)
    dbs3api = DbsApi(url=dbsurl)
    try:
        files = dbs3api.listFileArray(dataset=dataset, validFileOnly=0, detail=True)
    except Exception as ex:
        msg = "Got DBS client error requesting details of dataset '%s' on DBS URL '%s': %s" % (dataset, dbsurl, ex)
        msg += "\n%s" % (traceback.format_exc())
        raise ClientException(msg)
    if not files:
        msg = "Dataset '%s' not found in DBS URL '%s'." % (dataset, dbsurl)
        raise ClientException(msg)
    validFiles = [f['logical_file_name'] for f in files if f['is_file_valid']]
    blocks = set([f['block_name'] for f in files])
    runLumiPairs = []
    for blockName in blocks:
        fileLumis = dbs3api.listFileLumis(block_name=blockName)
        for f in fileLumis:
            if f['logical_file_name'] in validFiles:
                run = f['run_num']
                lumis = f['lumi_section_num']
                for lumi in lumis:
                    runLumiPairs.append((run, lumi))
    lumiList = LumiList(lumis=runLumiPairs)
    return lumiList
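A possible way to use getLumiListInValidFiles to dump a lumi mask, assuming the surrounding CRAB client environment (DBSURLS, ClientException, LumiList) is available as above; the published dataset name is purely hypothetical.

import json

if __name__ == '__main__':
    dataset = '/MyPrimary/myuser-MyPublishedName-0123456789abcdef/USER'  # hypothetical
    lumiList = getLumiListInValidFiles(dataset, dbsurl='phys03')
    # getCompactList() returns the usual {run: [[first, last], ...]} mapping
    with open('lumimask.json', 'w') as fd:
        json.dump(lumiList.getCompactList(), fd)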
def getDatasetFileLumis(dataset):
    url = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    api = DbsApi(url=url)
    dRunLumis = {}
    files = api.listFiles(dataset=dataset)
    files = [f.get('logical_file_name', '') for f in files]
    # chunk into size less than 1000 or else DBS complains
    fileChunks = getChunks(files)
    for fileChunk in fileChunks:
        info = api.listFileLumiArray(logical_file_name=fileChunk)
        for f in info:
            fname = f['logical_file_name']
            # don't reset the entry if the file already appeared for another run
            if fname not in dRunLumis:
                dRunLumis[fname] = {}
            run, lumis = str(f['run_num']), f['lumi_section_num']
            if run not in dRunLumis[fname]:
                dRunLumis[fname][run] = []
            dRunLumis[fname][run].extend(lumis)
    for fname in dRunLumis.keys():
        for run in dRunLumis[fname].keys():
            dRunLumis[fname][run] = listToRanges(dRunLumis[fname][run])
    return dRunLumis
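getChunks and listToRanges are helpers that are assumed but not shown above. A minimal sketch of what they might look like, given how they are used (chunking below the DBS argument limit, and collapsing lumi lists into ranges):

def getChunks(items, chunkSize=500):
    """Split a list into chunks small enough for a single DBS call."""
    return [items[i:i + chunkSize] for i in range(0, len(items), chunkSize)]

def listToRanges(lumis):
    """Collapse a list of lumi numbers into sorted [first, last] ranges."""
    ranges = []
    for lumi in sorted(set(lumis)):
        if ranges and lumi == ranges[-1][1] + 1:
            ranges[-1][1] = lumi
        else:
            ranges.append([lumi, lumi])
    return ranges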
def uploadWorker(input, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the input
    Get confirmation in the output
    """
    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s" % dbsUrl)
    dbsApi = DbsApi(url=dbsUrl)

    while True:
        try:
            work = input.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break

        if work == 'STOP':
            # Then halt the process
            break

        name = work.get('name', None)
        block = work.get('block', None)

        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s" % block)
            dbsApi.insertBulkBlock(blockDump=block)
            results.put({'name': name, 'success': "uploaded"})
        except Exception as ex:
            exString = str(ex)
            if 'Block %s already exists' % name in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.error("Had duplicate entry for block %s. Ignoring for now." % name)
                logging.debug("Exception: %s" % exString)
                logging.debug("Traceback: %s" % str(traceback.format_exc()))
                results.put({'name': name, 'success': "uploaded"})
            elif 'Proxy Error' in exString:
                # This is probably a successful insertion that went bad.
                # Put it on the check list
                msg = "Got a proxy error for block (%s)." % name
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                results.put({'name': name, 'success': "check"})
            else:
                msg = "Error trying to process block %s through DBS.\n" % name
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                logging.debug("block: %s \n" % block)
                results.put({'name': name, 'success': "error", 'error': msg})
    return
def getDatasetStatus(dataset):
    """
    Gets the dataset status (access type): VALID, INVALID, PRODUCTION, DEPRECATED
    """
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    return reply[0]['dataset_access_type']
def duplicateLumi(dataset, verbose=False, skipInvalid=False):
    """
    Checks if the output dataset has duplicate lumis,
    i.e. the same lumi section appears in two different files.
    Returns True if at least one duplicate lumi was found.
    verbose: if True, print details
    skipInvalid: if True, skip invalid files; the default is False because that is faster
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    lumisChecked = {}
    # retrieve files
    reply = dbsapi.listFiles(dataset=dataset, detail=skipInvalid)
    for f in reply:
        logical_file_name = f['logical_file_name']
        # skip invalid files
        if skipInvalid and f['is_file_valid'] != 1:
            continue
        # retrieve lumis for each file
        reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name)
        lumis = reply2[0]['lumi_section_num']
        # check that each lumi is only in one file
        for lumi in lumis:
            if lumi in lumisChecked:
                # if verbose, print the details; if not, end quickly
                if verbose:
                    print('Lumi', lumi, 'is in these files')
                    print(logical_file_name)
                    print(lumisChecked[lumi])
                    duplicated = True
                else:
                    return True
            else:
                lumisChecked[lumi] = logical_file_name
    return duplicated
def crabConfig(dataSet, setName, outDir, systematics, channels, era):
    isSignal = "HPlus" in setName
    isData = "Single" in setName or "JetHT" in setName or "EGamma" in setName

    outFiles = []
    for systematic in systematics:
        if systematic == "":
            outFiles.append("{}.root".format(setName))
            continue
        if isData:
            break
        for shift in ["Up", "Down"]:
            outFiles.append("{}_{}{}.root".format(setName, systematic, shift))

    # Calculate the number of files per job
    url = "https://cmsweb.cern.ch/dbs/prod/{}/DBSReader".format("global")  # if not isSignal else "phys03"
    api = DbsApi(url=url)

    files = api.listFiles(dataset=dataSet, detail=1)
    eventsPerFile = sum(f["event_count"] for f in files) / len(files)
    filesPerJob = int(math.ceil(300000. / eventsPerFile))

    ## Crab config
    crabConf = config()

    crabConf.General.requestName = "Skim_{}".format(era)
    crabConf.General.workArea = outDir
    crabConf.General.transferOutputs = True
    crabConf.General.transferLogs = False

    crabConf.JobType.pluginName = "Analysis"
    crabConf.JobType.psetName = "{}/src/ChargedSkimming/Skimming/python/miniskimmer.py".format(os.environ["CMSSW_BASE"])
    crabConf.JobType.pyCfgParams = [
        "outname={}.root".format(setName),
        "channel={}".format(",".join(channels)),
        "era={}".format(era)
    ]
    crabConf.JobType.outputFiles = outFiles
    crabConf.JobType.maxJobRuntimeMin = 1440
    crabConf.JobType.maxMemoryMB = 2500
    crabConf.JobType.allowUndistributedCMSSW = True

    crabConf.Data.inputDataset = dataSet
    crabConf.Data.inputDBS = "global"  # if not isSignal else "phys03"
    crabConf.Data.splitting = "FileBased"
    crabConf.Data.unitsPerJob = filesPerJob
    crabConf.Data.outLFNDirBase = "/store/user/dbrunner/skim/{}/{}".format(
        "_".join([str(getattr(time.localtime(), "tm_" + t)) for t in ["mday", "mon", "year"]]),
        era)

    crabConf.Site.storageSite = "T2_DE_DESY"
    crabConf.User.voGroup = "dcms"

    return crabConf
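A possible submission sketch for the configuration returned above, assuming a CRAB3 environment is set up; the dataset, setName, channels and era values are placeholders only.

from CRABAPI.RawCommand import crabCommand

if __name__ == '__main__':
    conf = crabConfig(
        dataSet="/TTJets_TuneCP5_13TeV-madgraphMLM-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v1/MINIAODSIM",  # example only
        setName="TTJets",
        outDir="Skim",
        systematics=[""],
        channels=["MuonIncl"],  # placeholder channel name
        era="2017",
    )
    # submit the task through the CRAB client API
    crabCommand("submit", config=conf)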
def das_files(dataset):
    dataset_split = dataset.split('/')
    dataset_split[2] = 'RunIIFall17NanoAODv4*'
    datasetv4 = '/'.join(dataset_split)
    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    return dbs.listDatasets(dataset=datasetv4)
def duplicateLumi(dataset, verbose=False):
    """
    Checks if the output dataset has duplicate lumis.
    Returns True if at least one duplicate lumi was found.
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    lumisChecked = {}
    # retrieve files
    reply = dbsapi.listFiles(dataset=dataset)
    for f in reply:
        logical_file_name = f['logical_file_name']
        # retrieve lumis for each file
        reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name)
        lumis = reply2[0]['lumi_section_num']
        # check that each lumi is only in one file
        for lumi in lumis:
            if lumi in lumisChecked:
                # if verbose, print the details; if not, end quickly
                if verbose:
                    print('Lumi', lumi, 'is in these files')
                    print(logical_file_name)
                    print(lumisChecked[lumi])
                    duplicated = True
                else:
                    return True
            else:
                lumisChecked[lumi] = logical_file_name
    return duplicated
def uploadWorker(workInput, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the workInput
    Get confirmation in the output
    """
    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s", dbsUrl)
    dbsApi = DbsApi(url=dbsUrl)

    while True:
        try:
            work = workInput.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break

        if work == 'STOP':
            # Then halt the process
            break

        name = work.get('name', None)
        block = work.get('block', None)

        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s", block)
            dbsApi.insertBulkBlock(blockDump=block)
            results.put({'name': name, 'success': "uploaded"})
        except Exception as ex:
            exString = str(ex)
            if 'Block %s already exists' % name in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.error("Had duplicate entry for block %s. Ignoring for now.", name)
                logging.debug("Exception: %s", exString)
                logging.debug("Traceback: %s", str(traceback.format_exc()))
                results.put({'name': name, 'success': "uploaded"})
            elif 'Proxy Error' in exString:
                # This is probably a successful insertion that went bad.
                # Put it on the check list
                msg = "Got a proxy error for block (%s)." % name
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                results.put({'name': name, 'success': "check"})
            else:
                msg = "Error trying to process block %s through DBS.\n" % name
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                logging.debug("block: %s \n", block)
                results.put({'name': name, 'success': "error", 'error': msg})
    return
def getDatasetStatus(dataset):
    "Return dataset status"
    dbsapi = DbsApi(url=DBS3, verifypeer=False)
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    return reply[0]['dataset_access_type']
def getBlockSitesFromLocalDBS3(self, dbs_url):
    ## find the location for each block in the list
    from dbs.apis.dbsClient import DbsApi
    api = DbsApi(dbs_url)
    from NodeNameUtils import getMapOfSEHostName2PhedexNodeNameFromPhEDEx
    se2pnn = getMapOfSEHostName2PhedexNodeNameFromPhEDEx()

    blockSites = {}
    for block in self.Listfileblocks:
        blockInfo = api.listBlocks(block_name=block, detail=True)
        location = blockInfo[0]['origin_site_name']
        if location == 'UNKNOWN':
            blockSites[block] = []
        else:
            #if locationIsValidPNN:
            if location.startswith('T2_') or location.startswith('T3_'):
                blockSites[block] = [location]
            else:
                if location in se2pnn.keys():
                    blockSites[block] = [se2pnn[location]]
                else:
                    msg = "ERROR: unknown location for block: %s. Skip this block" % location
                    common.logger.info(msg)
                    blockSites[block] = []
    return blockSites
class MigrationToGlobal:

    def __init__(self):
        # Initialize the DBS API
        dbsUrl = 'https://cmsweb.cern.ch/dbs/prod/global/DBSMigrate/'
        #dbsUrl = 'https://cmsweb-testbed.cern.ch/dbs/int/global/DBSMigrate/'
        self.dbsApi = DbsApi(url=dbsUrl)
        # Timing variable
        self.isOver = False
        # Timeout of the script
        self.time_out = 600  # 10 min
        # Migration requests
        self.migrationRequests = {}

    def updateRequest(self, over):
        """
        Update the status of the migration requests.
        First query DBS3 for the migration status. If this is the last loop
        (over = True), remove submitted requests and handle the incomplete ones.
        """
        for task in self.migrationRequests.keys():
            # Loop over all the submitted migration requests
            if self.migrationRequests[task]['status'] == 'submitted':
                request_status = self.dbsApi.statusMigration(migration_rqst_id=self.migrationRequests[task]['id'])
                status = request_status[0]['migration_status']
                if status == 2:
                    # Migration completed
                    self.migrationRequests[task]['status'] = 'successful'
                    print('Migration to global succeeded: ' + self.migrationRequests[task]['dataset'])
                elif status == 9:
                    # Migration failed
                    self.migrationRequests[task]['status'] = 'migration failed'
                    print('Migration to global failed: ' + self.migrationRequests[task]['dataset'])
                elif status == 3 and over == True:
                    # Migration failed, no more retries due to script timeout
                    self.removeRequest(self.migrationRequests[task])
                    self.migrationRequests[task]['status'] = 'migration failed'
                    print('Migration to global failed: ' + self.migrationRequests[task]['dataset'])
                elif status == 0 and over == True:
                    # Migration is still pending, remove it
                    self.removeRequest(self.migrationRequests[task])
                    self.migrationRequests[task]['status'] = 'migration not processed'
                    print('Migration to global not processed: ' + self.migrationRequests[task]['dataset'])
                elif status == 1 and over == True:
                    # Migration in progress...
                    self.migrationRequests[task]['status'] = 'still processing'
                    print('DBS3 is still processing migration %s' % self.migrationRequests[task]['id'])

    def removeRequest(self, migration):
        """
        Remove a migration request.
        This only works if the status reported by DBS is 0 (pending) or 3 (failed).
        """
        try:
            toDelete = {'migration_rqst_id': migration['id']}
            self.dbsApi.removeMigration(toDelete)
        except Exception as ex:
            print('There was something wrong when removing the migration request for %s' % migration['dataset'])
            print('Exception: ' + str(ex) + '\n')
            print('Traceback: ' + str(traceback.format_exc()))
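The class above tracks requests but does not show how they are submitted. A hedged sketch of a submission helper that records a request in the same bookkeeping dict ('id', 'dataset', 'status') follows; the source DBS URL and the exact layout of the submitMigration reply are assumptions, not taken from the class itself.

from dbs.apis.dbsClient import DbsApi

def submitMigrationRequest(dbsApi, migrationRequests, task, dataset,
                           sourceUrl='https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader'):
    # migration_url is the DBS instance to read from, migration_input the dataset (or block)
    request = {'migration_url': sourceUrl, 'migration_input': dataset}
    reply = dbsApi.submitMigration(request)
    # assumed reply layout: {'migration_details': {'migration_request_id': ...}, ...}
    migrationRequests[task] = {
        'id': reply['migration_details']['migration_request_id'],
        'dataset': dataset,
        'status': 'submitted',
    }
    return migrationRequests[task]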
def uploadWorker(workInput, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the workInput
    Get confirmation in the output
    """
    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s", dbsUrl)
    dbsApi = DbsApi(url=dbsUrl)

    while True:
        try:
            work = workInput.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break

        if work == 'STOP':
            # Then halt the process
            break

        name = work.get('name', None)    # this is the block name
        block = work.get('block', None)  # this is the block data structure

        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s", block)
            dbsApi.insertBulkBlock(blockDump=block)
            results.put({'name': name, 'success': "uploaded"})
        except Exception as ex:
            exString = str(ex)
            if 'Block %s already exists' % name in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.warning("Block %s already exists. Marking it as uploaded.", name)
                logging.debug("Exception: %s", exString)
                results.put({'name': name, 'success': "uploaded"})
            elif 'Proxy Error' in exString:
                # This is probably a successful insertion that went bad.
                # Put it on the check list
                msg = "Got a proxy error for block %s." % name
                logging.warning(msg)
                results.put({'name': name, 'success': "check"})
            elif 'Missing data when inserting to dataset_parents' in exString:
                msg = "Parent dataset is not inserted yet for block %s." % name
                logging.warning(msg)
                results.put({'name': name, 'success': "error", 'error': msg})
            else:
                msg = "Error trying to process block %s through DBS. Error: %s" % (name, exString)
                logging.exception(msg)
                logging.debug("block info: %s \n", block)
                results.put({'name': name, 'success': "error", 'error': msg})
    return
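A minimal sketch of how such a worker could be driven from a parent process via multiprocessing queues, following the protocol used above (dicts with 'name' and 'block', a 'STOP' sentinel, one result per block). This is an assumed standalone harness, not the real component's pool handling.

import multiprocessing

def runUploadPool(blocks, dbsUrl, nProc=4):
    """blocks: dict mapping block name -> block dump structure."""
    workInput = multiprocessing.Queue()
    results = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=uploadWorker,
                                       args=(workInput, results, dbsUrl))
               for _ in range(nProc)]
    for proc in workers:
        proc.start()
    # feed the work, then one STOP sentinel per worker
    for name, block in blocks.items():
        workInput.put({'name': name, 'block': block})
    for _ in workers:
        workInput.put('STOP')
    # every block produces exactly one result message
    outcome = [results.get() for _ in blocks]
    for proc in workers:
        proc.join()
    return outcome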
def get_filenames(bkgTXT, dataTXT, sigTXT):
    ##dasgoclient python API
    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    global_director = "root://cmsxrootd.fnal.gov/"

    ##Read out input files containing data set names
    if bkgTXT:
        with open(bkgTXT) as f:
            background = [background for background in f.read().splitlines() if background != ""]
    else:
        background = []

    if dataTXT:
        with open(dataTXT) as f:
            data = [data for data in f.read().splitlines() if data != ""]
    else:
        data = []

    if sigTXT:
        with open(sigTXT) as f:
            signal = [signal for signal in f.read().splitlines() if signal != ""]
    else:
        signal = []

    ##Fill file names in using dasgoclient API
    filelist = {}
    for setname in background + data:
        if "mc" in setname:
            key = setname.split("/")[1]
        else:
            key = setname.split("/")[1] + "-" + setname.split("/")[2]

        filelist[key] = [
            global_director + filename['logical_file_name']
            for filename in dbs.listFiles(dataset=setname, detail=1)
        ]

    ##Read out signal files with gfal-ls command
    for SEpath in signal:
        key = SEpath.split("/")[-2]
        signalFiles = subprocess.check_output(["gfal-ls", SEpath]).split("\n")[:-1]
        filelist[key] = [
            global_director + SEpath[74:] + "/" + signalFile
            for signalFile in signalFiles
        ]

    return filelist
def getDatasetEventsPerLumi(dataset):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_files = dbsapi.listFileSummaries(dataset=dataset, validFileOnly=1)
    try:
        average = sum([f['num_event'] / float(f['num_lumi']) for f in all_files]) / float(len(all_files))
    except Exception:
        average = 100
    return average
def getDataTiers(dbsUrl):
    """
    Function to retrieve all the datatiers from DBS.

    NOTE: to be used with some caching (MemoryCacheStruct)

    :param dbsUrl: the DBS URL string
    :return: a list of strings/datatiers
    """
    dbs = DbsApi(dbsUrl)
    return [tier['data_tier_name'] for tier in dbs.listDataTiers()]
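A minimal usage sketch for getDataTiers, without the caching mentioned in the docstring; the global DBS reader URL is just the usual example instance.

if __name__ == '__main__':
    tiers = getDataTiers('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    print('%d data tiers known to DBS, e.g. %s' % (len(tiers), sorted(tiers)[:5]))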
def getSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the block list with details and add up the block sizes
    reply = dbsapi.listBlocks(dataset=dataset, detail=True)
    total = 0
    for block in reply:
        total = total + block['block_size']
    return total
def getDatasetStatus(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    # retrieve dataset summary
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    if len(reply):
        return reply[0]['dataset_access_type']
    else:
        return None
def getFileCount(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the block summaries and add up the file counts
    reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    cnt = 0
    for block in reply:
        cnt = cnt + int(block['num_file'])
    return cnt
def getLumiCountDataSet(dataset):
    """
    Get the number of unique lumis in a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listFileSummaries(dataset=dataset)
    return reply[0]['num_lumi']
def _setDatatiersCache(ts, dbsUrl):
    """
    Set a timestamp and update the list of datatiers cached in the class property
    """
    dbs = DbsApi(dbsUrl)
    DBS3Reader._datatiers['ts'] = ts
    DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]
    return
def getNumberofFilesPerRun(das_url, dataset, run):
    """
    Count the number of files in a dataset for a given run
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the file list restricted to the given run
    reply = dbsapi.listFiles(dataset=dataset, run_num=run)
    return len(reply)
def getFileCountDataset(dataset):
    """
    Returns the number of files registered in DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve file list
    reply = dbsapi.listFiles(dataset=dataset)
    return len(reply)
def getEventCountBlock(block):
    """
    Returns the number of events in a block using DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve block summary
    reply = dbsapi.listBlockSummaries(block_name=block)
    return reply[0]["num_event"]
def __init__(self, config):
    self.br = Browser()
    self.config = config

    # Initialise connections
    self.mySiteDB = SiteDBJSON()
    self.dbsPhys01 = DbsApi(url=dbs_base_url + "phys01/DBSReader/")
    self.dbsPhys02 = DbsApi(url=dbs_base_url + "phys02/DBSReader/")
    self.dbsPhys03 = DbsApi(url=dbs_base_url + "phys03/DBSReader/")
def getEventCountDataSetBlockList(dataset, blockList):
    """
    Counts and adds all the events for a given list of blocks inside a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # the block list can be passed directly to the block summary call
    reply = dbsapi.listBlockSummaries(block_name=blockList)
    return reply[0]['num_event']
def getEventCountDataSet(dataset):
    """
    Returns the number of events in a dataset using DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['num_event']
def __init__(self, config):
    self.br = Browser()
    self.config = config

    # Initialise connections
    self.phedex = PhEDEx({"endpoint": "https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
    self.dbsPhys01 = DbsApi(url=dbs_base_url + "phys01/DBSReader/")
    self.dbsPhys02 = DbsApi(url=dbs_base_url + "phys02/DBSReader/")
    self.dbsPhys03 = DbsApi(url=dbs_base_url + "phys03/DBSReader/")
def duplicateRunLumi(dataset, verbose=False, skipInvalid=False):
    """
    Checks if the output dataset has duplicate lumis on a per-run basis,
    i.e. the same lumi in the same run appears in two different files.
    Returns True if at least one duplicate lumi was found.
    This can be used on datasets that span separate runs.
    verbose: if True, print details
    skipInvalid: if True, skip invalid files; the default is False because that is faster
    """
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    # check each run
    runs = getRunsDataset(dataset)
    # if there is only one run in the list
    if len(runs) == 1:
        if verbose:
            print("only one run:", runs)
        return duplicateLumi(dataset, verbose, skipInvalid)
    # else check run by run
    for run in runs:
        # create a set of checked lumis for this run
        lumisChecked = {}
        # retrieve files for that run
        reply = dbsapi.listFiles(dataset=dataset, detail=skipInvalid)
        for f in reply:
            # skip invalid files
            if skipInvalid and f['is_file_valid'] != 1:
                continue
            logical_file_name = f['logical_file_name']
            # retrieve lumis for each file
            reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name, run_num=run)
            if reply2:
                lumis = reply2[0]['lumi_section_num']
            else:
                continue
            # check that each lumi is only in one file
            for lumi in lumis:
                if lumi in lumisChecked:
                    # if verbose, print the details; if not, end quickly
                    if verbose:
                        print('Lumi', lumi, 'in run', run, 'is in these files')
                        print(logical_file_name)
                        print(lumisChecked[lumi])
                        duplicated = True
                    else:
                        return True
                else:
                    lumisChecked[lumi] = logical_file_name
    return duplicated
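getRunsDataset is assumed but not shown in this collection. A minimal sketch using the DBS listRuns call, with DbsApi and dbs3_url as in the helpers above; the reply is flattened defensively since 'run_num' may come back as a single number or a list depending on the query.

from dbs.apis.dbsClient import DbsApi

def getRunsDataset(dataset):
    dbsapi = DbsApi(url=dbs3_url)
    runs = []
    for entry in dbsapi.listRuns(dataset=dataset):
        runNum = entry['run_num']
        runs.extend(runNum if isinstance(runNum, list) else [runNum])
    return runs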
def __init__(self, url, logger=None, **contact):
    # instantiate dbs api object
    try:
        self.dbsURL = url.replace("cmsweb.cern.ch", "cmsweb-prod.cern.ch")
        self.dbs = DbsApi(self.dbsURL, **contact)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
    except dbsClientException as ex:
        msg = "Error in DBSReader with DbsApi\n"
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
def getLumiCountDataSet(dataset):
    """
    Get the number of unique lumis in a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listFileSummaries(dataset=dataset)
    if not reply:
        return 0
    return reply[0]['num_lumi']
def __init__(self, url, **contact):
    # instantiate dbs api object
    try:
        self.dbs = DbsApi(url, **contact)
    except dbsClientException as ex:
        msg = "Error in DBSReader with DbsApi\n"
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)

    # connection to PhEDEx (use the default endpoint url)
    self.phedex = PhEDEx(responseType="json")
def __init__(self, config):
    """
    Initialise class members
    """
    logging.info("Running __init__ for DBS3 Uploader")
    BaseWorkerThread.__init__(self)
    self.config = config

    # This is slightly dangerous, but DBSUpload depends
    # on DBSInterface anyway
    self.dbsUrl = self.config.DBS3Upload.dbsUrl

    self.dbsUtil = DBSBufferUtil()

    myThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.pool = []
    self.blocksToCheck = []
    self.workInput = None
    self.workResult = None
    self.nProc = getattr(self.config.DBS3Upload, 'nProcesses', 4)
    self.wait = getattr(self.config.DBS3Upload, 'dbsWaitTime', 2)
    self.nTries = getattr(self.config.DBS3Upload, 'dbsNTries', 300)
    self.physicsGroup = getattr(self.config.DBS3Upload, "physicsGroup", "NoGroup")
    self.datasetType = getattr(self.config.DBS3Upload, "datasetType", "PRODUCTION")
    self.primaryDatasetType = getattr(self.config.DBS3Upload, "primaryDatasetType", "mc")
    self.blockCount = 0
    self.dbsApi = DbsApi(url=self.dbsUrl)

    # List of blocks currently in processing
    self.queuedBlocks = []

    # Set up the pool of worker processes
    self.setupPool()

    # Setting up any cache objects
    self.blockCache = {}

    self.filesToUpdate = []

    self.produceCopy = getattr(self.config.DBS3Upload, 'copyBlock', False)
    self.copyPath = getattr(self.config.DBS3Upload, 'copyBlockPath', '/data/mnorman/block.json')

    self.timeoutWaiver = 1

    return
def getDatasetEventsAndLumis(dataset, blocks=None):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_files = []
    if blocks:
        for b in blocks:
            all_files.extend(dbsapi.listFileSummaries(block_name=b, validFileOnly=1))
    else:
        all_files = dbsapi.listFileSummaries(dataset=dataset, validFileOnly=1)
    all_events = sum([f['num_event'] for f in all_files])
    all_lumis = sum([f['num_lumi'] for f in all_files])
    return all_events, all_lumis
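An example use of getDatasetEventsAndLumis that prints the average number of events per lumi section; the dataset name is only an example and requires a valid grid proxy to query.

if __name__ == '__main__':
    dataset = '/TTJets_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v1/MINIAODSIM'  # example only
    events, lumis = getDatasetEventsAndLumis(dataset)
    print('%s: %d events, %d lumis, %.1f events/lumi'
          % (dataset, events, lumis, float(events) / max(lumis, 1)))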
def uploadWorker(input, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the input
    Get confirmation in the output
    """
    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s" % dbsUrl)
    dbsApi = DbsApi(url=dbsUrl)

    while True:
        try:
            work = input.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break

        if work == 'STOP':
            # Then halt the process
            break

        name = work.get('name', None)
        block = work.get('block', None)

        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s" % block)
            dbsApi.insertBulkBlock(blockDump=block)
            results.put({'name': name, 'success': True})
        except Exception as ex:
            exString = str(ex)
            if 'Duplicate entry' in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.error("Had duplicate entry for block %s\n" % name)
                logging.error("Ignoring for now.\n")
                logging.error("Exception: %s\n" % exString)
                logging.error("Traceback: %s\n" % str(traceback.format_exc()))
                results.put({'name': name, 'success': True})
            else:
                msg = "Error trying to process block %s through DBS.\n" % name
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                results.put({'name': name, 'success': False, 'error': msg})
def getBlockSitesFromLocalDBS3(self, dbs_url):
    ## find the location for each block in the list
    from dbs.apis.dbsClient import DbsApi
    api = DbsApi(dbs_url)

    blockSites = {}
    for block in self.Listfileblocks:
        blockInfo = api.listBlocks(block_name=block, detail=True)
        location = blockInfo[0]['origin_site_name']
        blockSites[block] = [location]
    return blockSites
def lookup_summary(ds):
    """lookup basic information of a dataset from DBS3 API

    :param ds: dataset name
    :type ds: str
    :return: infodict
    :rtype: dict
    """
    dbs3api = DbsApi("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
    res = dbs3api.listFileSummaries(dataset=ds)
    # [{'num_file': 1599, 'file_size': 10341982224399, 'num_event': 28159421,
    #   'num_lumi': 198621, 'num_block': 234}]
    return res[0]
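An example lookup using lookup_summary, printing the fields documented in the inline comment above; the dataset name is only a placeholder.

if __name__ == '__main__':
    ds = '/SingleMuon/Run2017F-09Aug2019_UL2017-v1/MINIAOD'  # example only
    info = lookup_summary(ds)
    print('%s: %d files in %d blocks, %d events, %.2f TB'
          % (ds, info['num_file'], info['num_block'], info['num_event'], info['file_size'] / 1e12))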
def getLFNbase(url, dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the first file of the dataset
    reply = dbsapi.listFiles(dataset=dataset)
    filename = reply[0]['logical_file_name']
    # determine the lfn base from the file path
    lfn = '/store/mc'
    if '/store/himc' in filename:
        lfn = '/store/himc'
    if '/store/data' in filename:
        lfn = '/store/data'
    return lfn
def hasAllBlocksClosed(dataset):
    """
    checks if a given dataset has all blocks closed
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the block list with details
    reply = dbsapi.listBlocks(dataset=dataset, detail=True)
    for block in reply:
        #print block['block_name']
        #print block['open_for_writing']
        if block['open_for_writing']:
            return False
    return True
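An example check using hasAllBlocksClosed; dbs3_url is assumed to be defined as in the helpers above and the dataset name is only a placeholder.

if __name__ == '__main__':
    dataset = '/MyPrimary/MyProcessed-v1/AODSIM'  # placeholder dataset name
    if hasAllBlocksClosed(dataset):
        print('all blocks are closed for %s' % dataset)
    else:
        print('%s still has open blocks' % dataset)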
def getLumiCountDataSet(dataset, skipInvalid=False):
    """
    Get the number of unique lumis in a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    if not skipInvalid:
        reply = dbsapi.listFileSummaries(dataset=dataset)
    else:
        reply = dbsapi.listFileSummaries(dataset=dataset, validFileOnly=1)
    if not reply or not reply[0]:
        return 0
    return reply[0]['num_lumi']
def getFileCountDataset(dataset, skipInvalid=False, onlyInvalid=False):
    """
    Returns the number of files registered in DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the file list; details are only needed when filtering on validity
    reply = dbsapi.listFiles(dataset=dataset, detail=(skipInvalid or onlyInvalid))
    if skipInvalid:
        reply = [f for f in reply if f['is_file_valid'] == 1]
    elif onlyInvalid:
        reply = [f for f in reply if f['is_file_valid'] == 0]
    return len(reply)
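An example report combining the two call modes of getFileCountDataset above to show the fraction of invalidated files; the dataset name is only a placeholder and dbs3_url is assumed as before.

if __name__ == '__main__':
    dataset = '/MyPrimary/MyProcessed-v1/MINIAODSIM'  # placeholder dataset name
    total = getFileCountDataset(dataset)
    invalid = getFileCountDataset(dataset, onlyInvalid=True)
    print('%s: %d files, %d invalid (%.1f%%)'
          % (dataset, total, invalid, 100.0 * invalid / max(total, 1)))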