def __init__(self, config):
    self.br = Browser()
    self.config = config
    # Initialise connections
    self.phedex = PhEDEx({"endpoint": "https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
    self.dbsPhys01 = DbsApi(url=dbs_base_url + "phys01/DBSReader/")
    self.dbsPhys02 = DbsApi(url=dbs_base_url + "phys02/DBSReader/")
    self.dbsPhys03 = DbsApi(url=dbs_base_url + "phys03/DBSReader/")
def __init__(self, config):
    self.br = Browser()
    self.config = config
    # Initialise connections
    self.mySiteDB = SiteDBJSON()
    self.dbsPhys01 = DbsApi(url=dbs_base_url + "phys01/DBSReader/")
    self.dbsPhys02 = DbsApi(url=dbs_base_url + "phys02/DBSReader/")
    self.dbsPhys03 = DbsApi(url=dbs_base_url + "phys03/DBSReader/")
def getSiblings(fileName, dataset):
    try:
        from dbs.apis.dbsClient import DbsApi
        from CRABClient.ClientUtilities import DBSURLS
    except ImportError:
        print("getSiblings() relies on CRAB. Please set up the environment for CRAB before using.")
        sys.exit(1)
    dbsurl_global = DBSURLS["reader"].get("global", "global")
    dbsurl_phys03 = DBSURLS["reader"].get("phys03", "phys03")
    dbs3api_phys03 = DbsApi(url=dbsurl_phys03)
    dbs3api_global = DbsApi(url=dbsurl_global)
    # if there is an xrootd prefix, strip it
    if "/store/" in fileName:
        i = fileName.find("/store/")
        fileName = fileName[i:]
    # first get the parents
    parents = dbs3api_phys03.listFileParents(logical_file_name=fileName)
    # for each of the parents, get the grandparents
    grandparents = []
    for parent in parents:
        for parent_file_name in parent["parent_logical_file_name"]:
            grandparents.extend(
                dbs3api_global.listFileParents(logical_file_name=parent_file_name))
    # then for each of the grandparents, get their children
    children = []
    for grandparent in grandparents:
        for grandparent_file_name in grandparent["parent_logical_file_name"]:
            children.extend(
                dbs3api_global.listFileChildren(logical_file_name=grandparent_file_name))
    # put the children in a set
    miniaod = set()
    for child in children:
        for child_file_name in child["child_logical_file_name"]:
            miniaod.add(child_file_name)
    # put the files of the target dataset in another set
    dataset_files = dbs3api_global.listFiles(dataset=dataset)
    miniaodSuperset = set()
    for f in dataset_files:
        miniaodSuperset.add(f["logical_file_name"])
    # return the intersection of the two sets
    return list(miniaodSuperset.intersection(miniaod))
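# Usage sketch for getSiblings(): walks the DBS parentage chain from a user
# file up to its grandparents and back down to their children, then intersects
# with the target dataset. The LFN and dataset names below are hypothetical
# placeholders, and a CRAB environment is assumed.
example_lfn = "/store/user/someuser/output/USER/file_1.root"
example_dataset = "/SomePrimary/SomeCampaign-v1/MINIAODSIM"
sibling_files = getSiblings(example_lfn, example_dataset)
print(len(sibling_files), "matching files in", example_dataset)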
def getFilenames(txtFile):
    # DBS python client (queried directly rather than via dasgoclient)
    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    global_director = "root://cmsxrootd.fnal.gov/"
    # Read the input file containing one dataset name per line
    with open(txtFile) as f:
        datasets = [dataset for dataset in f.read().splitlines() if dataset != ""]
    # Fill in the file names for each dataset
    filelist = {}
    for setname in datasets:
        if "mc" in setname:
            filelist[setname.split("/")[1]] = [
                global_director + filename['logical_file_name']
                for filename in dbs.listFiles(dataset=setname, detail=1)
            ]
        elif "user" in setname:
            filelist[setname.split("/")[5]] = [global_director + setname]
        else:
            filelist[setname.split("/")[1] + "-" + setname.split("/")[2]] = [
                global_director + filename['logical_file_name']
                for filename in dbs.listFiles(dataset=setname, detail=1)
            ]
    # print(filelist)
    return filelist
def dbs3_get_data(dataset, timestamps=1):
    #q = "/afs/cern.ch/user/s/spinoso/public/dbs3wrapper.sh /afs/cern.ch/user/c/cmst2/mc/scripts/datasetinfo.py --dataset %s --json" % dataset
    #output=os.popen(q).read()
    #s = json.loads(output)
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    try:
        reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
        #print(reply)
        if len(reply):
            status = reply[0]['dataset_access_type']
            reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
            cnt = 0
            for block in reply:
                cnt += int(block['num_event'])
            return [cnt, status, int(cnt / 100.)]
        else:
            print(dataset, "does not exist")
            return [0, '', 0]
    except Exception:
        print("DBS3 query crashed")
        return [0, '', 0]
def das_files(dataset):
    dataset_split = dataset.split('/')
    dataset_split[2] = 'RunIIFall17NanoAODv4*'
    datasetv4 = '/'.join(dataset_split)
    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    return dbs.listDatasets(dataset=datasetv4)
def uploadWorker(input, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the input
    Get confirmation in the output
    """
    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s" % dbsUrl)
    dbsApi = DbsApi(url=dbsUrl)
    while True:
        try:
            work = input.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break
        if work == 'STOP':
            # Then halt the process
            break
        name = work.get('name', None)
        block = work.get('block', None)
        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s" % block)
            dbsApi.insertBulkBlock(blockDump=block)
            results.put({'name': name, 'success': "uploaded"})
        except Exception as ex:
            exString = str(ex)
            if 'Block %s already exists' % name in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.error("Had duplicate entry for block %s. Ignoring for now." % name)
                logging.debug("Exception: %s" % exString)
                logging.debug("Traceback: %s" % str(traceback.format_exc()))
                results.put({'name': name, 'success': "uploaded"})
            elif 'Proxy Error' in exString:
                # This is probably a successful insertion that went bad.
                # Put it on the check list
                msg = "Got a proxy error for block (%s)." % name
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                results.put({'name': name, 'success': "check"})
            else:
                msg = "Error trying to process block %s through DBS.\n" % name
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                logging.debug("block: %s \n" % block)
                results.put({'name': name, 'success': "error", 'error': msg})
    return
def duplicateLumi(dataset, verbose=False, skipInvalid=False):
    """
    checks if output dataset has duplicate lumis
    returns true if at least one duplicate lumi was found
    Verbose: if true prints details
    skipInvalid: if true skips invalid files; default is False because it is faster
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    lumisChecked = {}
    # retrieve files
    reply = dbsapi.listFiles(dataset=dataset, detail=skipInvalid)
    for f in reply:
        logical_file_name = f['logical_file_name']
        # skip invalid files
        if skipInvalid and f['is_file_valid'] != 1:
            continue
        # retrieve lumis for each file
        reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name)
        lumis = reply2[0]['lumi_section_num']
        # check that each lumi is only in one file
        for lumi in lumis:
            if lumi in lumisChecked:
                # if verbose print results, if not end quickly
                if verbose:
                    print('Lumi', lumi, 'is in these files')
                    print(logical_file_name)
                    print(lumisChecked[lumi])
                    duplicated = True
                else:
                    return True
            else:
                lumisChecked[lumi] = logical_file_name
    return duplicated
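# Usage sketch for duplicateLumi(), assuming dbs3_url is defined at module
# level as in the snippets above; the dataset name is a hypothetical placeholder.
if duplicateLumi("/SomePrimary/SomeProcessed-v1/AODSIM", verbose=True, skipInvalid=True):
    print("Dataset has at least one duplicated lumi section")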
def getDatasetStatus(dataset):
    """
    Gets the dataset status (access type): VALID, INVALID, PRODUCTION, DEPRECATED
    """
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    return reply[0]['dataset_access_type']
def getDatasetSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the block summary aggregated over the whole dataset
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['file_size']
def get_dbs_api(instance='global'):
    """Return an API client for a CMS Dataset Bookkeeping (DBS) server instance.

    For a given DBS instance, a new DbsApi object is returned on the first
    call. Subsequent calls for the same DBS instance return the same DbsApi
    object to avoid reinitialization.

    Parameters
    ----------
    instance : string
        One of the following DBS server instances:
        * global (default)
        * phys01
        * phys02
        * phys03
        * caf

    Returns
    -------
    DbsApi
        A DbsApi object configured for the requested DBS server instance.
    """
    DBS_INSTANCES = {'global', 'phys01', 'phys02', 'phys03', 'caf'}
    if instance not in DBS_INSTANCES:
        raise ValueError('Unrecognized DBS instance: {0}'.format(instance))
    dbs_api = globals().get(instance.upper(), None)
    if dbs_api is None:
        url = 'https://cmsweb.cern.ch/dbs/prod/{0}/DBSReader'.format(instance)
        dbs_api = DbsApi(url)
        globals()[instance.upper()] = dbs_api
    return dbs_api
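# Usage sketch for get_dbs_api(): repeated calls for the same instance return
# the identical cached DbsApi object, so clients can call it freely.
api_a = get_dbs_api()          # 'global' instance, created on first call
api_b = get_dbs_api('global')  # same object, served from the module cache
assert api_a is api_b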
def getDatasetStatus(dataset):
    "Return dataset status"
    dbsapi = DbsApi(url=DBS3, verifypeer=False)
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    return reply[0]['dataset_access_type']
def main():
    # args = sys.argv[1:]
    # data = args[0]
    sample_group = 'signal'  # signal, background, data, all
    sample_list = get_sample_list(sample_group)
    sample_list.sort()
    url = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    api = DbsApi(url=url)
    for samp in sample_list:
        outputDataSets = ''
        #print('Checking {0}'.format(samp.DAS))
        outputDataSets = api.listDatasets(dataset=samp.DAS, detail=True, dataset_access_type='VALID')
        if outputDataSets:
            for ds in outputDataSets:
                #print('{0}'.format(ds['dataset']))
                #print('{0}'.format(ds['primary_ds_name']))
                #print('{0}'.format(ds['xtcrosssection']))
                nevents = api.listBlockSummaries(dataset=ds['dataset'])
                #print(nevents[0]['num_event'])
                # this creates a table row for the paper with dataset name and number of events
                print('verb@ {0} @ & {1:.2e} & XX \\\\ '.format(ds['primary_ds_name'], nevents[0]['num_event']))
    sys.exit(0)
def getBlockSitesFromLocalDBS3(self, dbs_url):
    ## find the location for each block in the list
    from dbs.apis.dbsClient import DbsApi
    api = DbsApi(dbs_url)
    from NodeNameUtils import getMapOfSEHostName2PhedexNodeNameFromPhEDEx
    se2pnn = getMapOfSEHostName2PhedexNodeNameFromPhEDEx()
    blockSites = {}
    for block in self.Listfileblocks:
        blockInfo = api.listBlocks(block_name=block, detail=True)
        location = blockInfo[0]['origin_site_name']
        if location == 'UNKNOWN':
            blockSites[block] = []
        else:
            #if locationIsValidPNN:
            if location.startswith('T2_') or location.startswith('T3_'):
                blockSites[block] = [location]
            else:
                if location in se2pnn:
                    blockSites[block] = [se2pnn[location]]
                else:
                    msg = "ERROR: unknown location %s for block %s. Skip this block" % (location, block)
                    common.logger.info(msg)
                    blockSites[block] = []
    return blockSites
def crabConfig(dataSet, setName, outDir, systematics, channels, era):
    isSignal = "HPlus" in setName
    isData = "Single" in setName or "JetHT" in setName or "EGamma" in setName
    outFiles = []
    for systematic in systematics:
        if systematic == "":
            outFiles.append("{}.root".format(setName))
            continue
        if isData:
            break
        for shift in ["Up", "Down"]:
            outFiles.append("{}_{}{}.root".format(setName, systematic, shift))
    # Calculate number of files per job
    url = "https://cmsweb.cern.ch/dbs/prod/{}/DBSReader".format("global")  # if not isSignal else "phys03"
    api = DbsApi(url=url)
    files = api.listFiles(dataset=dataSet, detail=1)
    eventsPerFile = sum(f["event_count"] for f in files) / len(files)
    filesPerJob = int(math.ceil(300000. / eventsPerFile))
    ## Crab config
    crabConf = config()
    crabConf.General.requestName = "Skim_{}".format(era)
    crabConf.General.workArea = outDir
    crabConf.General.transferOutputs = True
    crabConf.General.transferLogs = False
    crabConf.JobType.pluginName = "Analysis"
    crabConf.JobType.psetName = "{}/src/ChargedSkimming/Skimming/python/miniskimmer.py".format(os.environ["CMSSW_BASE"])
    crabConf.JobType.pyCfgParams = [
        "outname={}.root".format(setName),
        "channel={}".format(",".join(channels)),
        "era={}".format(era)
    ]
    crabConf.JobType.outputFiles = outFiles
    crabConf.JobType.maxJobRuntimeMin = 1440
    crabConf.JobType.maxMemoryMB = 2500
    crabConf.JobType.allowUndistributedCMSSW = True
    crabConf.Data.inputDataset = dataSet
    crabConf.Data.inputDBS = "global"  # if not isSignal else "phys03"
    crabConf.Data.splitting = "FileBased"
    crabConf.Data.unitsPerJob = filesPerJob
    crabConf.Data.outLFNDirBase = "/store/user/dbrunner/skim/{}/{}".format(
        "_".join([str(getattr(time.localtime(), "tm_" + t)) for t in ["mday", "mon", "year"]]), era)
    crabConf.Site.storageSite = "T2_DE_DESY"
    crabConf.User.voGroup = "dcms"
    return crabConf
def getLumiListInValidFiles(dataset, dbsurl='phys03'):
    """
    Get the runs/lumis in the valid files of a given dataset.

    dataset: the dataset name as published in DBS
    dbsurl: the DBS URL or DBS prod instance

    Returns a LumiList object.
    """
    dbsurl = DBSURLS['reader'].get(dbsurl, dbsurl)
    dbs3api = DbsApi(url=dbsurl)
    try:
        files = dbs3api.listFileArray(dataset=dataset, validFileOnly=0, detail=True)
    except Exception as ex:
        msg = "Got DBS client error requesting details of dataset '%s' on DBS URL '%s': %s" % (dataset, dbsurl, ex)
        msg += "\n%s" % (traceback.format_exc())
        raise ClientException(msg)
    if not files:
        msg = "Dataset '%s' not found in DBS URL '%s'." % (dataset, dbsurl)
        raise ClientException(msg)
    validFiles = [f['logical_file_name'] for f in files if f['is_file_valid']]
    blocks = set([f['block_name'] for f in files])
    runLumiPairs = []
    for blockName in blocks:
        fileLumis = dbs3api.listFileLumis(block_name=blockName)
        for f in fileLumis:
            if f['logical_file_name'] in validFiles:
                run = f['run_num']
                lumis = f['lumi_section_num']
                for lumi in lumis:
                    runLumiPairs.append((run, lumi))
    lumiList = LumiList(lumis=runLumiPairs)
    return lumiList
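# Usage sketch for getLumiListInValidFiles(); the dataset name is a
# hypothetical placeholder. Assuming WMCore's LumiList, the result can be
# written out as a CRAB-style lumi mask with writeJSON().
lumi_list = getLumiListInValidFiles("/SomePrimary/someuser-tag-0123456789abcdef/USER", dbsurl='phys03')
lumi_list.writeJSON('validLumis.json')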
def get_filenames(bkgTXT, dataTXT, sigTXT):
    ## dasgoclient python API
    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    global_director = "root://cmsxrootd.fnal.gov/"
    ## Read out input files containing data set names
    if bkgTXT:
        with open(bkgTXT) as f:
            background = [background for background in f.read().splitlines() if background != ""]
    else:
        background = []
    if dataTXT:
        with open(dataTXT) as f:
            data = [data for data in f.read().splitlines() if data != ""]
    else:
        data = []
    if sigTXT:
        with open(sigTXT) as f:
            signal = [signal for signal in f.read().splitlines() if signal != ""]
    else:
        signal = []
    ## Fill file names in using dasgoclient API
    filelist = {}
    for setname in background + data:
        if "mc" in setname:
            key = setname.split("/")[1]
        else:
            key = setname.split("/")[1] + "-" + setname.split("/")[2]
        filelist[key] = [
            global_director + filename['logical_file_name']
            for filename in dbs.listFiles(dataset=setname, detail=1)
        ]
    ## Read out signal files with gfal-ls command
    for SEpath in signal:
        key = SEpath.split("/")[-2]
        signalFiles = subprocess.check_output(["gfal-ls", SEpath]).split("\n")[:-1]
        filelist[key] = [
            global_director + SEpath[74:] + "/" + signalFile
            for signalFile in signalFiles
        ]
    return filelist
def uploadWorker(workInput, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the workInput
    Get confirmation in the output
    """
    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s", dbsUrl)
    dbsApi = DbsApi(url=dbsUrl)
    while True:
        try:
            work = workInput.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break
        if work == 'STOP':
            # Then halt the process
            break
        name = work.get('name', None)  # this is the block name
        block = work.get('block', None)  # this is the block data structure
        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s", block)
            dbsApi.insertBulkBlock(blockDump=block)
            results.put({'name': name, 'success': "uploaded"})
        except Exception as ex:
            exString = str(ex)
            if 'Block %s already exists' % name in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.warning("Block %s already exists. Marking it as uploaded.", name)
                logging.debug("Exception: %s", exString)
                results.put({'name': name, 'success': "uploaded"})
            elif 'Proxy Error' in exString:
                # This is probably a successful insertion that went bad.
                # Put it on the check list
                msg = "Got a proxy error for block %s." % name
                logging.warning(msg)
                results.put({'name': name, 'success': "check"})
            elif 'Missing data when inserting to dataset_parents' in exString:
                msg = "Parent dataset is not inserted yet for block %s." % name
                logging.warning(msg)
                results.put({'name': name, 'success': "error", 'error': msg})
            else:
                msg = "Error trying to process block %s through DBS. Error: %s" % (name, exString)
                logging.exception(msg)
                logging.debug("block info: %s \n", block)
                results.put({'name': name, 'success': "error", 'error': msg})
    return
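# Sketch of how uploadWorker() can be driven: block dumps go in on one queue,
# per-block status dicts come back on another, and a 'STOP' sentinel ends the
# worker. The block name, empty block dump, and writer URL below are
# hypothetical placeholders, not taken from the functions above.
import multiprocessing

workQueue = multiprocessing.Queue()
resultQueue = multiprocessing.Queue()
worker = multiprocessing.Process(
    target=uploadWorker,
    args=(workQueue, resultQueue, "https://cmsweb.cern.ch/dbs/prod/global/DBSWriter"))
worker.start()
workQueue.put({'name': '/Primary/Processed/TIER#block-uuid', 'block': {}})  # placeholder block dump
workQueue.put('STOP')
print(resultQueue.get())  # e.g. {'name': ..., 'success': 'uploaded'} or 'error'
worker.join()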
def getDataTiers(dbsUrl):
    """
    Function to retrieve all the datatiers from DBS.
    NOTE: to be used with some caching (MemoryCacheStruct)
    :param dbsUrl: the DBS URL string
    :return: a list of strings/datatiers
    """
    dbs = DbsApi(dbsUrl)
    return [tier['data_tier_name'] for tier in dbs.listDataTiers()]
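# Usage sketch for getDataTiers(): lists every data tier known to the global
# DBS instance (e.g. AOD, MINIAOD, NANOAOD).
tiers = getDataTiers("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
print(sorted(tiers))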
def __init__(self, args):
    # just make sure args value complies with dbs args
    try:
        from dbs.apis.dbsClient import DbsApi
        DbsApi(args)
    except ImportError:
        # No dbsApi available, carry on
        pass
    self.args = args
def __init__(self, url, **contact):
    # instantiate dbs api object
    try:
        self.dbs = DbsApi(url, **contact)
    except DbsException as ex:
        msg = "Error in DBSReader with DbsApi\n"
        msg += "%s\n" % formatEx(ex)
        raise DBSReaderError(msg)
def getSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlocks(dataset=dataset, detail=True)
    total = 0  # avoid shadowing the built-in sum()
    for block in reply:
        total = total + block['block_size']
    return total
def getFileCount(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    cnt = 0
    for block in reply:
        cnt = cnt + int(block['num_file'])
    return cnt
def __init__(self, args, **contact):
    # just make sure args value complies with dbs args
    try:
        from dbs.apis.dbsClient import DbsApi
        DbsApi(args, **contact)
    except ImportError:
        # No dbsApi available, carry on
        pass
    self.args = args
    self.dbg = DataBlockGenerator3()
def _setDatatiersCache(ts, dbsUrl):
    """
    Set a timestamp and update the list of datatiers
    cached in the class property
    """
    dbs = DbsApi(dbsUrl)
    DBS3Reader._datatiers['ts'] = ts
    DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]
    return
def getEventCountBlock(block):
    """
    Returns the number of events in a block using DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve block summary
    reply = dbsapi.listBlockSummaries(block_name=block)
    return reply[0]['num_event']
def getNumberofFilesPerRun(das_url, dataset, run):
    """
    Count number of files in a dataset for a given run
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the file list, restricted to the requested run
    # (the original ignored the run argument and counted all files)
    reply = dbsapi.listFiles(dataset=dataset, run_num=run)
    return len(reply)
def getDatasetEventsPerLumi(dataset):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_files = dbsapi.listFileSummaries(dataset=dataset, validFileOnly=1)
    try:
        average = sum([f['num_event'] / float(f['num_lumi']) for f in all_files]) / float(len(all_files))
    except Exception:
        # e.g. no valid files: fall back to a default value
        average = 100
    return average
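# Usage sketch for getDatasetEventsPerLumi(); the dataset name is a
# hypothetical placeholder.
avg = getDatasetEventsPerLumi("/SomePrimary/SomeProcessed-v1/AODSIM")
print("average events per lumi section: %.1f" % avg)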
def __init__(self, url, logger=None, **contact):
    # instantiate dbs api object
    try:
        self.dbsURL = url.replace("cmsweb.cern.ch", "cmsweb-prod.cern.ch")
        self.dbs = DbsApi(self.dbsURL, **contact)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
    except dbsClientException as ex:
        msg = "Error in DBSReader with DbsApi\n"
        msg += "%s\n" % formatEx3(ex)
        raise DBSReaderError(msg)
def duplicateRunLumi(dataset, verbose=False, skipInvalid=False):
    """
    checks if output dataset has duplicate lumis for every run.
    returns true if at least one duplicate lumi was found,
    that is if the same lumi in the same run appears in two different files.
    This can be used on datasets that have separate runs.
    Verbose: if true prints details
    skipInvalid: if true skips invalid files; default is False because it is faster
    """
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    # check each run
    runs = getRunsDataset(dataset)
    # if only one run in the list
    if len(runs) == 1:
        if verbose:
            print("only one run:", runs)
        return duplicateLumi(dataset, verbose, skipInvalid)
    # else check run by run
    for run in runs:
        # create a set
        lumisChecked = {}
        # retrieve files for that run
        reply = dbsapi.listFiles(dataset=dataset, detail=skipInvalid)
        for f in reply:
            # skip invalid files
            if skipInvalid and f['is_file_valid'] != 1:
                continue
            logical_file_name = f['logical_file_name']
            # retrieve lumis for each file
            reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name, run_num=run)
            if reply2:
                lumis = reply2[0]['lumi_section_num']
            else:
                continue
            # check that each lumi is only in one file
            for lumi in lumis:
                if lumi in lumisChecked:
                    # if verbose print results, if not end quickly
                    if verbose:
                        print('Lumi', lumi, 'in run', run, 'is in these files')
                        print(logical_file_name)
                        print(lumisChecked[lumi])
                        duplicated = True
                    else:
                        return True
                else:
                    lumisChecked[lumi] = logical_file_name
    return duplicated