def __init__(self, group, name, *paths, **kwargs):
    """Container class for CMSSW samples, e.g.:
    - group: DY (used to group similar samples in final output)
    - name:  DYJetsToLL_M-50 (used as shorthand and jobname)
    - path:  /DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIIAutumn18NanoAODv6_Nano25Oct2019_102X_mcRun2/NANOAODSIM
    - dtype: 'mc', 'data', 'embed'
    """
    # PATH: require at least one DAS dataset path of the form /SAMPLE/CAMPAIGN/FORMAT
    LOG.insist(len(paths) >= 1, "Need at least one path to create a sample...")
    if len(paths) == 1 and isinstance(paths[0], list):  # allow a single list argument
        paths = paths[0]
    for path in paths:
        LOG.insist(path.count('/') >= 3 and path.startswith('/'),
                   "DAS path %r has wrong format. Need /SAMPLE/CAMPAIGN/FORMAT." % (path))
    #sample = '/'.join(line.split('/')[-3:])

    # DATA TYPE: given explicitly via dtype=..., else recognized from the first path
    dtype = kwargs.get('dtype', None)
    dtypes = ['mc', 'data', 'embed']
    if dtype == None:  # automatic recognition
        path = paths[0]
        if 'Embed' in path:
            dtype = 'embed'
        elif path.endswith('SIM') or any(g in path for g in ['pythia', 'madgraph']):
            dtype = 'mc'
        elif re.search(r"/Run20\d\d", path):
            dtype = 'data'
        # BUGFIX: the original unconditionally set dtype = 'mc' here (marked "TODO: remove"),
        # clobbering the 'embed'/'data' recognition above. Keep 'mc' only as a fallback
        # for paths that matched none of the patterns.
        if dtype == None:
            dtype = 'mc'
    LOG.insist(dtype in dtypes,
               "Given data type '%s' is not recognized! Please choose from %s..."  # typo fix: "recongized"
               % (dtype, ', '.join(dtypes)))

    # ATTRIBUTES
    self.group = group
    self.name = name
    self.paths = paths  # DAS dataset path
    self.dtype = dtype
    self.channels = kwargs.get('channel', None)
    self.channels = kwargs.get('channels', self.channels)
    self.storage = None
    self.storepath = kwargs.get('store', None)  # if stored elsewhere than DAS
    self.url = kwargs.get('url', None)  # URL if stored elsewhere
    self.dasurl = kwargs.get('dasurl', None) or "root://cms-xrd-global.cern.ch/"  # URL for DAS
    self.blacklist = kwargs.get('blacklist', [])  # black list file
    # NOTE(review): `path` below is whatever the last loop above assigned (paths[0] when
    # dtype was auto-recognized, else the last element of paths) — confirm this is intended.
    self.instance = kwargs.get('instance',
        'prod/phys03' if path.endswith('USER') else 'prod/global')  # if None, does not exist in DAS
    self.nfilesperjob = kwargs.get('nfilesperjob', -1)  # number of nanoAOD files per job
    self.maxevts = kwargs.get('maxevtsperjob', -1)  # maximum number of events processed per job
    self.maxevts = kwargs.get('maxevts', self.maxevts)  # maximum number of events processed per job
    self.extraopts = kwargs.get('opts', [])  # extra options for analysis module, e.g. ['doZpt=1','tes=1.1']
    self.subtry = kwargs.get('subtry', 0)  # to help keep track of resubmission
    self.jobcfg = kwargs.get('jobcfg', {})  # to help keep track of resubmission
    self.nevents = kwargs.get('nevts', 0)  # number of nanoAOD events that can be processed
    self.nevents = kwargs.get('nevents', self.nevents)  # cache of number of events
    self.files = kwargs.get('files', [])  # list of ROOT files, OR text file with list of files
    self.filenevts = {}  # cache of number of events for each file
    # BUGFIX: loadfiles() fills self.pathfiles[path] but nothing initialized it;
    # without this, the loadfiles() call at the end of __init__ raises AttributeError.
    self.pathfiles = {}  # file list per DAS dataset path
    self.postfix = kwargs.get('postfix', None) or ""  # post-fix (before '.root') for stored ROOT files
    self.era = kwargs.get('era', "")  # for expansion of $ERA variable
    self.dosplit = kwargs.get('split', len(self.paths) >= 2)  # allow splitting (if multiple DAS datasets)
    self.verbosity = kwargs.get('verbosity', 0)  # verbosity level for debugging
    self.refreshable = not self.files  # allow refresh on file list in getfiles()

    # ENSURE LIST
    if self.channels != None and not isinstance(self.channels, list):
        self.channels = [self.channels]
    if isinstance(self.extraopts, str):
        if ',' in self.extraopts:
            self.extraopts = self.extraopts.split(',')
        self.extraopts = [self.extraopts]

    # STORAGE & URL DEFAULTS
    if self.storepath:
        self.storepath = repkey(self.storepath, USER=_user, ERA=self.era,
                                GROUP=self.group, SAMPLE=self.name)
        self.storage = getstorage(repkey(self.storepath, PATH=self.paths[0], DAS=self.paths[0]),
                                  ensure=False)
    if not self.dasurl:
        self.dasurl = self.url if (self.url in dasurls) else dasurls[0]
    if not self.url:
        if self.storepath:
            if self.storage.__class__.__name__ == 'Local':
                self.url = ""  #root://cms-xrd-global.cern.ch/
            else:
                self.url = self.storage.fileurl
        else:
            self.url = self.dasurl

    # GET FILE LIST FROM TEXT FILE
    if isinstance(self.files, str):
        self.loadfiles(self.files)
def loadfiles(self, listname_, **kwargs):
    """Load filenames from text file for fast look up in future.

    The list file may contain lines of the form FILENAME(:NEVTS), optional
    '#' comments, and 'DASPATH=/...' markers that scope subsequent files to
    one of this sample's DAS dataset paths. Returns the sorted file list and
    caches per-file event counts in self.filenevts.
    """
    # BUGFIX: docstring was originally placed after the first statement, so it
    # was a no-op string expression rather than the method's docstring.
    verbosity = LOG.getverbosity(self, kwargs)
    listname = repkey(listname_, ERA=self.era, GROUP=self.group, SAMPLE=self.name)
    LOG.verb("loadfiles: listname=%r -> %r, len(files)=%d, len(filenevts)=%d" % (
        listname_, listname, len(self.files), len(self.filenevts)), verbosity, 1)
    filenevts = self.filenevts
    nevents = 0
    #listname = ensurefile(listname,fatal=False)
    filelist = []
    # one list file per path if the name contains $PATH, else a single list file
    paths = self.paths if '$PATH' in listname else [self.paths[0]]
    for path in paths:
        listname_ = repkey(listname, PATH=path.strip('/').replace('/', '__'))
        if self.verbosity >= 1:
            print(">>> Loading sample files from %r..." % (listname_))
        self.pathfiles[path] = []
        if os.path.isfile(listname_):
            skip = False
            subpaths = []  # for sanity check
            with open(listname_, 'r') as file:
                for line in file:
                    line = line.strip().split()  # split at space to allow comments at end
                    if not line:
                        continue
                    line = line[0].strip()  # remove spaces, consider only first part of the line
                    if line[0] == '#':
                        continue  # do not consider comments
                    #if line.endswith('.root'):
                    if line.startswith("DASPATH="):  # to keep track of multiple DAS data set paths
                        path = line.split('=')[-1]  # DAS data set path
                        LOG.insist(path.count('/') >= 3 and path.startswith('/'),
                                   "DAS path %r in %s has wrong format. Need /SAMPLE/CAMPAIGN/FORMAT..." % (path, listname_))
                        if path in self.paths:  # store file list for this path
                            self.pathfiles[path] = []
                            subpaths.append(path)
                            skip = False
                        else:  # do not store file list for this path
                            skip = True
                    else:
                        if skip:
                            continue  # only load files for this sample's DAS dataset paths
                        match = fevtsexp.match(line)  # match $FILENAME(:NEVTS)
                        if not match:
                            continue
                        infile = match.group(1)
                        if match.group(2):  # found nevents in filename
                            nevts = int(match.group(2))
                            filenevts[infile] = nevts  # store/cache in dictionary
                            nevents += nevts
                        filelist.append(infile)
                        self.pathfiles[path].append(infile)
                        if self.verbosity >= 3:
                            # NOTE(review): nevts may be stale (or unset) if this line had
                            # no :NEVTS suffix — preserved from original, confirm intended.
                            print(">>> %7d events for %s" % (nevts, infile))
            if not filelist:
                LOG.warning("loadfiles: Did not find any files in %s!" % (listname_))
                self.refreshable = True
            else:  # sanity check for empty list
                for subpath in subpaths:
                    if not self.pathfiles[subpath]:
                        LOG.warning("loadfiles: Did not find any files for path %s in %s!" % (subpath, listname_))
        else:
            LOG.warning("loadfiles: file list %s does not exist!" % (listname_))
            self.refreshable = True
    for path in self.paths:
        if path not in self.pathfiles:  # nonexistent list
            LOG.warning("loadfiles: Did not find any files for path %s in %s!" % (path, listname))
    if self.nevents <= 0:
        self.nevents = nevents
    elif self.nevents != nevents:
        LOG.warning("loadfiles: stored nevents=%d does not match the sum total of file events, %d!" % (self.nevents, nevents))
        # BUGFIX: original had `self.nevents == nevents` — a no-op comparison
        # where an assignment (updating the stale cache) was clearly intended.
        self.nevents = nevents
    self.files = filelist
    self.files.sort()
    return self.files