示例#1
0
文件: Sample.py 项目: whahmad/TauFW
 def getfiles(self, refresh=False, url=True, verb=0):
     """Get list of files from DAS.

     Parameters:
       refresh: if True, rederive the file list even if a cached one exists.
       url:     if True, prepend self.url to bare file paths.
       verb:    verbosity level for logging.
     Returns the (possibly cached) sorted list of ROOT file paths.
     """
     files = self.files  # cached list from a previous call
     if self.refreshable and (not files or refresh):
         files = []
         for path in self.paths:
             if self.storage:  # get files from storage system
                 sepath = repkey(self.storage, PATH=path).replace('//', '/')
                 storage = getstorage(sepath, verb=verb - 1)
                 outlist = storage.getfiles(url=url, verb=verb - 1)
             else:  # get files from DAS
                 dascmd = 'dasgoclient --query="file dataset=%s instance=%s"' % (
                     path, self.instance)  #--limit=0
                 LOG.verb(repr(dascmd), verb)
                 cmdout = execute(dascmd, verb=verb - 1)
                 # NOTE(review): os.linesep is '\r\n' on Windows while command
                 # output is usually '\n'-separated — confirm this split
                 outlist = cmdout.split(os.linesep)
             for line in outlist:  # filter root files
                 line = line.strip()
                 # keep only ROOT files that are not blacklisted
                 if line.endswith('.root') and not any(
                         f.endswith(line) for f in self.blacklist):
                     if url and self.url not in line and 'root://' not in line:
                         line = self.url + line  # prepend URL if missing
                     files.append(line)
         files.sort()  # for consistent list order
         self.files = files  # store cache for later calls
     return files
示例#2
0
文件: Sample.py 项目: slehti/TauFW
 def _getnevents(self,
                 das=True,
                 refresh=False,
                 tree='Events',
                 limit=-1,
                 checkfiles=False,
                 verb=0):
     """Get number of nanoAOD events from DAS (default), or from files on storage system (das=False).

     Parameters:
       das:        if True, query DAS for the total event count.
       refresh:    if True, recount even if a cached count exists.
       tree:       name of the TTree whose entries are counted (per-file mode).
       limit:      maximum number of files to consider (<0 = no limit).
       checkfiles: if True, count events file-by-file on the storage system.
       verb:       verbosity level for logging.
     Returns (nevents, filenevts) where filenevts maps filename -> events.
     """
     nevents = self.nevents  # cached total from a previous call
     filenevts = self.filenevts  # per-file event-count cache (shared with self)
     treename = tree
     if nevents <= 0 or refresh:
         if checkfiles or (self.storage and not das
                           ):  # get number of events from storage system
             files = self.getfiles(url=True,
                                   das=das,
                                   refresh=refresh,
                                   limit=limit,
                                   verb=verb)
             for fname in files:
                 nevts = getnevents(fname, treename)
                 filenevts[fname] = nevts  # cache
                 nevents += nevts
                 LOG.verb(
                     "_getnevents: Found %d events in %r." % (nevts, fname),
                     verb, 3)
         else:  # get number of events from DAS
             for daspath in self.paths:
                 nevents += getdasnevents(daspath,
                                          instance=self.instance,
                                          verb=verb - 1)
         if limit < 0:  # only cache the total when no file limit was applied
             self.nevents = nevents
     return nevents, filenevts
示例#3
0
 def loadjson(cfgname):
   """Initialize sample from job config JSON file.

   Parameters:
     cfgname: path to a '.json' or gzipped '.json.gz' job configuration file.
   Returns a Sample object rebuilt from the stored configuration.
   """
   if cfgname.endswith(".json.gz"):
     with gzip.open(cfgname,'rt') as file:
       data = file.read().strip()
       jobcfg = json.loads(data)
   else:
     with open(cfgname,'r') as file:
       jobcfg = json.load(file)
   for key, value in jobcfg.items():
     if isinstance(value,unicode):  # Python 2: normalize unicode values to str
       jobcfg[key] = str(value)
   # sanity check: these keys are required to rebuild the sample
   for key in ['group','name','paths','try','channel','chunkdict','dtype','extraopts']:
     LOG.insist(key in jobcfg,"Did not find key '%s' in job configuration %s"%(key,cfgname))
   jobcfg['config']    = str(cfgname)
   jobcfg['chunkdict'] = { int(k): v for k, v in jobcfg['chunkdict'].iteritems() }  # JSON keys are strings
   nfilesperjob        = int(jobcfg['nfilesperjob'])
   filenevts = jobcfg.get('filenevts',{ })
   dtype     = jobcfg['dtype']
   channels  = [jobcfg['channel']]
   opts      = [str(s) for s in jobcfg['extraopts']]
   subtry    = int(jobcfg['try'])
   nevents   = int(jobcfg['nevents'])
   sample    = Sample(jobcfg['group'],jobcfg['name'],jobcfg['paths'],dtype=dtype,channels=channels,
                      subtry=subtry,jobcfg=jobcfg,nfilesperjob=nfilesperjob,filenevts=filenevts,nevents=nevents,opts=opts)
   return sample
示例#4
0
文件: Sample.py 项目: slehti/TauFW
 def writefiles(self, listname, **kwargs):
     """Write filenames to text file for fast look up in future.

     Parameters:
       listname: output text file name; may contain $ERA/$GROUP/$SAMPLE keys.
       nevts:    (keyword) if True, append ':NEVTS' to each filename.
     Remaining keyword arguments are passed on to getfilenevts/getfiles.
     """
     writeevts = kwargs.pop('nevts', False)  # also write nevents to file
     listname = repkey(listname,
                       ERA=self.era,
                       GROUP=self.group,
                       SAMPLE=self.name)
     print ">>> Write list to %r..." % (listname)
     ensuredir(os.path.dirname(listname))
     # collect per-file event counts first if they should be written out
     filenevts = self.getfilenevts(checkfiles=True, **
                                   kwargs) if writeevts else None
     treename = kwargs.pop('tree', 'Events')  # do not pass to getfiles
     files = self.getfiles(**kwargs)
     with open(listname, 'w+') as lfile:
         for infile in files:
             if writeevts:
                 nevts = filenevts.get(infile, -1)
                 if nevts < 0:  # cache miss: count events directly from the file
                     LOG.warning(
                         "Did not find nevents of %s. Trying again..." %
                         (infile))
                     nevts = getnevents(infile, treename)
                 infile = "%s:%d" % (infile, nevts
                                     )  # write $FILENAM(:NEVTS)
             lfile.write(infile + '\n')
示例#5
0
def dasgoclient(query, **kwargs):
    """Help function to call dasgoclient and retrieve data set information.

    Parameters:
      query:    DAS query string, e.g. "file dataset=/A/B/C".
      verb:     (keyword) verbosity level for logging.
      instance: (keyword) DAS instance, e.g. 'prod/phys03'.
      limit:    (keyword) maximum number of results (0 = no limit).
      opts:     (keyword) extra command-line options for dasgoclient.
    Returns the raw command output as a string.
    Raises CalledProcessError if the dasgoclient call fails.
    """
    verbosity = kwargs.get('verb', 0)
    instance = kwargs.get('instance', "")
    limit = kwargs.get('limit', 0)
    option = kwargs.get('opts', "")
    if instance:
        query += " instance=%s" % (instance)
    dascmd = 'dasgoclient --query="%s"' % (query)
    if limit > 0:
        dascmd += " --limit=%d" % (limit)
    if option:
        dascmd += " " + option.strip()
    LOG.verb(repr(dascmd), verbosity)
    # keep the try block minimal: only the external call can raise, and
    # dascmd is now guaranteed to be bound when the handler formats it
    try:
        cmdout = execute(dascmd, verb=verbosity - 1)
    except CalledProcessError as e:
        print
        LOG.error(
            "Failed to call 'dasgoclient' command. Please make sure:\n"
            "  1) 'dasgoclient' command exists.\n"
            "  2) You have a valid VOMS proxy. Use 'voms-proxy-init -voms cms -valid 200:0' or 'source utils/setupVOMS.sh'.\n"
            "  3) The DAS dataset in '%s' exists!\n" % (dascmd))
        raise e
    return cmdout
示例#6
0
 def ls(self, *paths, **kwargs):
     """List the contents of the given directory on this storage element.

     Optional keyword arguments:
       verb:   verbosity level (defaults to self.verbosity).
       dry:    if True, do a dry run of the ls command.
       here:   if True, expand the path relative to the current location.
       lscol:  column index to extract from each output line.
       filter: glob pattern(s); only matching entries are kept.
     """
     verb = kwargs.get('verb', self.verbosity)
     dryrun = kwargs.get('dry', False)
     here = kwargs.get('here', False)
     lscol = kwargs.get('lscol', self.lscol)
     # inclusive filters with glob pattern, like '*' or '[0-9]' wildcards
     filters = ensurelist(kwargs.get('filter', []))
     path = self.expandpath(*paths, here=here)
     command = "%s %s%s" % (self.lscmd, self.lsurl, path)
     output = self.execute(command, fatal=False, dry=dryrun, verb=verb)
     delim = '\r\n' if '\r\n' in output else '\n'
     entries = output.split(delim)
     if isinstance(lscol, int):  # pick one column of each output line
         entries = [entry.split(' ')[lscol] for entry in entries]
     if entries and 'No such file or directory' in entries[0]:
         LOG.warning(entries[0])
         entries = []
     elif filters:  # keep only entries matching at least one pattern
         entries = [
             entry for entry in entries
             if any(fnmatch(entry, pattern) for pattern in filters)
         ]
     return entries
示例#7
0
 def _writefile(ofile,fname,prefix=""):
   """Help function: write one filename (plus its event count if requested)."""
   line = fname
   if writeevts: # append the number of events as $FILENAME:NEVTS
     nevts = filenevts.get(line,-1) # look up cached count
     if nevts<0: # not cached: read the count from the file itself
       LOG.warning("Did not find nevents of %s. Trying again..."%(line))
       nevts = getnevents(line,treename)
     line = "%s:%d"%(line,nevts) # write $FILENAM(:NEVTS)
   ofile.write(prefix+line+'\n')
示例#8
0
 def file(self, *paths, **kwargs):
     """Return the full path of a file, prefixed with a file URL if needed.

     Optional keyword 'ensure': if True, raise IOError when the file is missing.
     """
     ensure = kwargs.get('ensure', False)
     path = self.expandpath(*paths, here=True)
     if path.startswith(self.parent):  # path lives on this storage element
         path = self.fileurl + path
     if ensure and not self.exists(path):
         LOG.throw(IOError, "Did not find %s." % (path))
     return path
示例#9
0
def getdasnevents(daspath, **kwargs):
    """Return the number of events of the given DAS dataset (0 if not found)."""
    query = "summary dataset=%s" % (daspath)
    output = dasgoclient(query, **kwargs)
    if "nevents" not in output:
        LOG.warning(
            "getdasnevents: Could not get number of events from DAS for %r." %
            (daspath))
        return 0
    # parse the integer that follows '"nevents":' in the JSON-like summary
    return int(output.split('"nevents":')[1].split(',')[0])
示例#10
0
 def writefiles(self,listname,**kwargs):
   """Write filenames to text file for fast look up in future.
   If there is more than one DAS dataset path, write lists separately for each path.

   Parameters:
     listname: output text file; may contain $ERA/$GROUP/$SAMPLE/$PATH keys.
     nevts:    (keyword) if True, append ':NEVTS' to each filename.
   Remaining keyword arguments are passed on to Sample.getfiles.
   """
   kwargs    = kwargs.copy() # do not edit given dictionary
   writeevts = kwargs.pop('nevts',False) # also write nevents to file
   listname  = repkey(listname,ERA=self.era,GROUP=self.group,SAMPLE=self.name)
   ensuredir(os.path.dirname(listname))
   filenevts = self.getfilenevts(checkfiles=True,**kwargs) if writeevts else None
   treename  = kwargs.pop('tree','Events') # do not pass to Sample.getfiles
   kwargs.pop('ncores') # do not pass to Sample.getfiles
   kwargs['refresh'] = False # already got file list in Sample.filenevts
   files     = self.getfiles(**kwargs) # get right URL
   if not files:
     LOG.warning("writefiles: Did not find any files!")
   def _writefile(ofile,fname,prefix=""):
     """Help function to write individual files."""
     if writeevts: # add nevents at end of infile string
       nevts = filenevts.get(fname,-1) # retrieve from cache
       if nevts<0: # cache miss: count events directly from the file
         LOG.warning("Did not find nevents of %s. Trying again..."%(fname))
         nevts = getnevents(fname,treename) # get nevents from file
       fname = "%s:%d"%(fname,nevts) # write $FILENAM(:NEVTS)
     ofile.write(prefix+fname+'\n')
   # write one list per DAS path if listname contains $PATH, else a single list
   paths = self.paths if '$PATH' in listname else [self.paths[0]]
   for path in paths:
     listname_ = repkey(listname,PATH=path.strip('/').replace('/','__'))
     with open(listname_,'w+') as lfile:
       if '$PATH' in listname: # write only the file list of this path to this text file
         print ">>> Write %s files to list %r..."%(len(self.pathfiles[path]),listname_)
         for infile in self.pathfiles[path]:
           _writefile(lfile,infile)
       elif len(self.paths)<=1: # write file list for the only path
         # NOTE(review): the branches look swapped — the "(%d events)" format
         # is printed when self.nevents<=0; confirm the intended condition
         if self.nevents>0:
           print ">>> Write %s files to list %r..."%(len(files),listname_)
         else:
           print ">>> Write %s files (%d events) to list %r..."%(len(files),self.nevents,listname_)
         for infile in files:
           _writefile(lfile,infile)
       else: # divide up list per DAS dataset path
         if self.nevents>0:
           print ">>> Write %s files to list %r..."%(len(files),listname_)
         else:
           print ">>> Write %s files (%d events) to list %r..."%(len(files),self.nevents,listname_)
         for i, path in enumerate(self.paths):
           print ">>>   %3s files for %s..."%(len(self.pathfiles[path]),path)
           lfile.write("DASPATH=%s\n"%(path)) # write special line to text file, which loadfiles() can parse
           for infile in self.pathfiles[path]: # loop over this list (general list is sorted)
             LOG.insist(infile in files,"Did not find file %s in general list! %s"%(infile,files))
             _writefile(lfile,infile,prefix="  ")
           if i+1<len(self.paths): # add extra white line between blocks
             lfile.write("\n")
示例#11
0
 def ls(self,*paths,**kwargs):
   """List the contents of the given directory on this storage element."""
   verb    = kwargs.get('verb',self.verbosity)
   dryrun  = kwargs.get('dry', False)
   # inclusive filters with glob pattern, like '*' or '[0-9]' wildcards
   filters = ensurelist(kwargs.get('filter',[ ]))
   path    = self.expandpath(*paths)
   command = "%s %s%s"%(self.lscmd,self.lsurl,path)
   output  = self.execute(command,fatal=False,dry=dryrun,verb=verb)
   entries = output.split('\n')
   if entries and 'No such file or directory' in entries[0]:
     LOG.warning(entries[0])
     entries = [ ]
   elif filters: # keep only entries matching at least one pattern
     entries = [e for e in entries if any(fnmatch(e,f) for f in filters)]
   return entries
示例#12
0
文件: Sample.py 项目: whahmad/TauFW
 def getnevents(self, refresh=False, verb=0):
     """Get number of events of this sample from DAS.

     Parameters:
       refresh: if True, re-query DAS even if a cached count exists.
       verb:    verbosity level for logging.
     Returns the (possibly cached) total number of events over all paths.
     """
     nevents = self.nevents  # cached total from a previous call
     if nevents <= 0 or refresh:
         for path in self.paths:
             dascmd = 'dasgoclient --query="summary dataset=%s instance=%s"' % (
                 path, self.instance)
             LOG.verb(repr(dascmd), verb)
             cmdout = execute(dascmd, verb=verb - 1)
             if "nevents" in cmdout:
                 ndasevts = int(cmdout.split('"nevents":')[1].split(',')[0])
             else:
                 # BUGFIX: default to 0 so ndasevts is never unbound (or
                 # stale from a previous iteration) when DAS has no summary
                 ndasevts = 0
                 LOG.warning(
                     "Could not get number of events from DAS for %r." %
                     (self.name))
             nevents += ndasevts
         self.nevents = nevents
     return nevents
示例#13
0
 def ls(self, *paths, **kwargs):
     """List contents of the given directory, optionally glob-filtered."""
     verb = kwargs.get('verb', self.verbosity)
     dryrun = kwargs.get('dry', False)
     # filter with glob pattern, like '*' or '[0-9]' wildcards
     pattern = kwargs.get('filter', None)
     path = self.expandpath(*paths)
     command = "%s %s%s" % (self.lscmd, self.lsurl, path)
     output = self.execute(command, fatal=False, dry=dryrun, verb=verb)
     entries = output.split('\n')
     if entries and 'No such file or directory' in entries[0]:
         LOG.warning(entries[0])
         entries = []
     elif pattern:  # keep only entries matching the glob pattern
         entries = [e for e in entries if fnmatch(e, pattern)]
     return entries
示例#14
0
文件: Sample.py 项目: yihui-lai/TauFW
 def getnevents(self, das=True, refresh=False, treename='Events', verb=0):
     """Get number of nanoAOD events from DAS (default), or from files on storage system (das=False).

     Parameters:
       das:      if True, query DAS for the total event count.
       refresh:  if True, recount even if a cached count exists.
       treename: name of the TTree whose entries are counted (per-file mode).
       verb:     verbosity level for logging.
     Returns the (possibly cached) total number of events.
     """
     nevents = self.nevents  # cached total from a previous call
     if nevents <= 0 or refresh:
         if self.storage and not das:  # get number of events from storage system
             files = self.getfiles(url=True, refresh=refresh, verb=verb)
             for fname in files:
                 file = ensureTFile(fname)
                 tree = file.Get(treename)
                 if not tree:
                     # BUGFIX: report the actual tree name (was hard-coded
                     # 'Events') and close the file before skipping it,
                     # instead of leaking the open TFile
                     LOG.warning("getnevents: No %r tree in events in %r!" %
                                 (treename, fname))
                     file.Close()
                     continue
                 nevts = tree.GetEntries()
                 file.Close()
                 nevents += nevts
                 LOG.verb(
                     "getnevents: Found %d events in %r." % (nevts, fname),
                     verb, 3)
         else:  # get number of events from DAS
             for daspath in self.paths:
                 cmdout = dasgoclient("summary dataset=%s instance=%s" %
                                      (daspath, self.instance),
                                      verb=verb - 1)
                 if "nevents" in cmdout:
                     ndasevts = int(
                         cmdout.split('"nevents":')[1].split(',')[0])
                 else:
                     ndasevts = 0
                     LOG.warning(
                         "Could not get number of events from DAS for %r." %
                         (self.name))
                 nevents += ndasevts
         self.nevents = nevents
     return nevents
示例#15
0
文件: Sample.py 项目: slehti/TauFW
 def loadfiles(self, listname, **kwargs):
     """Load filenames from text file for fast look up in future."""
     listname = repkey(listname,
                       ERA=self.era,
                       GROUP=self.group,
                       SAMPLE=self.name)
     filenevts = self.filenevts
     nevents = 0
     if self.verbosity + 2 >= 1:
         print ">>> Loading sample files from '%r'" % (listname)
     ensurefile(listname, fatal=True)
     filelist = []
     with open(listname, 'r') as file:
         for line in file:
             line = line.strip().split()
             if not line: continue
             line = line[0].strip()  # remove spaces, one per line
             if line[0] == '#': continue  # do not consider out-commented
             #if v.endswith('.root'):
             match = fevtsexp.match(line)  # match $FILENAM(:NEVTS)
             if not match: continue
             infile = match.group(1)
             if match.group(2):  # found nevents in filename
                 nevts = int(match.group(2))
                 filenevts[infile] = nevts  # store/cache in dictionary
                 nevents += nevts
             filelist.append(infile)
     if self.nevents <= 0:
         self.nevents = nevents
     elif self.nevents != nevents:
         LOG.warning(
             "loadfiles: stored nevents=%d does not match the sum total of file events, %d!"
             % (self.nevents, nevents))
         self.nevents == nevents
     self.files = filelist
     self.files.sort()
     return self.files
示例#16
0
 def getfiles(self,das=False,refresh=False,url=True,limit=-1,verb=0):
   """Get list of files from storage system (default), or DAS (if no storage system of das=True).

   Parameters:
     das:     if True, prefer DAS over the storage system.
     refresh: if True, rederive the file list even if a cached one exists.
     url:     if True, ensure a URL prefix on each file; if False, strip it.
     limit:   maximum number of files per path (<=0 = no limit).
     verb:    verbosity level for logging.
   Returns a copy of the (possibly cached) sorted list of ROOT files.
   """
   LOG.verb("getfiles: das=%r, refresh=%r, url=%r, limit=%r, filelist=%r, len(files)=%d, len(filenevts)=%d"%(
     das,refresh,url,limit,self.filelist,len(self.files),len(self.filenevts)),verb,1)
   if self.filelist and not self.files: # get file list from text file for first time
     self.loadfiles(self.filelist)
   files = self.files # cache for efficiency
   url_  = self.dasurl if (das and self.storage) else self.url
   if self.refreshable and (not files or das or refresh): # (re)derive file list
     if not files or das:
       LOG.verb("getfiles: Retrieving files...",verb,2)
     else:
       LOG.verb("getfiles: Refreshing file list...",verb,2)
     files = [ ]
     for daspath in self.paths: # loop over DAS dataset paths
       self.pathfiles[daspath] = [ ]
       if (self.storage and not das) or (not self.instance): # get files from storage system
         postfix = self.postfix+'.root'
         sepath  = repkey(self.storepath,PATH=daspath,DAS=daspath).replace('//','/')
         outlist = self.storage.getfiles(sepath,url=url,verb=verb-1)
         if limit>0:
           outlist = outlist[:limit]
       else: # get files from DAS
         postfix = '.root'
         outlist = getdasfiles(daspath,instance=self.instance,limit=limit,verb=verb-1)
       for line in outlist: # filter root files
         line = line.strip()
         if line.endswith(postfix) and not any(f.endswith(line) for f in self.blacklist):
           if url and url_ not in line and 'root://' not in line:
             line = url_+line # prepend URL if missing
           files.append(line)
           self.pathfiles[daspath].append(line)
       self.pathfiles[daspath].sort()
       if not self.pathfiles[daspath]:
         LOG.warning("getfiles: Did not find any files for %s"%(daspath))
     files.sort() # for consistent list order
     if not das or not self.storage: # do not cache a DAS-side list over a storage one
       self.files = files # store cache for efficiency
   elif url and any(url_ not in f for f in files): # add url if missing
     files = [(url_+f if url_ not in f else f) for f in files]
   elif not url and any(url_ in f for f in files): # remove url
     files = [f.replace(url_,"") for f in files]
   return files[:] # pass copy to protect private self.files
示例#17
0
def testStorage(path,verb=0):
  """Interactively exercise the Storage interface for the given path:
  initialization, expandpath, ls, cp, exists, rm, mkdir and hadd.

  Parameters:
    path: storage path (or URL) to test.
    verb: verbosity level passed to each storage call.
  """

  # INITIALIZE
  LOG.header("__init__")
  #storage = ensuremodule(system,"PicoProducer.storage"
  storage = getstorage(path,ensure=True,verb=verb)
  print ">>> %r"%(storage)
  print ">>> %-10s = %s"%('path',storage.path)
  print ">>> %-10s = %s"%('rmcmd',storage.rmcmd)
  print ">>> %-10s = %s"%('lscmd',storage.lscmd)
  print ">>> %-10s = %s"%('mkdrcmd',storage.mkdrcmd)
  print ">>> %-10s = %s"%('cpcmd',storage.cpcmd)
  print ">>> %-10s = %s"%('tmpdir',storage.tmpdir)
  print ">>> "

  # EXPAND PATH
  LOG.header("expandpath")
  pathargs = [
    ('test.py',),
    ('$PATH/test.py',),
    ('foo','bar',),
  ]
  pathkwargs = [
    {'here':True},
    {'here':False},
  ]
  for patharg in pathargs:
    for pathkwarg in pathkwargs:
      LOG.color("storage.expandpath(%s,%s)"%(','.join(repr(a) for a in patharg),','.join("%s=%r"%(k,v) for k,v in pathkwarg.iteritems())))
      result = storage.expandpath(*patharg,**pathkwarg)
      print ">>>   %r"%(result)

  # LS
  LOG.header("ls")
  LOG.color("storage.ls(verb=%d)"%(verb))
  storage.ls(verb=verb)

  # CP
  LOG.header("cp")
  fname = createdummy("testStorage.txt")
  LOG.color("storage.cp(%r,verb=%d)"%(fname,verb))
  storage.cp(fname,verb=verb)
  storage.ls(verb=verb)

  # EXISTS
  LOG.header("exists")
  LOG.color("storage.exists(%r,verb=%d)"%(fname,verb))
  result = storage.exists(fname,verb=verb)
  print ">>>   %r"%(result)
  storage.ls(verb=verb)

  # RM
  LOG.header("rm")
  LOG.color("storage.rm(%r,verb=%d)"%(fname,verb))
  storage.rm(fname,verb=verb)
  storage.ls(verb=verb)

  # MKDIR
  LOG.header("mkdir")
  dirname = 'test'
  LOG.color("storage.mkdir(%r.verb=%d)"%(dirname,verb))
  storage.mkdir(dirname,verb=verb)
  storage.ls(verb=verb)
  storage.ls(dirname,verb=verb)

  # RM DIRECTORY
  # interactive confirmation before removing the test directory
  LOG.header("rm directory")
  submit = raw_input(">>> Careful! Do you really want to remove %r? [y/n] "%(storage.expandpath(dirname,here=True)))
  if submit=='y':
    LOG.color("storage.rm(%r,verb=%d)"%(dirname,verb))
    storage.rm(dirname,verb=verb)
    storage.ls(verb=verb)

  # HADD
  LOG.header("hadd")
  infiles = [createdummyroot("testStorage1.root"),createdummyroot("testStorage2.root")]
  outfile = "testStorage.root"
  for tmpdir in [True,]: #False
    LOG.color("storage.hadd(%r,%r,tmpdir=%s,verb=%d)"%(infiles,outfile,tmpdir,verb))
    storage.hadd(infiles,outfile,tmpdir=tmpdir,verb=verb)
    storage.ls(verb=verb)
    storage.rm(outfile,verb=verb)
示例#18
0
 def _getnevents(self,das=True,refresh=False,tree='Events',limit=-1,checkfiles=False,ncores=0,verb=0):
   """Get number of nanoAOD events from DAS (default), or from files on storage system (das=False).

   Parameters:
     das:        if True, query DAS for the total event count.
     refresh:    if True, recount even if a cached count exists.
     tree:       name of the TTree whose entries are counted (per-file mode).
     limit:      maximum number of files to consider (<=0 = no limit).
     checkfiles: if True, count events file-by-file on the storage system.
     ncores:     number of cores for parallel per-file counting (iterevts).
     verb:       verbosity level for logging.
   Returns (nevents, filenevts) where filenevts maps filename -> events.
   """
   LOG.verb("_getnevents: das=%r, refresh=%r, tree=%r, limit=%r, checkfiles=%r, filelist=%r, len(files)=%d, len(filenevts)=%d"%(
     das,refresh,tree,limit,checkfiles,self.filelist,len(self.files),len(self.filenevts)),verb,1)
   if self.filelist and not self.files: # get file list from text file for first time
     self.loadfiles(self.filelist)
   nevents   = self.nevents # cached total from a previous call
   filenevts = self.filenevts # per-file event-count cache (shared with self)
   bar       = None # loading bar (only shown for big file lists)
   if nevents<=0 or refresh:
     if checkfiles or (self.storage and not das): # get number of events per file from storage system
       LOG.verb("_getnevents: Get events per file (storage=%r, das=%r)..."%(self.storage,das),verb,2)
       files = self.getfiles(url=True,das=das,refresh=refresh,limit=limit,verb=verb)
       if verb<=0 and len(files)>=5:
         bar = LoadingBar(len(files),width=20,pre=">>> Getting number of events: ",counter=True,remove=True)
       for nevts, fname in iterevts(files,tree,filenevts,refresh,ncores=ncores,verb=verb):
         filenevts[fname] = nevts # cache
         nevents += nevts
         LOG.verb("_getnevents: Found %d events in %r."%(nevts,fname),verb,3)
         if bar:
            if self.nevents>0:
              bar.count("files, %d/%d events (%d%%)"%(nevents,self.nevents,100.0*nevents/self.nevents))
            else:
              bar.count("files, %d events"%(nevents))
     else: # get total number of events from DAS
       LOG.verb("_getnevents: Get total number of events per path (storage=%r, das=%r)..."%(self.storage,das),verb,2)
       for daspath in self.paths:
         nevts = getdasnevents(daspath,instance=self.instance,verb=verb-1)
         LOG.verb("_getnevents: %10d events for %s..."%(nevts,daspath),verb,2)
         nevents += nevts
     if limit<=0: # only cache the total when no file limit was applied
       self.nevents = nevents
   else:
     LOG.verb("_getnevents: Reusing old number of events (nevents=%r, refresh=%r)..."%(nevents,refresh),verb,2)
   return nevents, filenevts
示例#19
0
文件: Sample.py 项目: slehti/TauFW
    def __init__(self, group, name, *paths, **kwargs):
        """Container class for CMSSW samples, e.g.:
        - group: DY (used to group similar samples in final output)
        - name:  DYJetsToLL_M-50 (used as shorthand and jobname)
        - path:  /DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIIAutumn18NanoAODv6_Nano25Oct2019_102X_mcRun2/NANOAODSIM
        - dtype: 'mc', 'data', 'embed'
        """

        # PATH
        LOG.insist(
            len(paths) >= 1, "Need at least one path to create a sample...")
        if len(paths) == 1 and isinstance(paths[0], list):
            paths = paths[0]
        for path in paths:
            LOG.insist(
                path.count('/') >= 3 and path.startswith('/'),
                "DAS path %r has wrong format. Need /SAMPLE/CAMPAIGN/FORMAT." %
                (path))
            #sample = '/'.join(line.split('/')[-3:])

        # DATA TYPE
        dtype = kwargs.get('dtype', None)
        dtypes = ['mc', 'data', 'embed']
        if dtype == None:  # automatic recognition
            path = paths[0]
            if 'Embed' in path:
                dtype = 'embed'
            elif path.endswith('SIM') or any(g in path
                                             for g in ['pythia', 'madgraph']):
                dtype = 'mc'
            elif re.search(r"/Run20\d\d", path):
                dtype = 'data'
            # NOTE(review): this unconditionally overrides the auto-detected
            # type above; the original author marked it TODO
            dtype = 'mc'  # TODO: remove
        LOG.insist(
            dtype in dtypes,
            "Given data type '%s' is not recongized! Please choose from %s..."
            % (dtype, ', '.join(dtypes)))

        # ATTRIBUTES
        self.group = group
        self.name = name
        self.paths = paths  # DAS dataset path
        self.dtype = dtype
        self.channels = kwargs.get('channel', None)
        self.channels = kwargs.get('channels', self.channels)
        self.storage = None
        self.storepath = kwargs.get('store',
                                    None)  # if stored elsewhere than DAS
        self.url = kwargs.get('url', None)  # URL if stored elsewhere
        self.dasurl = kwargs.get(
            'dasurl', None) or "root://cms-xrd-global.cern.ch/"  # URL for DAS
        self.blacklist = kwargs.get('blacklist', [])  # black list file
        # NOTE(review): 'path' below is whatever the loops above left it as
        # (paths[0] in the auto-dtype branch, else the last path) — confirm
        self.instance = kwargs.get(
            'instance', 'prod/phys03' if path.endswith('USER') else
            'prod/global')  # if None, does not exist in DAS
        self.nfilesperjob = kwargs.get('nfilesperjob',
                                       -1)  # number of nanoAOD files per job
        self.maxevts = kwargs.get(
            'maxevtsperjob', -1)  # maximum number of events processed per job
        self.maxevts = kwargs.get(
            'maxevts',
            self.maxevts)  # maximum number of events processed per job
        self.extraopts = kwargs.get(
            'opts', []
        )  # extra options for analysis module, e.g. ['doZpt=1','tes=1.1']
        self.subtry = kwargs.get('subtry',
                                 0)  # to help keep track of resubmission
        self.jobcfg = kwargs.get('jobcfg',
                                 {})  # to help keep track of resubmission
        self.nevents = kwargs.get(
            'nevts', 0)  # number of nanoAOD events that can be processed
        self.nevents = kwargs.get('nevents',
                                  self.nevents)  # cache of number of events
        self.files = kwargs.get(
            'files', [])  # list of ROOT files, OR text file with list of files
        self.filenevts = {}  # cache of number of events for each file
        self.postfix = kwargs.get(
            'postfix',
            None) or ""  # post-fix (before '.root') for stored ROOT files
        self.era = kwargs.get('era', "")  # for expansion of $ERA variable
        self.dosplit = kwargs.get(
            'split',
            len(self.paths) >= 2)  # allow splitting (if multiple DAS datasets)
        self.verbosity = kwargs.get('verbosity',
                                    0)  # verbosity level for debugging
        self.refreshable = not self.files  # allow refresh on file list in getfiles()

        # ENSURE LIST
        if self.channels != None and not isinstance(self.channels, list):
            self.channels = [self.channels]
        if isinstance(self.extraopts, str):
            if ',' in self.extraopts:
                self.extraopts = self.extraopts.split(',')
            self.extraopts = [self.extraopts]

        # STORAGE & URL DEFAULTS
        if self.storepath:
            self.storepath = repkey(self.storepath,
                                    USER=_user,
                                    ERA=self.era,
                                    GROUP=self.group,
                                    SAMPLE=self.name)
            self.storage = getstorage(repkey(self.storepath,
                                             PATH=self.paths[0],
                                             DAS=self.paths[0]),
                                      ensure=False)
        if not self.dasurl:
            self.dasurl = self.url if (self.url in dasurls) else dasurls[0]
        if not self.url:
            if self.storepath:
                if self.storage.__class__.__name__ == 'Local':
                    self.url = ""  #root://cms-xrd-global.cern.ch/
                else:
                    self.url = self.storage.fileurl
            else:
                self.url = self.dasurl

        # GET FILE LIST FROM TEXT FILE
        if isinstance(self.files, str):
            self.loadfiles(self.files)
示例#20
0
 def loadfiles(self,listname_,**kwargs):
   """Load filenames from text file for fast look up in future.

   Parameters:
     listname_: text file with one filename per line, optionally suffixed
                with ':NEVTS'; may contain $ERA/$GROUP/$SAMPLE/$PATH keys.
   Returns the sorted list of filenames (also cached in self.files).
   """
   verbosity = LOG.getverbosity(self,kwargs)
   listname  = repkey(listname_,ERA=self.era,GROUP=self.group,SAMPLE=self.name)
   LOG.verb("loadfiles: listname=%r -> %r, len(files)=%d, len(filenevts)=%d"%(
     listname_,listname,len(self.files),len(self.filenevts)),verbosity,1)
   filenevts = self.filenevts # per-file event-count cache (shared with self)
   nevents   = 0
   #listname = ensurefile(listname,fatal=False)
   filelist = [ ]
   # one list per DAS path if listname contains $PATH, else a single list
   paths = self.paths if '$PATH' in listname else [self.paths[0]]
   for path in paths:
     listname_ = repkey(listname,PATH=path.strip('/').replace('/','__'))
     if self.verbosity>=1:
       print ">>> Loading sample files from %r..."%(listname_)
     self.pathfiles[path] = [ ]
     if os.path.isfile(listname_):
       skip = False
       subpaths = [ ] # for sanity check
       with open(listname_,'r') as file:
         for line in file:
           line = line.strip().split() # split at space to allow comments at end
           if not line: continue
           line = line[0].strip() # remove spaces, consider only first part of the line
           if line[0]=='#': continue # do not consider comments
           #if line.endswith('.root'):
           if line.startswith("DASPATH="): # to keep track of multiple DAS data set paths
             path = line.split('=')[-1] # DAS data set path
             LOG.insist(path.count('/')>=3 and path.startswith('/'),
               "DAS path %r in %s has wrong format. Need /SAMPLE/CAMPAIGN/FORMAT..."%(path,listname_))
             if path in self.paths: # store file list for this path
               self.pathfiles[path] = [ ]
               subpaths.append(path)
               skip = False
             else: # do not store file list for this path
               skip = True
           else:
             if skip: continue # only load files for this sample's DAS dataset paths
             match = fevtsexp.match(line) # match $FILENAM(:NEVTS)
             if not match: continue
             infile = match.group(1)
             if match.group(2): # found nevents in filename
               nevts  = int(match.group(2))
               filenevts[infile] = nevts # store/cache in dictionary
               nevents += nevts
             filelist.append(infile)
             self.pathfiles[path].append(infile)
             if self.verbosity>=3:
               # NOTE(review): nevts may be stale (or unbound on the first
               # file) when the line has no ':NEVTS' suffix — confirm
               print ">>> %7d events for %s"%(nevts,infile)
       if not filelist:
         LOG.warning("loadfiles: Did not find any files in %s!"%(listname_))
         self.refreshable = True
       else: # sanity check for empty list
         for subpath in subpaths:
           if not self.pathfiles[subpath]:
             LOG.warning("loadfiles: Did not find any files for path %s in %s!"%(subpath,listname_))
     else:
       LOG.warning("loadfiles: file list %s does not exist!"%(listname_))
       self.refreshable = True
   for path in self.paths:
     if path not in self.pathfiles: # nonexistent list
       LOG.warning("loadfiles: Did not find any files for path %s in %s!"%(path,listname))
   if self.nevents<=0:
     self.nevents = nevents
   elif self.nevents!=nevents:
     LOG.warning("loadfiles: stored nevents=%d does not match the sum total of file events, %d!"%(self.nevents,nevents))
     # NOTE(review): the next line is a no-op comparison; an assignment
     # ('self.nevents = nevents') was probably intended — confirm
     self.nevents == nevents
   self.files = filelist
   self.files.sort()
   return self.files
示例#21
0
            'list',
            'set',
            'rm',
            'write',
        ]
        for subcmd in subcmds:
            if args[0] in subcmd[:len(args[0])]:  # match abbreviation
                args[0] = subcmd
                break
    args = parser.parse_args(args)
    if hasattr(args, 'tag') and len(args.tag) >= 1 and args.tag[0] != '_':
        args.tag = '_' + args.tag

    # VERBOSITY
    if args.verbosity >= 2:
        SLOG.setverbosity(args.verbosity - 1)

    # SUBCOMMAND MAINs
    os.chdir(CONFIG.basedir)
    if args.subcommand == 'install':
        main_install(args)
    if args.subcommand == 'list':
        main_list(args)
    elif args.subcommand == 'get':
        main_get(args)
    elif args.subcommand == 'set':
        main_set(args)
    elif args.subcommand == 'write':
        main_write(args)
    elif args.subcommand in ['channel', 'era']:
        main_link(args)
示例#22
0
def testStorage(path, readonly=False, hadd=True, verb=0):

    # INITIALIZE
    LOG.header("__init__")
    #storage = ensuremodule(system,"PicoProducer.storage"
    storage = getstorage(path, ensure=True, verb=verb)
    print ">>> %r" % (storage)
    print ">>> %-10s = %s" % ('path', storage.path)
    print ">>> %-10s = %s" % ('rmcmd', storage.rmcmd)
    print ">>> %-10s = %s" % ('lscmd', storage.lscmd)
    print ">>> %-10s = %s" % ('mkdrcmd', storage.mkdrcmd)
    print ">>> %-10s = %s" % ('cpcmd', storage.cpcmd)
    print ">>> %-10s = %s" % ('tmpdir', storage.tmpdir)
    print ">>> "

    # EXPAND PATH
    LOG.header("expandpath")
    pathargs = [
        ('test.py', ),
        ('$PATH/test.py', ),
        (
            'foo',
            'bar',
        ),
    ]
    pathkwargs = [
        {
            'here': True
        },
        {
            'here': False
        },
    ]
    for patharg in pathargs:
        for pathkwarg in pathkwargs:
            LOG.color("storage.expandpath(%s,%s)" %
                      (','.join(repr(a) for a in patharg), ','.join(
                          "%s=%r" % (k, v) for k, v in pathkwarg.iteritems())))
            result = storage.expandpath(*patharg, **pathkwarg)
            print ">>>   %r" % (result)

    # LS
    LOG.header("ls")
    LOG.color("storage.ls(verb=%d)" % (verb))
    contents = storage.ls(verb=verb)
    print ">>> Found %d items" % (len(contents))
    print ">>> Contents: %s" % (contents)

    # FILES
    LOG.header("getfiles")
    LOG.color("storage.getfiles(verb=%d)" % (verb))
    contents = storage.getfiles(verb=verb)
    print ">>> Found %d items" % (len(contents))
    print ">>> Contents: %s" % (contents)
    print ">>> "
    LOG.color("storage.getfiles(filter='*.*',verb=%d)" % (verb))
    contents = storage.getfiles(filter='*.*', verb=verb)
    print ">>> Found %d files" % (len(contents))
    print ">>> Contents: %s" % (contents)
    print ">>> "
    LOG.color("storage.getfiles(filter='*.*',url=None,verb=%d)" % (verb))
    contents = storage.getfiles(filter='*.*', url=None, verb=verb)
    print ">>> Found %d files" % (len(contents))
    print ">>> Contents: %s" % (contents)

    if readonly:
        print ">>> Read only. Skip test for cp, rm, mkdir, hadd..."
        return

    # CP
    LOG.header("cp")
    fname = createdummy("testStorage.txt")
    LOG.color("storage.cp(%r,verb=%d)" % (fname, verb))
    storage.cp(fname, verb=verb)
    storage.ls(verb=verb)

    # EXISTS
    LOG.header("exists")
    LOG.color("storage.exists(%r,verb=%d)" % (fname, verb))
    result = storage.exists(fname, verb=verb)
    print ">>> Exists: %r" % (result)
    storage.ls(verb=verb)

    # RM
    LOG.header("rm")
    LOG.color("storage.rm(%r,verb=%d)" % (fname, verb))
    try:
        storage.rm(fname, verb=verb)
    except Exception as error:
        print error
    storage.ls(verb=verb)

    # MKDIR
    LOG.header("mkdir")
    dirname = 'test'
    LOG.color("storage.mkdir(%r.verb=%d)" % (dirname, verb))
    try:
        storage.mkdir(dirname, verb=verb)
        storage.ls(verb=verb)
        storage.ls(dirname, here=True, verb=verb)
        result = storage.exists(dirname, verb=verb)
        print ">>> Exists: %r" % (result)
    except Exception as error:
        print error

    # RM DIRECTORY
    LOG.header("rm directory")
    submit = raw_input(">>> Careful! Do you really want to remove %r? [y/n] " %
                       (storage.expandpath(dirname, here=True)))
    if submit == 'y':
        LOG.color("storage.rm(%r,verb=%d)" % (dirname, verb))
        try:
            storage.rm(dirname, verb=verb)
            storage.ls(verb=verb)
        except Exception as error:
            print error

    # HADD
    if hadd:
        LOG.header("hadd")
        infiles = [
            createdummyroot("testStorage1.root"),
            createdummyroot("testStorage2.root")
        ]
        outfile = "testStorage.root"
        for tmpdir in [
                True,
        ]:  #False
            LOG.color("storage.hadd(%r,%r,tmpdir=%s,verb=%d)" %
                      (infiles, outfile, tmpdir, verb))
            try:
                storage.hadd(infiles, outfile, tmpdir=tmpdir, verb=verb)
                storage.ls(verb=verb)
                storage.rm(outfile, verb=verb)
            except Exception as error:
                print error