示例#1
0
def main_status(args):
  """Show the status of submitted jobs, or hadd their output into one file per sample.

  For each requested era and channel, the job configuration files in the job
  directory are turned into samples with getcfgsamples. Per sample, either
    - checkchuncks is called to print the successful/pending/failed/missing
      chunks (any subcommand other than 'hadd'), or
    - the chunk output is hadd'ed into a single file in the storage directory,
      after which the chunk output, job config and log files are removed if
      the cleanup option is set.
  Hadd'ing is skipped for skimming channels, and for samples for which
  checkchuncks reports files to resubmit unless the force option is set.

  Arguments:
    args: parsed command line options; the attributes read are verbosity, eras,
          channels, tag, checkdas, checkqueue, dtypes, samples, vetoes, force,
          subcommand, dryrun and, when hadd'ing only, cleanup.
  """
  verbosity = args.verbosity
  if verbosity>=1:
    print(">>> main_status %s"%(args,))

  # SETTINGS
  eras           = args.eras
  channels       = args.channels
  tag            = args.tag
  filters        = args.samples
  vetoes         = args.vetoes
  dtypes         = args.dtypes
  checkdas       = args.checkdas
  checkqueue     = args.checkqueue
  force          = args.force
  dryrun         = args.dryrun
  dohadd         = args.subcommand=='hadd'
  docleanup      = dohadd and args.cleanup # 'cleanup' is only looked up when hadd'ing
  cmdverb        = max(1,verbosity) # always echo the hadd and rm commands
  cfgpattern     = os.path.join(CONFIG.jobdir,"config/jobconfig_$CHANNEL$TAG_try[0-9]*.json")
  storedirformat = CONFIG.picodir
  queuedjobs     = [ ] # jobs in the batch queue, shared by all samples

  for era in eras:
    for channel in channels:
      print(header("%s, %s"%(era,channel)))

      # GET SAMPLES from the stored job configuration files
      jobcfgs = repkey(cfgpattern,ERA=era,SAMPLE='*',GROUP='*',CHANNEL=channel,TAG=tag)
      if verbosity>=1:
        settings = [('cwd',os.getcwd()), ('jobcfgs',jobcfgs), ('filters',filters),
                    ('vetoes',vetoes),   ('dtypes',dtypes)]
        for key, value in settings:
          print(">>> %-12s = %s"%(key,value))
      samples = getcfgsamples(jobcfgs,filter=filters,veto=vetoes,dtype=dtypes,verb=verbosity)
      if verbosity>=2:
        print(">>> Found samples: "+", ".join(repr(s.name) for s in samples))
      if dohadd and 'skim' in channel:
        LOG.warning("Hadding into one file not available for skimming...")
        print('')
        continue

      # LOOP over SAMPLES
      nmatched = 0 # number of samples available for this channel
      for sample in samples:
        if sample.channels and channel not in sample.channels:
          continue
        nmatched += 1
        print(">>> %s"%(bold(sample.name)))
        for path in sample.paths:
          print(">>> %s"%(bold(path)))

        # CHECK JOBS: skipped once a non-empty job list has been retrieved
        if checkqueue==1 and not queuedjobs:
          batch      = getbatch(CONFIG,verb=verbosity)
          queuedjobs = batch.jobs(verb=verbosity-1)

        # ONLY CHECK STATUS
        if not dohadd:
          outdir = sample.jobcfg['outdir']
          if verbosity>=1:
            print(">>> Checking job status for '%s'"%(sample.name))
            print(">>> %-12s = %r"%('outdir',outdir))
          checkchuncks(sample,channel=channel,tag=tag,jobs=queuedjobs,
                       checkqueue=checkqueue,das=checkdas,verb=verbosity)
          print('')
          continue

        # HADD: merge the output of all chunks into one file in the storage directory
        jobcfg   = sample.jobcfg
        jobdir   = jobcfg['jobdir']
        outdir   = jobcfg['outdir']
        storedir = repkey(storedirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name,
                                         DAS=sample.paths[0].strip('/'),GROUP=sample.group)
        storage  = getstorage(storedir,ensure=True,verb=verbosity)
        outfile  = '%s_%s%s.root'%(sample.name,channel,tag)
        infiles  = os.path.join(outdir,'*_%s%s_[0-9]*.root'%(channel,tag))
        cfgfiles = os.path.join(jobcfg['cfgdir'],'job*_%s%s_try[0-9]*.*'%(channel,tag))
        logfiles = os.path.join(jobcfg['logdir'],'*_%s%s_try[0-9]*.*.*.log'%(channel,tag))
        if verbosity>=1:
          print(">>> Hadd'ing job output for '%s'"%(sample.name))
          print(">>> %-12s = %r"%('jobdir',jobdir))
          print(">>> %-12s = %r"%('outdir',outdir))
          print(">>> %-12s = %r"%('storedir',storedir))
          print(">>> %-12s = %s"%('infiles',infiles))
          print(">>> %-12s = %r"%('outfile',outfile))
        resubfiles, _ = checkchuncks(sample,channel=channel,tag=tag,jobs=queuedjobs,
                                     checkqueue=checkqueue,das=checkdas,verb=verbosity)
        if resubfiles and not force:
          LOG.warning("Cannot hadd job output because %d chunks need to be resubmitted..."%(len(resubfiles))+
                      "Please use -f or --force to hadd anyway.")
          continue # note: no blank line is printed after this warning
        storage.hadd(infiles,outfile,dry=dryrun,verb=cmdverb)

        # CLEAN UP: remove only those file patterns that actually match something
        # TODO: check if hadd was successful with isvalid
        if docleanup:
          rmfileset = [infiles,cfgfiles,logfiles]
          rmfiles   = "".join(' '+pattern for pattern in rmfileset if glob.glob(pattern))
          if verbosity>=2:
            print(">>> %-12s = %s"%('rmfileset',rmfileset))
            print(">>> %-12s = %s"%('rmfiles',rmfiles))
          if rmfiles:
            execute("rm %s"%(rmfiles),dry=dryrun,verb=cmdverb)

        print('')

      if nmatched==0:
        print(">>> Did not find any samples.")
        print('')
示例#2
0
def main_submit(args):
  """Submit or resubmit jobs to the batch system.

  Iterates over the job configurations yielded by preparejobs. For each
  configuration with at least one chunk, one job is submitted to the batch
  system returned by getbatch, the job ID returned by batch.submit is appended
  to the configuration's 'jobids' list, and the configuration is written as
  JSON to its 'cfgname' file.

  Arguments:
    args: parsed command line options; 'verbosity' and 'dryrun' are read here,
          the object is passed on as a whole to preparejobs.

  For batch systems other than HTCondor and SLURM, LOG.throw is called with
  NotImplementedError.
  """
  verbosity = args.verbosity
  if verbosity>=1:
    print(">>> main_submit %s"%(args,))

  dryrun = args.dryrun
  batch  = getbatch(CONFIG,verb=verbosity+1)

  for jobcfg in preparejobs(args):
    cfgname = jobcfg['cfgname']
    jobdir  = jobcfg['jobdir']
    logdir  = jobcfg['logdir']
    joblist = jobcfg['joblist']
    jobname = jobcfg['jobname']
    nchunks = jobcfg['nchunks']
    if nchunks<=0:
      print(">>>   Nothing to resubmit!")
      continue

    # SUBMIT
    if batch.system=='HTCondor':
      script  = "python/batch/submit_HTCondor.sub"
      appcmds = ["initialdir=%s"%(jobdir),
                 "mylogfile='log/%s.$(ClusterId).$(ProcId).log'"%(jobname)]
      queue   = "arg from %s"%(joblist)
      option  = "" #-dry-run dryrun.log"
      jobid   = batch.submit(script,name=jobname,opt=option,app=appcmds,queue=queue,dry=dryrun)
    elif batch.system=='SLURM':
      script  = "python/batch/submit_SLURM.sh %s"%(joblist)
      # Use the same '.log' extension as the HTCondor log files, so that the
      # '*.log' pattern used for cleaning up after hadd'ing also applies here
      logfile = os.path.join(logdir,"%x.%A.%a.log") # $JOBNAME.o$JOBID.$TASKID.log
      jobid   = batch.submit(script,name=jobname,log=logfile,array=nchunks,dry=dryrun)
    #elif batch.system=='SGE':
    else:
      LOG.throw(NotImplementedError,"Submission for batch system '%s' has not been implemented (yet)..."%(batch.system))

    # WRITE JOBCONFIG
    jobcfg['jobids'].append(jobid)
    if verbosity>=1:
      print(">>> Creating config file '%s'..."%(cfgname))
    with open(cfgname,'w') as cfgfile:
      json.dump(jobcfg,cfgfile,indent=2)
示例#3
0
def preparejobs(args):
  """Help function to iterate over samples per given channel and era and prepare job config and list.

  For every sample of each requested era and channel this generator
    1) creates the job config and log directories,
    2) gets the input files: all files of the sample for a first submission,
       or the files returned by checkchuncks for a resubmission,
    3) splits the files into chunks and writes one command line per chunk to
       the job argument list file,
    4) yields the job configuration as an OrderedDict (with keys like
       'cfgname', 'joblist', 'jobname', 'nchunks', 'chunkdict', 'jobids').
  Writing the configuration to file and submitting is left to the caller.

  Arguments:
    args: parsed command line options; the attributes read are verbosity,
          subcommand, eras, channels, tag, dtypes, samples, vetoes, checkdas,
          checkqueue, prefetch, nfilesperjob, split_nfpj and testrun.
  """
  verbosity    = args.verbosity
  if verbosity>=1:
    print(">>> preparejobs %s"%(args,))

  resubmit     = args.subcommand=='resubmit'
  eras         = args.eras
  channels     = args.channels
  tag          = args.tag
  dtypes       = args.dtypes
  filters      = args.samples
  vetoes       = args.vetoes
  checkdas     = args.checkdas
  checkqueue   = args.checkqueue
  prefetch     = args.prefetch
  nfilesperjob = args.nfilesperjob
  split_nfpj   = args.split_nfpj
  jobs         = [ ] # jobs in the batch queue, shared by all samples

  # LOOP over ERAS
  for era in eras:
    moddict = { } # save time by loading samples and get their file list only once

    # LOOP over CHANNELS
    for channel in channels:
      print(header("%s, %s"%(era,channel)))

      # CHANNEL -> MODULE
      assert channel in CONFIG.channels, "Channel '%s' not found in the configuration file. Available: %s"%(channel,CONFIG.channels)
      module = CONFIG.channels[channel]
      if channel!='test' and 'skim' not in channel:
        ensuremodule(module)
      if verbosity>=1:
        print('-'*80)
        print(">>> %-12s = %r"%('channel',channel))
        print(">>> %-12s = %r"%('module',module))
        print(">>> %-12s = %s"%('filters',filters))
        print(">>> %-12s = %s"%('vetoes',vetoes))
        print(">>> %-12s = %r"%('dtypes',dtypes))

      # PROCESSOR: skimming and test channels run their module directly
      if 'skim' in channel or channel=='test':
        processor = module
      else:
        processor = "picojob.py"
      procpath  = os.path.join("python/processors",processor)
      if not os.path.isfile(procpath):
        LOG.throw(IOError,"Processor '%s' does not exist in '%s'..."%(processor,procpath))
      processor = os.path.abspath(procpath)
      if verbosity>=1:
        print(">>> %-12s = %r"%('processor',processor))
        print('-'*80)

      # GET SAMPLES
      jobdirformat = CONFIG.jobdir # for job config & log files
      outdirformat = CONFIG.nanodir if 'skim' in channel else CONFIG.outdir # for job output
      if resubmit:
        # TODO: allow user to resubmit given config file
        # The config files are named 'jobconfig_$CHANNEL$TAG_try$N.json' (see
        # cfgname below), so glob for this channel only; a wildcard in place of
        # the channel would also pick up the configs of all other channels.
        jobcfgs  = repkey(os.path.join(jobdirformat,"config/jobconfig_$CHANNEL$TAG_try[0-9]*.json"),
                          ERA=era,SAMPLE='*',GROUP='*',CHANNEL=channel,TAG=tag)
        if verbosity>=2:
          print(">>> %-12s = %s"%('cwd',os.getcwd()))
          print(">>> %-12s = %s"%('jobcfgs',jobcfgs))
        samples = getcfgsamples(jobcfgs,filter=filters,veto=vetoes,dtype=dtypes,verb=verbosity)
      else:
        assert era in CONFIG.eras, "Era '%s' not found in the configuration file. Available: %s"%(era,CONFIG.eras)
        samples = getsamples(era,channel=channel,tag=tag,dtype=dtypes,filter=filters,veto=vetoes,moddict=moddict,verb=verbosity)
      if verbosity>=2:
        print(">>> Found samples: "+", ".join(repr(s.name) for s in samples))

      # LOOP over SAMPLES
      found = False
      for sample in samples:
        if sample.channels and channel not in sample.channels:
          continue
        found = True
        print(">>> %s"%(bold(sample.name)))
        for path in sample.paths:
          print(">>> %s"%(bold(path)))

        # DIRECTORIES
        subtry        = sample.subtry+1 if resubmit else 1
        jobids        = sample.jobcfg.get('jobids',[ ])
        postfix       = "_%s%s"%(channel,tag)
        jobtag        = '_%s%s_try%d'%(channel,tag,subtry)
        # Only the first try has no '_try' suffix in its job name. Note that
        # str.rstrip('try1') would strip a set of characters and therefore also
        # mangle the name for tries like 11 or 21.
        jobname       = sample.name+(postfix.rstrip('_') if subtry==1 else jobtag)
        nfilesperjob_ = sample.nfilesperjob if sample.nfilesperjob>0 else nfilesperjob
        if split_nfpj>1: # divide the number of files per job, but keep at least one
          nfilesperjob_ = max(1,nfilesperjob_//split_nfpj)
        outdir        = repkey(outdirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name,
                                            DAS=sample.paths[0].strip('/'),GROUP=sample.group)
        jobdir        = ensuredir(repkey(jobdirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name,
                                                      DAS=sample.paths[0].strip('/'),GROUP=sample.group))
        cfgdir        = ensuredir(jobdir,"config")
        logdir        = ensuredir(jobdir,"log")
        cfgname       = "%s/jobconfig%s.json"%(cfgdir,jobtag)
        joblist       = '%s/jobarglist%s.txt'%(cfgdir,jobtag)
        if verbosity==1:
          print(">>> %-12s = %s"%('cfgname',cfgname))
          print(">>> %-12s = %s"%('joblist',joblist))
        elif verbosity>=2:
          print('-'*80)
          print(">>> Preparing job %ssubmission for '%s'"%("re" if resubmit else "",sample.name))
          print(">>> %-12s = %r"%('processor',processor))
          print(">>> %-12s = %r"%('jobname',jobname))
          print(">>> %-12s = %r"%('jobtag',jobtag))
          print(">>> %-12s = %r"%('postfix',postfix))
          print(">>> %-12s = %r"%('outdir',outdir))
          print(">>> %-12s = %r"%('cfgdir',cfgdir))
          print(">>> %-12s = %r"%('logdir',logdir))
          print(">>> %-12s = %r"%('cfgname',cfgname))
          print(">>> %-12s = %r"%('joblist',joblist))
          print(">>> %-12s = %s"%('try',subtry))
          print(">>> %-12s = %r"%('jobids',jobids))

        # CHECKS
        if os.path.isfile(cfgname):
          # TODO: check for running jobs
          # parentheses are needed: '%' binds more tightly than '+'
          LOG.warning(("Job configuration '%s' already exists and will be overwritten! "+
                       "Beware of conflicting job output!")%(cfgname))
        if not resubmit:
          cfgpattern = re.sub(r"(?<=try)\d+(?=\.json$)",r"*",cfgname)
          cfgnames   = [f for f in glob.glob(cfgpattern) if not f.endswith("_try1.json")]
          if cfgnames:
            LOG.warning("Job configurations for resubmission already exists! This can cause conflicting job output!"+
              "If you are sure you want to submit from scratch, please remove these files:\n>>>   "+"\n>>>   ".join(cfgnames))
        storage = getstorage(outdir,verb=verbosity,ensure=True)

        # GET FILES
        nevents = 0
        if resubmit: # resubmission
          if checkqueue==0 and not jobs: # check jobs only once
            batch = getbatch(CONFIG,verb=verbosity)
            jobs  = batch.jobs(verb=verbosity-1)
          infiles, chunkdict = checkchuncks(sample,outdir=outdir,channel=channel,tag=tag,jobs=jobs,
                                            checkqueue=checkqueue,das=checkdas,verb=verbosity)
          nevents = sample.jobcfg['nevents'] # updated in checkchuncks
        else: # first-time submission
          infiles   = sample.getfiles(verb=verbosity-1)
          if checkdas:
            nevents = sample.getnevents()
          chunkdict = { }
        if args.testrun:
          infiles = infiles[:2]
        if verbosity==1:
          print(">>> %-12s = %s"%('nfilesperjob',nfilesperjob_))
          print(">>> %-12s = %s"%('nfiles',len(infiles)))
        elif verbosity>=2:
          print(">>> %-12s = %s"%('nfilesperjob',nfilesperjob_))
          print(">>> %-12s = %s"%('nfiles',len(infiles)))
          print(">>> %-12s = [ "%('infiles'))
          for infile in infiles:
            print(">>>   %r"%infile)
          print(">>> ]")
          print(">>> %-12s = %s"%('nevents',nevents))

        # CHUNKS
        infiles.sort() # to have consistent order with resubmission
        chunks    = [ ] # chunk indices
        fchunks   = chunkify(infiles,nfilesperjob_) # file chunks
        nfiles    = len(infiles)
        nchunks   = len(fchunks)
        if verbosity>=1:
          print(">>> %-12s = %s"%('nchunks',nchunks))
        if verbosity>=2:
          print('-'*80)

        # WRITE JOB LIST with arguments per job
        if verbosity>=1:
          print(">>> Creating job list %s..."%(joblist))
        with open(joblist,'w') as listfile:
          ichunk = 0
          for fchunk in fchunks:
            while ichunk in chunkdict: # find the next free chunk index;
              ichunk += 1              # allows for different nfilesperjob on resubmission
            jobfiles = ' '.join(fchunk) # list of input files
            filetag  = postfix
            if 'skim' not in channel:
              filetag += "_%d"%(ichunk)
            jobcmd   = processor
            if 'skim' in channel:
              jobcmd += " -y %s --copydir %s -t %s --jec-sys"%(era,outdir,filetag)
            elif 'test' in channel:
              jobcmd += " -o %s -t %s"%(outdir,filetag) # input files are appended below
            else:
              jobcmd += " -y %s -c %s -M %s --copydir %s -t %s"%(era,channel,module,outdir,filetag)
            if prefetch:
              jobcmd += " -p"
            jobcmd += " -i %s"%(jobfiles) # add last
            if verbosity>=1:
              print(jobcmd)
            listfile.write(jobcmd+'\n')
            chunkdict[ichunk] = fchunk
            chunks.append(ichunk)

        # JSON CONFIG
        jobcfg = OrderedDict([
          ('time',str(datetime.now())),
          ('group',sample.group), ('paths',sample.paths), ('name',sample.name), ('nevents',nevents),
          ('channel',channel),    ('module',module),
          ('jobname',jobname),    ('jobtag',jobtag),      ('tag',tag),          ('postfix',postfix),
          ('try',subtry),         ('jobids',jobids),
          ('outdir',outdir),      ('jobdir',jobdir),      ('cfgdir',cfgdir),    ('logdir',logdir),
          ('cfgname',cfgname),    ('joblist',joblist),
          ('nfiles',nfiles),      ('files',infiles),      ('nfilesperjob',nfilesperjob_),
          ('nchunks',nchunks),    ('chunks',chunks),      ('chunkdict',chunkdict),
        ])

        # YIELD
        yield jobcfg
        print('')
        #if args.testrun:
        #  break # only run one sample

      if not found:
        print(">>> Did not find any samples.")
        if verbosity>=1:
          print(">>> %-8s = %s"%('filters',filters))
          print(">>> %-8s = %s"%('vetoes',vetoes))
示例#4
0
def checkchuncks(sample,**kwargs):
  """Help function to check jobs status: success, pending, failed or missing.

  The chunks in the sample's job configuration (sample.jobcfg) are compared to
  the jobs in the batch queue and to the output files found in the output
  directory, and a summary of good, pending, bad and missing chunks is printed.
  Skimming channels ('skim' in the channel name) are matched by input file name,
  other channels by the chunk index in the output file name.

  Arguments:
    sample:     sample object with a 'jobcfg' dictionary (keys used: chunkdict,
                jobids, joblist, postfix and nevents, plus outdir, channel and
                tag as fallbacks) and a 'getnevents' method
  Keyword arguments:
    outdir:     output directory to look in (default: jobcfg['outdir'])
    channel:    channel name (default: jobcfg['channel'])
    tag:        tag (default: jobcfg['tag'])
    checkqueue: if negative, or if 'jobs' is given, pending jobs are taken into
                account; unless it equals 1 (and 'jobs' is given) the job list
                is retrieved again from the batch system
    jobs:       list of jobs in the batch queue that was retrieved before
    das:        if true and jobcfg['nevents'] is 0, get the number of events
                with sample.getnevents and store it in jobcfg['nevents']
    verb:       verbosity level
  Returns:
    resubfiles: list of input files of the bad and missing chunks
    chunkdict:  the chunk dictionary of the job configuration (chunk index ->
                list of input files); bad and missing chunks are removed from
                it in place
  """
  outdir       = kwargs.get('outdir',      None)
  channel      = kwargs.get('channel',     None)
  tag          = kwargs.get('tag',         None)
  checkqueue   = kwargs.get('checkqueue', False)
  pendjobs     = kwargs.get('jobs',         [ ])
  checkdas     = kwargs.get('das',         True)
  verbosity    = kwargs.get('verb',           0)
  oldjobcfg    = sample.jobcfg
  chunkdict    = oldjobcfg['chunkdict'] # filenames
  jobids       = oldjobcfg['jobids']
  joblist      = oldjobcfg['joblist']
  postfix      = oldjobcfg['postfix']
  if outdir is None:
    outdir     = oldjobcfg['outdir']
  storage      = getstorage(outdir,ensure=True)
  if channel is None:
    channel    = oldjobcfg['channel']
  if tag is None:
    tag        = oldjobcfg['tag']
  noldchunks   = len(chunkdict) # = number of jobs
  goodchunks   = [ ] # good job output
  pendchunks   = [ ] # pending or running jobs
  badchunks    = [ ] # corrupted job output
  misschunks   = [ ] # missing job output
  resubfiles   = [ ] # files to resubmit (if bad or missing)

  # NUMBER OF EVENTS
  nprocevents = 0   # total number of processed events
  ndasevents  = oldjobcfg['nevents'] # total number of available events
  if checkdas and oldjobcfg['nevents']==0:
    ndasevents = sample.getnevents()
    oldjobcfg['nevents'] = ndasevents
  if verbosity>=2:
    print(">>> %-12s = %s"%('ndasevents',ndasevents))
  if verbosity>=3:
    print(">>> %-12s = %s"%('chunkdict',chunkdict))

  # CHECK PENDING JOBS
  if checkqueue<0 or pendjobs:
    batch = getbatch(CONFIG,verb=verbosity)
    if checkqueue!=1 or not pendjobs:
      pendjobs = batch.jobs(jobids,verb=verbosity-1) # get refreshed job list
    else:
      pendjobs = [j for j in pendjobs if j.jobid in jobids] # get new job list with right job id

  ###########################################################################
  # CHECK SKIMMED OUTPUT: nanoAOD format, one or more output files per job
  if 'skim' in channel: # and nfilesperjob>1:
    flagexp  = re.compile(r"-i (.+\.root)") #r"-i ((?:(?<! -).)+\.root[, ])"
    fpattern = "*%s.root"%(postfix)
    chunkexp = re.compile(r".+%s\.root"%(postfix))
    if verbosity>=2:
      print(">>> %-12s = %r"%('flagexp',flagexp.pattern))
      print(">>> %-12s = %r"%('fpattern',fpattern))
      print(">>> %-12s = %r"%('chunkexp',chunkexp.pattern))
      print(">>> %-12s = %s"%('checkqueue',checkqueue))
      print(">>> %-12s = %s"%('pendjobs',pendjobs))
      print(">>> %-12s = %s"%('jobids',jobids))

    # CHECK PENDING JOBS: find the chunk via the input files in the job arguments
    for job in pendjobs:
      if verbosity>=3:
        print(">>> Found job %r, status=%r, args=%r"%(job,job.getstatus(),job.args.rstrip()))
      if job.getstatus() in ['q','r']:
        if CONFIG.batch=='HTCondor':
          jobarg  = str(job.args)
        else: # look up the arguments of this task in the job list
          jobarg  = getline(joblist,job.taskid-1)
        matches = flagexp.findall(jobarg)
        if verbosity>=3:
          print(">>> matches =  %s"%(matches,))
        if not matches:
          continue
        infiles = [ ]
        for infile in matches[0].split():
          if not infile.endswith('.root'):
            break
          infiles.append(infile)
        LOG.insist(infiles,"Did not find any root files in %r, matches=%r"%(jobarg,matches))
        ichunk = -1
        for i in chunkdict:
          if all(f in chunkdict[i] for f in infiles):
            ichunk = i
            break
        LOG.insist(ichunk>=0,
                   "Did not find to which the input files of jobids %s belong! "%(jobids)+
                   "\nichunk=%s,\ninfiles=%s,\nchunkdict=%s"%(ichunk,infiles,chunkdict))
        LOG.insist(len(chunkdict[ichunk])==len(infiles),
                   "Mismatch between input files of jobids %s and chunkdict! "%(jobids)+
                   "\nichunk=%s,\ninfiles=%s,\nchunkdict[%s]=%s"%(ichunk,infiles,ichunk,chunkdict[ichunk]))
        pendchunks.append(ichunk)

    # CHECK OUTPUT FILES
    badfiles  = [ ]
    goodfiles = [ ]
    fnames    = storage.getfiles(filter=fpattern,verb=verbosity-1)
    if verbosity>=2:
      print(">>> %-12s = %s"%('pendchunks',pendchunks))
      print(">>> %-12s = %s"%('fnames',fnames))
    for fname in fnames:
      if verbosity>=2:
        print(">>>   Checking job output '%s'..."%(fname))
      infile = os.path.basename(fname.replace(postfix+".root",".root")) # reconstruct input file
      nevents = isvalid(fname) # check for corruption
      ichunk = -1
      fmatch = None
      for i in chunkdict:
        if fmatch:
          break
        for chunkfile in chunkdict[i]:
          if infile in chunkfile: # find chunk input file belongs to
            ichunk = i
            fmatch = chunkfile
            break
      if ichunk<0:
        if verbosity>=2:
          print(">>>   => No match...")
        #LOG.warning("Did not recognize output file '%s'!"%(fname))
        continue
      if ichunk in pendchunks:
        if verbosity>=2:
          print(">>>   => Pending...")
        continue
      if nevents<0:
        if verbosity>=2:
          print(">>>   => Bad nevents=%s..."%(nevents))
        badfiles.append(fmatch)
      else:
        if verbosity>=2:
          print(">>>   => Good, nevents=%s"%(nevents))
        nprocevents += nevents
        goodfiles.append(fmatch)

    # GET FILES for RESUBMISSION + sanity checks
    for ichunk in list(chunkdict): # copy of the keys, because chunks are removed in the loop
      if ichunk in pendchunks:
        continue
      chunkfiles = chunkdict[ichunk]
      if all(f in goodfiles for f in chunkfiles): # all files successful
        goodchunks.append(ichunk)
        continue
      bad = False # count each chunk only once: bad, else missing
      for fname in chunkfiles:
        LOG.insist(fname not in resubfiles,"Found file for chunk '%d' more than once: %s "%(ichunk,fname)+
                                           "Possible overcounting or conflicting job output file format!")
        if fname in badfiles:
          bad = True
          resubfiles.append(fname)
        elif fname not in goodfiles:
          resubfiles.append(fname)
      if bad:
        badchunks.append(ichunk)
      else:
        misschunks.append(ichunk)
      chunkdict.pop(ichunk) # only save good and pending chunks

  ###########################################################################
  # CHECK ANALYSIS OUTPUT: custom tree format, one output file per job
  else:
    # The file tag ends with '_$ICHUNK'. The underscore has to be matched
    # explicitly: a greedy '\w*' directly in front of the group would swallow
    # all but the last digit of the chunk index (e.g. 12 would be read as 2).
    flagexp  = re.compile(r"-t \w*_(\d+)")
    fpattern = "*%s_[0-9]*.root"%(postfix)
    chunkexp = re.compile(r".+%s_(\d+)\.root"%(postfix))
    if verbosity>=2:
      print(">>> %-12s = %r"%('flagexp',flagexp.pattern))
      print(">>> %-12s = %r"%('fpattern',fpattern))
      print(">>> %-12s = %r"%('chunkexp',chunkexp.pattern))
      print(">>> %-12s = %s"%('checkqueue',checkqueue))
      print(">>> %-12s = %s"%('pendjobs',pendjobs))
      print(">>> %-12s = %s"%('jobids',jobids))

    # CHECK PENDING JOBS: find the chunk via the file tag in the job arguments
    for job in pendjobs:
      if verbosity>=3:
        print(">>> Found job %r, status=%r, args=%r"%(job,job.getstatus(),job.args.rstrip()))
      if job.getstatus() in ['q','r']:
        if CONFIG.batch=='HTCondor':
          jobarg  = str(job.args)
        else: # look up the arguments of this task in the job list
          jobarg  = getline(joblist,job.taskid-1)
        matches = flagexp.findall(jobarg)
        if verbosity>=3:
          print(">>> matches =  %s"%(matches,))
        if not matches:
          continue
        ichunk = int(matches[0])
        LOG.insist(ichunk in chunkdict,"Found an impossible chunk %d for job %s.%s! "%(ichunk,job.jobid,job.taskid)+
                                       "Possible overcounting!")
        pendchunks.append(ichunk)

    # CHECK OUTPUT FILES
    fnames = storage.getfiles(filter=fpattern,verb=verbosity-1)
    if verbosity>=2:
      print(">>> %-12s = %s"%('pendchunks',pendchunks))
      print(">>> %-12s = %s"%('fnames',fnames))
    for fname in fnames:
      if verbosity>=2:
        print(">>>   Checking job output '%s'..."%(fname))
      match = chunkexp.search(fname)
      if not match:
        #LOG.warning("Did not recognize output file '%s'!"%(fname))
        continue
      ichunk = int(match.group(1))
      LOG.insist(ichunk in chunkdict,"Found an impossible chunk %d for file %s!"%(ichunk,fname)+
                                     "Possible overcounting or conflicting job output file format!")
      if ichunk in pendchunks:
        continue
      nevents = isvalid(fname) # check for corruption
      if nevents<0:
        if verbosity>=2:
          print(">>>   => Bad, nevents=%s"%(nevents))
        badchunks.append(ichunk)
        # TODO: remove file from outdir?
      else:
        if verbosity>=2:
          print(">>>   => Good, nevents=%s"%(nevents))
        nprocevents += nevents
        goodchunks.append(ichunk)

    # GET FILES for RESUBMISSION + sanity checks
    if verbosity>=2:
      print(">>> %-12s = %s"%('nprocevents',nprocevents))
    for ichunk in list(chunkdict): # copy of the keys, because chunks are removed in the loop
      count = goodchunks.count(ichunk)+pendchunks.count(ichunk)+badchunks.count(ichunk)
      LOG.insist(count in [0,1],"Found %d times chunk '%d' (good=%d, pending=%d, bad=%d). "%(
                                count,ichunk,goodchunks.count(ichunk),pendchunks.count(ichunk),badchunks.count(ichunk))+
                                "Possible overcounting or conflicting job output file format!")
      if count==0: # missing chunk
        misschunks.append(ichunk)
      elif ichunk not in badchunks: # good or pending chunk
        continue
      fchunk = chunkdict[ichunk]
      for fname in fchunk:
        LOG.insist(fname not in resubfiles,"Found file for chunk '%d' more than once: %s "%(ichunk,fname)+
                                           "Possible overcounting or conflicting job output file format!")
      resubfiles.extend(fchunk)
      chunkdict.pop(ichunk) # only save good and pending chunks

  ###########################################################################

  goodchunks.sort()
  pendchunks.sort()
  badchunks.sort()
  misschunks.sort()

  # PRINT
  def printchunks(jobden,label,text,col,show=False):
    """Print the number of chunks in the given list, and the chunk indices if show is true."""
    if jobden:
      ratio = color("%4d/%d"%(len(jobden),noldchunks),col,bold=False)
      label = color(label,col,bold=True)
      jlist = (": "+', '.join(str(j) for j in jobden)) if show else ""
      print(">>> %s %s - %s%s"%(ratio,label,text,jlist))
  rtext = ""
  if ndasevents>0: # show the fraction of processed events
    ratio = 100.0*nprocevents/ndasevents
    rcol  = 'green' if ratio>90. else 'yellow' if ratio>80. else 'red'
    rtext = ": "+color("%d/%d (%d%%)"%(nprocevents,ndasevents,ratio),rcol,bold=True)
  printchunks(goodchunks,'SUCCES', "Chunks with output in outdir"+rtext,'green')
  printchunks(pendchunks,'PEND',"Chunks with pending or running jobs",'white',True)
  printchunks(badchunks, 'FAIL', "Chunks with corrupted output in outdir",'red',True)
  printchunks(misschunks,'MISS',"Chunks with no output in outdir",'red',True)

  return resubfiles, chunkdict
示例#5
0
def testBatch(path,verb=0):
  """Submit a small test job to the batch system and check its status a few times.

  A task list is created with createtasks in the 'testBatch' directory and
  submitted with the batch system object returned by getbatch; the job list of
  the returned job ID is then retrieved 'nchecks' times, two seconds apart.

  NOTE(review): all settings are read from an 'args' object that is not defined
  in this function (presumably a module-level global with the parsed command
  line options - confirm). The 'path' and 'verb' parameters are not used.
  """

  # SETTINGS
  verbosity = args.verbosity
  dryrun    = args.dryrun    # prepare job and submit command, but do not submit
  ntasks    = args.ntasks    # only run a few test tasks per job
  nchecks   = args.nchecks   # number of times to check job status
  queue     = args.queue     # queue option for the batch system (job flavor for HTCondor)
  time      = args.time      # maximum time for the batch system
  batchopts = args.batchopts # extra options for the batch system
  jobname   = "testBatch"
  outdir    = ensuredir("testBatch")
  logdir    = ensuredir("testBatch/log")
  tasklist  = os.path.join(outdir,"testBatch.txt")

  # INITIALIZE
  LOG.header("__init__")
  batch = getbatch(args.batch,verb=verbosity+1)
  print(">>> %r"%(batch))
  settings = [
    ('jobname',jobname), ('ntasks',ntasks), ('nchecks',nchecks),
    ('outdir',outdir),   ('logdir',logdir), ('dryrun',dryrun),
    ('queue',queue),     ('time',time),     ('batchopts',batchopts),
    ('verbosity',verbosity),
  ]
  for key, value in settings:
    print(">>> %-10s = %s"%(key,value))
  print(">>> ")

  # PREPARE JOBS
  createtasks(tasklist,ntasks)

  # SUBMIT
  LOG.header("Submit")
  jkwargs = dict(name=jobname,opt=batchopts,dry=dryrun,
                 short=True,queue=queue,time=time) # key-word arguments for batch.submit
  system = batch.system
  if system=='HTCondor':
    if 'etp' in platform.node(): # use specific settings for KIT condor
      script = "python/batch/submit_HTCondor_KIT.sub"
    else:
      script = "python/batch/submit_HTCondor.sub"
    jkwargs['app'] = ["initialdir=%s"%(outdir),
                      "mylogfile='log/%s.$(ClusterId).$(ProcId).log'"%(jobname)]
  elif system=='SLURM':
    script = "python/batch/submit_SLURM.sh"
    jkwargs['log']   = os.path.join(logdir,"%x.%A.%a.log") # $JOBNAME.o$JOBID.$TASKID.log
    jkwargs['array'] = ntasks
  #elif system=='SGE':
  #elif system=='CRAB':
  else:
    LOG.throw(NotImplementedError,"Submission for batch system '%s' has not been implemented (yet)..."%(batch.system))
  jobid = batch.submit(script,tasklist,**jkwargs)
  print(">>> jobid: %s"%(jobid))

  # CHECK JOBS
  LOG.header("Check jobs")
  lastcheck = nchecks-1
  for icheck in range(nchecks):
    jobs = batch.jobs(jobid,verb=verbosity-1) # get refreshed job list
    print(">>>   job objects: %r"%(jobs))
    print(">>>   ")
    if icheck<lastcheck: # no need to wait after the last check
      sleep(2)