def main_status(args):
  """Check status of jobs (successful/pending/failed/missing), or hadd job output.
  
  Driven by the parsed command-line namespace `args`. If args.subcommand=='hadd',
  the per-chunk output files of each sample are merged into one file in the storage
  directory (optionally cleaning up inputs/configs/logs afterwards); otherwise only
  the job status is checked via checkchuncks. All results are printed to stdout.
  """
  if args.verbosity>=1:
    print ">>> main_status", args
  
  # SETTING: unpack command-line options and global configuration once
  eras           = args.eras
  channels       = args.channels
  tag            = args.tag
  checkdas       = args.checkdas
  checkqueue     = args.checkqueue
  dtypes         = args.dtypes
  filters        = args.samples
  vetoes         = args.vetoes
  force          = args.force
  hadd           = args.subcommand=='hadd' # hadd job output instead of only checking status
  cleanup        = args.cleanup if hadd else False # remove inputs/configs/logs after hadd
  dryrun         = args.dryrun
  verbosity      = args.verbosity
  cmdverb        = max(1,verbosity) # verbosity for executed shell commands
  outdirformat   = CONFIG.outdir    # format string for job output directory
  jobdirformat   = CONFIG.jobdir    # format string for job config & log directory
  storedirformat = CONFIG.picodir   # format string for final storage directory
  jobs           = [ ]              # cached batch-queue job list (filled at most once below)
  
  # LOOP over ERAS
  for era in eras:
    
    # LOOP over CHANNELS
    for channel in channels:
      print header("%s, %s"%(era,channel))
      
      # GET SAMPLES: glob existing job configs for this channel/tag
      jobcfgs = repkey(os.path.join(jobdirformat,"config/jobconfig_$CHANNEL$TAG_try[0-9]*.json"),
                       ERA=era,SAMPLE='*',GROUP='*',CHANNEL=channel,TAG=tag)
      if verbosity>=1:
        print ">>> %-12s = %s"%('cwd',os.getcwd())
        print ">>> %-12s = %s"%('jobcfgs',jobcfgs)
        print ">>> %-12s = %s"%('filters',filters)
        print ">>> %-12s = %s"%('vetoes',vetoes)
        print ">>> %-12s = %s"%('dtypes',dtypes)
      samples = getcfgsamples(jobcfgs,filter=filters,veto=vetoes,dtype=dtypes,verb=verbosity)
      if verbosity>=2:
        print ">>> Found samples: "+", ".join(repr(s.name) for s in samples)
      if hadd and 'skim' in channel:
        # skimmed (nanoAOD) output has one or more files per job; merging is not supported
        LOG.warning("Hadding into one file not available for skimming...")
        print
        continue
      
      # SAMPLE over SAMPLES
      found = False
      for sample in samples:
        if sample.channels and channel not in sample.channels: continue
        found = True
        print ">>> %s"%(bold(sample.name))
        for path in sample.paths:
          print ">>> %s"%(bold(path))
        
        # CHECK JOBS ONLY ONCE: one queue query reused for all samples
        if checkqueue==1 and not jobs:
          batch = getbatch(CONFIG,verb=verbosity)
          jobs  = batch.jobs(verb=verbosity-1)
        
        # HADD: merge per-chunk output files into one file in the storage directory
        if hadd:
          jobdir   = sample.jobcfg['jobdir']
          outdir   = sample.jobcfg['outdir']
          storedir = repkey(storedirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name,
                            DAS=sample.paths[0].strip('/'),GROUP=sample.group)
          storage  = getstorage(storedir,ensure=True,verb=verbosity)
          outfile  = '%s_%s%s.root'%(sample.name,channel,tag)
          infiles  = os.path.join(outdir,'*_%s%s_[0-9]*.root'%(channel,tag))  # glob pattern, not a list
          cfgfiles = os.path.join(sample.jobcfg['cfgdir'],'job*_%s%s_try[0-9]*.*'%(channel,tag))
          logfiles = os.path.join(sample.jobcfg['logdir'],'*_%s%s_try[0-9]*.*.*.log'%(channel,tag))
          if verbosity>=1:
            print ">>> Hadd'ing job output for '%s'"%(sample.name)
            print ">>> %-12s = %r"%('jobdir',jobdir)
            print ">>> %-12s = %r"%('outdir',outdir)
            print ">>> %-12s = %r"%('storedir',storedir)
            print ">>> %-12s = %s"%('infiles',infiles)
            print ">>> %-12s = %r"%('outfile',outfile)
          # refuse to hadd incomplete samples unless the user forces it
          resubfiles, chunkdict = checkchuncks(sample,channel=channel,tag=tag,jobs=jobs,
                                               checkqueue=checkqueue,das=checkdas,verb=verbosity)
          if len(resubfiles)>0 and not force:
            LOG.warning("Cannot hadd job output because %d chunks need to be resubmitted..."%(len(resubfiles))+
                        "Please use -f or --force to hadd anyway.")
            continue
          #haddcmd = 'hadd -f %s %s'%(outfile,infiles)
          #haddout = execute(haddcmd,dry=dryrun,verb=max(1,verbosity))
          haddout = storage.hadd(infiles,outfile,dry=dryrun,verb=cmdverb)
          #os.system(haddcmd)
          
          # CLEAN UP: remove input chunks, job configs and logs of the merged sample
          # TODO: check if hadd was successful with isvalid
          if cleanup:
            rmfiles   = ""
            rmfileset = [infiles,cfgfiles,logfiles]
            for files in rmfileset:
              # only pass patterns that actually match something to 'rm'
              if len(glob.glob(files))>0:
                rmfiles += ' '+files
            if verbosity>=2:
              print ">>> %-12s = %s"%('rmfileset',rmfileset)
              print ">>> %-12s = %s"%('rmfiles',rmfiles)
            if rmfiles:
              rmcmd = "rm %s"%(rmfiles)
              rmout = execute(rmcmd,dry=dryrun,verb=cmdverb)
        
        # ONLY CHECK STATUS
        else:
          outdir = sample.jobcfg['outdir']
          if verbosity>=1:
            print ">>> Checking job status for '%s'"%(sample.name)
            print ">>> %-12s = %r"%('outdir',outdir)
          checkchuncks(sample,channel=channel,tag=tag,jobs=jobs,
                       checkqueue=checkqueue,das=checkdas,verb=verbosity)
        
        print
      
      if not found:
        print ">>> Did not find any samples."
        print
def main_submit(args):
  """Submit or resubmit jobs to the batch system.
  
  Iterates over the job configurations yielded by preparejobs(args), submits each
  as one array/cluster job (HTCondor or SLURM), appends the returned job id to the
  config, and writes the config out as JSON so the status/resubmit commands can
  find it later.
  """
  if args.verbosity>=1:
    print ">>> main_submit", args
  
  verbosity = args.verbosity
  force     = args.force #or True
  dryrun    = args.dryrun #or True
  batch     = getbatch(CONFIG,verb=verbosity+1)
  
  for jobcfg in preparejobs(args):
    cfgname = jobcfg['cfgname']
    jobdir  = jobcfg['jobdir']
    logdir  = jobcfg['logdir']
    outdir  = jobcfg['outdir']
    joblist = jobcfg['joblist'] # text file with one task command line per chunk
    jobname = jobcfg['jobname']
    nchunks = jobcfg['nchunks']
    if nchunks<=0:
      # on resubmission all chunks may already be done or pending
      print ">>> Nothing to resubmit!"
      continue
    if batch.system=='HTCondor':
      # one cluster job; each task reads its arguments from a line of joblist
      script = "python/batch/submit_HTCondor.sub"
      appcmds = ["initialdir=%s"%(jobdir),
                 "mylogfile='log/%s.$(ClusterId).$(ProcId).log'"%(jobname)]
      queue = "arg from %s"%(joblist)
      option = "" #-dry-run dryrun.log"
      jobid = batch.submit(script,name=jobname,opt=option,app=appcmds,queue=queue,dry=dryrun)
    elif batch.system=='SLURM':
      # one array job with nchunks tasks
      script = "python/batch/submit_SLURM.sh %s"%(joblist)
      logfile = os.path.join(logdir,"%x.%A.%a") # $JOBNAME.o$JOBID.$TASKID
      jobid = batch.submit(script,name=jobname,log=logfile,array=nchunks,dry=dryrun)
    #elif batch.system=='SGE':
    else:
      LOG.throw(NotImplementedError,"Submission for batch system '%s' has not been implemented (yet)..."%(batch.system))
    ## SUBMIT
    #if args.force:
    #  jobid = batch.submit(*jargs,**jkwargs)
    #else:
    #  while True:
    #    submit = raw_input(">>> Do you also want to submit %d jobs to the batch system? [y/n] "%(nchunks))
    #    if any(s in submit.lower() for s in ['quit','exit']):
    #      exit(0)
    #    elif 'force' in submit.lower():
    #      submit = 'y'
    #      args.force = True
    #    if 'y' in submit.lower():
    #      jobid = batch.submit(*jargs,**jkwargs)
    #      break
    #    elif 'n' in submit.lower():
    #      print "Not submitting."
    #      break
    #    else:
    #      print "'%s' is not a valid answer, please choose y/n."%submit
    #print
    
    # WRITE JOBCONFIG: record the new job id so checkchuncks can match queue entries
    jobcfg['jobids'].append(jobid)
    if verbosity>=1:
      print ">>> Creating config file '%s'..."%(cfgname)
    with open(cfgname,'w') as file:
      json.dump(jobcfg,file,indent=2)
def preparejobs(args): """Help function to iterate over samples per given channel and era and prepare job config and list.""" if args.verbosity>=1: print ">>> preparejobs", args resubmit = args.subcommand=='resubmit' eras = args.eras channels = args.channels tag = args.tag dtypes = args.dtypes filters = args.samples vetoes = args.vetoes checkdas = args.checkdas checkqueue = args.checkqueue prefetch = args.prefetch nfilesperjob = args.nfilesperjob split_nfpj = args.split_nfpj verbosity = args.verbosity jobs = [ ] # LOOP over ERAS for era in eras: moddict = { } # save time by loading samples and get their file list only once # LOOP over CHANNELS for channel in channels: print header("%s, %s"%(era,channel)) # CHANNEL -> MODULE assert channel in CONFIG.channels, "Channel '%s' not found in the configuration file. Available: %s"%(channel,CONFIG.channels) module = CONFIG.channels[channel] if channel!='test' and 'skim' not in channel: ensuremodule(module) if verbosity>=1: print '-'*80 print ">>> %-12s = %r"%('channel',channel) print ">>> %-12s = %r"%('module',module) print ">>> %-12s = %s"%('filters',filters) print ">>> %-12s = %s"%('vetoes',vetoes) print ">>> %-12s = %r"%('dtypes',dtypes) # PROCESSOR if 'skim' in channel: processor = module elif channel=='test': processor = module else: processor = "picojob.py" procpath = os.path.join("python/processors",processor) if not os.path.isfile(procpath): LOG.throw(IOError,"Processor '%s' does not exist in '%s'..."%(processor,procpath)) processor = os.path.abspath(procpath) if verbosity>=1: print ">>> %-12s = %r"%('processor',processor) print '-'*80 # GET SAMPLES jobdirformat = CONFIG.jobdir # for job config & log files outdirformat = CONFIG.nanodir if 'skim' in channel else CONFIG.outdir # for job output if resubmit: # TODO: allow user to resubmit given config file jobcfgs = repkey(os.path.join(jobdirformat,"config/jobconfig_$SAMPLE$TAG_try[0-9]*.json"), ERA=era,SAMPLE='*',CHANNEL=channel,TAG=tag) if verbosity>=2: print ">>> 
%-12s = %s"%('cwd',os.getcwd()) print ">>> %-12s = %s"%('jobcfgs',jobcfgs) samples = getcfgsamples(jobcfgs,filter=filters,veto=vetoes,dtype=dtypes,verb=verbosity) else: assert era in CONFIG.eras, "Era '%s' not found in the configuration file. Available: %s"%(era,CONFIG.eras) samples = getsamples(era,channel=channel,tag=tag,dtype=dtypes,filter=filters,veto=vetoes,moddict=moddict,verb=verbosity) if verbosity>=2: print ">>> Found samples: "+", ".join(repr(s.name) for s in samples) # SAMPLE over SAMPLES found = False for sample in samples: if sample.channels and channel not in sample.channels: continue found = True print ">>> %s"%(bold(sample.name)) for path in sample.paths: print ">>> %s"%(bold(path)) # DIRECTORIES subtry = sample.subtry+1 if resubmit else 1 jobids = sample.jobcfg.get('jobids',[ ]) postfix = "_%s%s"%(channel,tag) jobtag = '_%s%s_try%d'%(channel,tag,subtry) jobname = sample.name+jobtag.rstrip('try1').rstrip('_') nfilesperjob_ = sample.nfilesperjob if sample.nfilesperjob>0 else nfilesperjob if split_nfpj>1: nfilesperjob_ = min(1,nfilesperjob_/split_nfpj) outdir = repkey(outdirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name, DAS=sample.paths[0].strip('/'),GROUP=sample.group) jobdir = ensuredir(repkey(jobdirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name, DAS=sample.paths[0].strip('/'),GROUP=sample.group)) cfgdir = ensuredir(jobdir,"config") logdir = ensuredir(jobdir,"log") cfgname = "%s/jobconfig%s.json"%(cfgdir,jobtag) joblist = '%s/jobarglist%s.txt'%(cfgdir,jobtag) if verbosity==1: print ">>> %-12s = %s"%('cfgname',cfgname) print ">>> %-12s = %s"%('joblist',joblist) elif verbosity>=2: print '-'*80 print ">>> Preparing job %ssubmission for '%s'"%("re" if resubmit else "",sample.name) print ">>> %-12s = %r"%('processor',processor) print ">>> %-12s = %r"%('jobname',jobname) print ">>> %-12s = %r"%('jobtag',jobtag) print ">>> %-12s = %r"%('postfix',postfix) print ">>> %-12s = %r"%('outdir',outdir) print ">>> %-12s = 
%r"%('cfgdir',cfgdir) print ">>> %-12s = %r"%('logdir',logdir) print ">>> %-12s = %r"%('cfgname',cfgname) print ">>> %-12s = %r"%('joblist',joblist) print ">>> %-12s = %s"%('try',subtry) print ">>> %-12s = %r"%('jobids',jobids) # CHECKS if os.path.isfile(cfgname): # TODO: check for running jobs LOG.warning("Job configuration '%s' already exists and will be overwritten! "+ "Beware of conflicting job output!"%(cfgname)) if not resubmit: cfgpattern = re.sub(r"(?<=try)\d+(?=.json$)",r"*",cfgname) cfgnames = [f for f in glob.glob(cfgpattern) if not f.endswith("_try1.json")] if cfgnames: LOG.warning("Job configurations for resubmission already exists! This can cause conflicting job output!"+ "If you are sure you want to submit from scratch, please remove these files:\n>>> "+"\n>>> ".join(cfgnames)) storage = getstorage(outdir,verb=verbosity,ensure=True) # GET FILES nevents = 0 if resubmit: # resubmission if checkqueue==0 and not jobs: # check jobs only once batch = getbatch(CONFIG,verb=verbosity) jobs = batch.jobs(verb=verbosity-1) infiles, chunkdict = checkchuncks(sample,outdir=outdir,channel=channel,tag=tag,jobs=jobs, checkqueue=checkqueue,das=checkdas,verb=verbosity) nevents = sample.jobcfg['nevents'] # updated in checkchuncks else: # first-time submission infiles = sample.getfiles(verb=verbosity-1) if checkdas: nevents = sample.getnevents() chunkdict = { } if args.testrun: infiles = infiles[:2] if verbosity==1: print ">>> %-12s = %s"%('nfilesperjob',nfilesperjob_) print ">>> %-12s = %s"%('nfiles',len(infiles)) elif verbosity>=2: print ">>> %-12s = %s"%('nfilesperjob',nfilesperjob_) print ">>> %-12s = %s"%('nfiles',len(infiles)) print ">>> %-12s = [ "%('infiles') for file in infiles: print ">>> %r"%file print ">>> ]" print ">>> %-12s = %s"%('nevents',nevents) # CHUNKS infiles.sort() # to have consistent order with resubmission chunks = [ ] # chunk indices fchunks = chunkify(infiles,nfilesperjob_) # file chunks nfiles = len(infiles) nchunks = len(fchunks) if 
verbosity>=1: print ">>> %-12s = %s"%('nchunks',nchunks) if verbosity>=2: print '-'*80 # WRITE JOB LIST with arguments per job if args.verbosity>=1: print ">>> Creating job list %s..."%(joblist) with open(joblist,'w') as listfile: ichunk = 0 for fchunk in fchunks: while ichunk in chunkdict: ichunk += 1 # allows for different nfilesperjob on resubmission continue jobfiles = ' '.join(fchunk) # list of input files filetag = postfix if 'skim' not in channel: filetag += "_%d"%(ichunk) jobcmd = processor if 'skim' in channel: jobcmd += " -y %s --copydir %s -t %s --jec-sys"%(era,outdir,filetag) elif 'test' in channel: jobcmd += " -o %s -t %s -i %s"%(outdir,filetag) else: jobcmd += " -y %s -c %s -M %s --copydir %s -t %s"%(era,channel,module,outdir,filetag) if prefetch: jobcmd += " -p" jobcmd += " -i %s"%(jobfiles) # add last if args.verbosity>=1: print jobcmd listfile.write(jobcmd+'\n') chunkdict[ichunk] = fchunk chunks.append(ichunk) # JSON CONFIG jobcfg = OrderedDict([ ('time',str(datetime.now())), ('group',sample.group), ('paths',sample.paths), ('name',sample.name), ('nevents',nevents), ('channel',channel), ('module',module), ('jobname',jobname), ('jobtag',jobtag), ('tag',tag), ('postfix',postfix), ('try',subtry), ('jobids',jobids), ('outdir',outdir), ('jobdir',jobdir), ('cfgdir',cfgdir), ('logdir',logdir), ('cfgname',cfgname), ('joblist',joblist), ('nfiles',nfiles), ('files',infiles), ('nfilesperjob',nfilesperjob_), #('nchunks',nchunks), ('nchunks',nchunks), ('chunks',chunks), ('chunkdict',chunkdict), ]) # YIELD yield jobcfg print #if args.testrun: # break # only run one sample if not found: print ">>> Did not find any samples." if verbosity>=1: print ">>> %-8s = %s"%('filters',filters) print ">>> %-8s = %s"%('vetoes',vetoes)
def checkchuncks(sample,**kwargs):
  """Help function to check jobs status: success, pending, failed or missing.
  Return list of files to be resubmitted, and a dictionary between chunk index
  and input files.
  
  Classifies every chunk of `sample`'s last job configuration as good (valid
  output file found), pending (a matching queued/running job exists), bad
  (corrupt output) or missing (no output), prints a summary, and mutates
  `chunkdict` in place so that only good/pending chunks remain.
  
  Keyword arguments (all optional, defaulting to the values in sample.jobcfg):
    outdir     -- job output directory
    channel    -- analysis channel / skim
    tag        -- output file tag
    checkqueue -- <0: always query the batch queue; 1: reuse the passed 'jobs'
    jobs       -- previously fetched batch-queue job list
    das        -- look up total number of events in DAS if unknown
    verb       -- verbosity level
  """
  outdir       = kwargs.get('outdir',     None)
  channel      = kwargs.get('channel',    None)
  tag          = kwargs.get('tag',        None)
  checkqueue   = kwargs.get('checkqueue', False)
  pendjobs     = kwargs.get('jobs',       [ ])
  checkdas     = kwargs.get('das',        True)
  verbosity    = kwargs.get('verb',       0)
  oldjobcfg    = sample.jobcfg          # job config of the last submission
  oldcfgname   = oldjobcfg['config']
  chunkdict    = oldjobcfg['chunkdict'] # filenames
  jobids       = oldjobcfg['jobids']
  joblist      = oldjobcfg['joblist']
  postfix      = oldjobcfg['postfix']
  nfilesperjob = oldjobcfg['nfilesperjob']
  if outdir==None:
    outdir     = oldjobcfg['outdir']
  storage      = getstorage(outdir,ensure=True)
  if channel==None:
    channel    = oldjobcfg['channel']
  if tag==None:
    tag        = oldjobcfg['tag']
  noldchunks   = len(chunkdict) # = number of jobs
  goodchunks   = [ ] # good job output
  pendchunks   = [ ] # pending or running jobs
  badchunks    = [ ] # corrupted job output
  misschunks   = [ ] # missing job output
  resubfiles   = [ ] # files to resubmit (if bad or missing)
  
  # NUMBER OF EVENTS
  nprocevents = 0                    # total number of processed events
  ndasevents  = oldjobcfg['nevents'] # total number of available events
  if checkdas and oldjobcfg['nevents']==0:
    ndasevents = sample.getnevents()
    oldjobcfg['nevents'] = ndasevents # cache in the (shared) job config
  if verbosity>=2:
    print ">>> %-12s = %s"%('ndasevents',ndasevents)
  if verbosity>=3:
    print ">>> %-12s = %s"%('chunkdict',chunkdict)
  
  # CHECK PENDING JOBS: refresh or filter the batch-queue job list
  if checkqueue<0 or pendjobs:
    batch = getbatch(CONFIG,verb=verbosity)
    if checkqueue!=1 or not pendjobs:
      pendjobs = batch.jobs(jobids,verb=verbosity-1) # get refreshed job list
    else:
      pendjobs = [j for j in pendjobs if j.jobid in jobids] # get new job list with right job id
  
  ###########################################################################
  # CHECK SKIMMED OUTPUT: nanoAOD format, one or more output files per job
  if 'skim' in channel: # and nfilesperjob>1:
    flagexp  = re.compile(r"-i (.+\.root)") #r"-i ((?:(?<! -).)+\.root[, ])"
    fpattern = "*%s.root"%(postfix)
    chunkexp = re.compile(r".+%s\.root"%(postfix))
    if verbosity>=2:
      print ">>> %-12s = %r"%('flagexp',flagexp.pattern)
      print ">>> %-12s = %r"%('fpattern',fpattern)
      print ">>> %-12s = %r"%('chunkexp',chunkexp.pattern)
      print ">>> %-12s = %s"%('checkqueue',checkqueue)
      print ">>> %-12s = %s"%('pendjobs',pendjobs)
      print ">>> %-12s = %s"%('jobids',jobids)
    
    # CHECK PENDING JOBS: map each queued/running job back to its chunk
    # via the "-i <files>" flag in the task arguments
    pendfiles = [ ]
    for job in pendjobs:
      if verbosity>=3:
        print ">>> Found job %r, status=%r, args=%r"%(job,job.getstatus(),job.args.rstrip())
      if job.getstatus() in ['q','r']: # queued or running
        if CONFIG.batch=='HTCondor':
          jobarg  = str(job.args)
          matches = flagexp.findall(jobarg)
        else:
          jobarg  = getline(joblist,job.taskid-1) # task arguments live in the job list file
          matches = flagexp.findall(jobarg)
        if verbosity>=3:
          print ">>> matches = ",matches
        if not matches:
          continue
        infiles = [ ]
        for file in matches[0].split():
          if not file.endswith('.root'):
            break # stop at the first non-ROOT token (end of the -i argument)
          infiles.append(file)
        LOG.insist(infiles,"Did not find any root files in %r, matches=%r"%(jobarg,matches))
        ichunk = -1
        for i in chunkdict:
          if all(f in chunkdict[i] for f in infiles):
            ichunk = i
            break
        LOG.insist(ichunk>=0,
                   "Did not find to which the input files of jobids %s belong! "%(jobids)+
                   "\nichunk=%s,\ninfiles=%s,\nchunkdict=%s"%(ichunk,infiles,chunkdict))
        # NOTE(review): uses loop variable 'i' rather than 'ichunk'; equivalent here
        # only because the loop above broke with i==ichunk (guaranteed by the insist)
        LOG.insist(len(chunkdict[i])==len(infiles),
                   "Mismatch between input files of jobids %s and chunkdict! "%(jobids)+
                   "\nichunk=%s,\ninfiles=%s,\nchunkdict[%s]=%s"%(ichunk,infiles,ichunk,chunkdict[ichunk]))
        pendchunks.append(ichunk)
    
    # CHECK OUTPUT FILES: match each file in outdir to an input file of a chunk
    badfiles  = [ ]
    goodfiles = [ ]
    fnames    = storage.getfiles(filter=fpattern,verb=verbosity-1)
    if verbosity>=2:
      print ">>> %-12s = %s"%('pendchunks',pendchunks)
      print ">>> %-12s = %s"%('fnames',fnames)
    for fname in fnames:
      if verbosity>=2:
        print ">>> Checking job output '%s'..."%(fname)
      infile  = os.path.basename(fname.replace(postfix+".root",".root")) # reconstruct input file
      nevents = isvalid(fname) # check for corruption
      ichunk  = -1
      fmatch  = None
      for i in chunkdict:
        if fmatch: break
        for chunkfile in chunkdict[i]:
          if infile in chunkfile: # find chunk input file belongs to
            ichunk = i
            fmatch = chunkfile
            break
      if ichunk<0:
        if verbosity>=2:
          print ">>> => No match..."
        #LOG.warning("Did not recognize output file '%s'!"%(fname))
        continue
      if ichunk in pendchunks:
        if verbosity>=2:
          print ">>> => Pending..."
        continue
      if nevents<0:
        if verbosity>=2:
          print ">>> => Bad nevents=%s..."%(nevents)
        badfiles.append(fmatch)
      else:
        if verbosity>=2:
          print ">>> => Good, nevents=%s"%(nevents)
        nprocevents += nevents
        goodfiles.append(fmatch)
    
    # GET FILES for RESUBMISSION + sanity checks
    for ichunk in chunkdict.keys(): # .keys() copies the keys, so pop below is safe (Python 2)
      if ichunk in pendchunks: continue
      chunkfiles = chunkdict[ichunk]
      if all(f in goodfiles for f in chunkfiles): # all files successful
        goodchunks.append(ichunk)
        continue
      bad = False # count each chunk only once: bad, else missing
      for fname in chunkfiles:
        LOG.insist(fname not in resubfiles,"Found file for chunk '%d' more than once: %s "%(ichunk,fname)+
                                           "Possible overcounting or conflicting job output file format!")
        if fname in badfiles:
          bad = True
          resubfiles.append(fname)
        elif fname not in goodfiles:
          resubfiles.append(fname)
      if bad:
        badchunks.append(ichunk)
      else:
        misschunks.append(ichunk)
      chunkdict.pop(ichunk)
  
  ###########################################################################
  # CHECK ANALYSIS OUTPUT: custom tree format, one output file per job
  else:
    flagexp  = re.compile(r"-t \w*(\d+)") # chunk index from the "-t <filetag>" flag
    fpattern = "*%s_[0-9]*.root"%(postfix)
    chunkexp = re.compile(r".+%s_(\d+)\.root"%(postfix))
    if verbosity>=2:
      print ">>> %-12s = %r"%('flagexp',flagexp.pattern)
      print ">>> %-12s = %r"%('fpattern',fpattern)
      print ">>> %-12s = %r"%('chunkexp',chunkexp.pattern)
      print ">>> %-12s = %s"%('checkqueue',checkqueue)
      print ">>> %-12s = %s"%('pendjobs',pendjobs)
      print ">>> %-12s = %s"%('jobids',jobids)
    
    # CHECK PENDING JOBS: chunk index is encoded directly in the task arguments
    for job in pendjobs:
      if verbosity>=3:
        print ">>> Found job %r, status=%r, args=%r"%(job,job.getstatus(),job.args.rstrip())
      if job.getstatus() in ['q','r']: # queued or running
        if CONFIG.batch=='HTCondor':
          jobarg  = str(job.args)
          matches = flagexp.findall(jobarg)
        else:
          jobarg  = getline(joblist,job.taskid-1)
          matches = flagexp.findall(jobarg)
        if verbosity>=3:
          print ">>> matches = ",matches
        if not matches:
          continue
        ichunk = int(matches[0])
        LOG.insist(ichunk in chunkdict,"Found an impossible chunk %d for job %s.%s! "%(ichunk,job.jobid,job.taskid)+
                                       "Possible overcounting!")
        pendchunks.append(ichunk)
    
    # CHECK OUTPUT FILES: chunk index is encoded in the output file name
    fnames = storage.getfiles(filter=fpattern,verb=verbosity-1)
    if verbosity>=2:
      print ">>> %-12s = %s"%('pendchunks',pendchunks)
      print ">>> %-12s = %s"%('fnames',fnames)
    for fname in fnames:
      if verbosity>=2:
        print ">>> Checking job output '%s'..."%(fname)
      match = chunkexp.search(fname)
      if match:
        ichunk = int(match.group(1))
        LOG.insist(ichunk in chunkdict,"Found an impossible chunk %d for file %s!"%(ichunk,fname)+
                                       "Possible overcounting or conflicting job output file format!")
        if ichunk in pendchunks:
          continue
      else:
        #LOG.warning("Did not recognize output file '%s'!"%(fname))
        continue
      nevents = isvalid(fname) # check for corruption
      if nevents<0:
        if verbosity>=2:
          print ">>> => Bad, nevents=%s"%(nevents)
        badchunks.append(ichunk)
        # TODO: remove file from outdir?
      else:
        if verbosity>=2:
          print ">>> => Good, nevents=%s"%(nevents)
        nprocevents += nevents
        goodchunks.append(ichunk)
    
    # GET FILES for RESUBMISSION + sanity checks
    if verbosity>=2:
      print ">>> %-12s = %s"%('nprocevents',nprocevents)
    for ichunk in chunkdict.keys(): # .keys() copies the keys, so pop below is safe (Python 2)
      count = goodchunks.count(ichunk)+pendchunks.count(ichunk)+badchunks.count(ichunk)
      LOG.insist(count in [0,1],"Found %d times chunk '%d' (good=%d, pending=%d, bad=%d). "%(
                                count,ichunk,goodchunks.count(ichunk),pendchunks.count(ichunk),badchunks.count(ichunk))+
                                "Possible overcounting or conflicting job output file format!")
      if count==0: # missing chunk
        misschunks.append(ichunk)
      elif ichunk not in badchunks: # good or pending chunk
        continue
      fchunk = chunkdict[ichunk]
      for fname in fchunk:
        LOG.insist(fname not in resubfiles,"Found file for chunk '%d' more than once: %s "%(ichunk,fname)+
                                           "Possible overcounting or conflicting job output file format!")
      resubfiles.extend(chunkdict[ichunk])
      chunkdict.pop(ichunk) # only save good chunks
  
  ###########################################################################
  
  goodchunks.sort()
  pendchunks.sort()
  badchunks.sort()
  misschunks.sort()
  
  # PRINT chunk-status summary
  def printchunks(jobden,label,text,col,show=False):
    # print one colored "ratio label - text[: chunk list]" line (if non-empty)
    if jobden:
      ratio = color("%4d/%d"%(len(jobden),noldchunks),col,bold=False)
      label = color(label,col,bold=True)
      jlist = (": "+', '.join(str(j) for j in jobden)) if show else ""
      print ">>> %s %s - %s%s"%(ratio,label,text,jlist)
    #else:
    #  print ">>> %2d/%d %s - %s"%(len(jobden),len(jobs),label,text)
  rtext = ""
  if ndasevents>0:
    ratio = 100.0*nprocevents/ndasevents # fraction of available events processed
    rcol  = 'green' if ratio>90. else 'yellow' if ratio>80. else 'red'
    rtext = ": "+color("%d/%d (%d%%)"%(nprocevents,ndasevents,ratio),rcol,bold=True)
  printchunks(goodchunks,'SUCCES', "Chunks with output in outdir"+rtext,'green')
  printchunks(pendchunks,'PEND',"Chunks with pending or running jobs",'white',True)
  printchunks(badchunks, 'FAIL', "Chunks with corrupted output in outdir",'red',True)
  printchunks(misschunks,'MISS',"Chunks with no output in outdir",'red',True)
  
  return resubfiles, chunkdict
def testBatch(path,verb=0):
  """Test the batch-system interface: submit a small test job array and poll its status.
  
  NOTE(review): the 'path' and 'verb' parameters are never used; all settings are
  read from the module-level 'args' namespace instead — confirm whether this is
  intentional before relying on the signature.
  """
  
  # SETTINGS
  verbosity = args.verbosity
  dryrun    = args.dryrun    # prepare job and submit command, but do not submit
  ntasks    = args.ntasks    # only run a few test tasks per job
  nchecks   = args.nchecks   # number of times to check job status
  queue     = args.queue     # queue option for the batch system (job flavor for HTCondor)
  time      = args.time      # maximum time for the batch system
  batchopts = args.batchopts # extra options for the batch system
  #prompt = args.prompt # ask user confirmation before submitting
  outdir    = ensuredir("testBatch")
  logdir    = ensuredir("testBatch/log")
  jobname   = "testBatch"
  tasklist  = os.path.join(outdir,"testBatch.txt") # one task command line per row
  
  # INITIALIZE
  LOG.header("__init__")
  #batch = ensuremodule(system,"PicoProducer.batch."+batch)
  batch = getbatch(args.batch,verb=verbosity+1)
  print ">>> %r"%(batch)
  print ">>> %-10s = %s"%('jobname',jobname)
  print ">>> %-10s = %s"%('ntasks',ntasks)
  print ">>> %-10s = %s"%('nchecks',nchecks)
  print ">>> %-10s = %s"%('outdir',outdir)
  print ">>> %-10s = %s"%('logdir',logdir)
  print ">>> %-10s = %s"%('dryrun',dryrun)
  print ">>> %-10s = %s"%('queue',queue)
  print ">>> %-10s = %s"%('time',time)
  print ">>> %-10s = %s"%('batchopts',batchopts)
  print ">>> %-10s = %s"%('verbosity',verbosity)
  print ">>> "
  
  # PREPARE JOBS: write the task list the submit scripts read from
  createtasks(tasklist,ntasks)
  
  # SUBMIT
  LOG.header("Submit")
  jkwargs = { # key-word arguments for batch.submit
    'name': jobname, 'opt': batchopts, 'dry': dryrun,
    'short': True, 'queue':queue, 'time':time
  }
  if batch.system=='HTCondor':
    # use specific settings for KIT condor
    if 'etp' in platform.node():
      script = "python/batch/submit_HTCondor_KIT.sub"
    else:
      script = "python/batch/submit_HTCondor.sub"
    appcmds = ["initialdir=%s"%(outdir),
               "mylogfile='log/%s.$(ClusterId).$(ProcId).log'"%(jobname)]
    jkwargs.update({ 'app': appcmds })
  elif batch.system=='SLURM':
    script  = "python/batch/submit_SLURM.sh"
    logfile = os.path.join(logdir,"%x.%A.%a.log") # $JOBNAME.o$JOBID.$TASKID.log
    jkwargs.update({ 'log': logfile, 'array': ntasks })
  #elif batch.system=='SGE':
  #elif batch.system=='CRAB':
  else:
    LOG.throw(NotImplementedError,"Submission for batch system '%s' has not been implemented (yet)..."%(batch.system))
  jobid = batch.submit(script,tasklist,**jkwargs)
  print ">>> jobid: %s"%(jobid)
  
  # CHECK JOBS: poll the queue a few times to exercise batch.jobs
  LOG.header("Check jobs")
  for i in xrange(nchecks):
    jobs = batch.jobs(jobid,verb=verbosity-1) # get refreshed job list
    #jobs = batch.jobs(verb=verbosity-1) # get refreshed job list
    print ">>> job objects: %r"%(jobs)
    print ">>> "
    #for job in jobs:
    #  print ">>> Found job %r, status=%r, args=%r"%(job,job.getstatus(),job.args.rstrip())
    if i<nchecks-1:
      sleep(2) # give the scheduler time between polls