def lsf_run_until_done(to_run_dict, logfile, queue, bsub_flags, jobname_base, num_batches, MAX_RETRY): from run_safe import unfinished_cmds cmds = unfinished_cmds(to_run_dict) retries = 0 last_cmds = [] while len(cmds) > 0: print >> sys.stderr, '%s: %s cmds to run in %s batches on queue %s, logs in %s' % ( jobname_base, len(cmds), num_batches, queue, logfile) #code to halt execution on recurrent errors if set(last_cmds) == set(cmds): if retries > MAX_RETRY: errstr = 'maximum number of retry attempts (%s) exceeded with identical jobs lists. Check logs (%s) for recurrent errors' % ( MAX_RETRY, logfile) raise IOError, errstr else: retries += 1 last_cmds = cmds jobids, namedict = lsf_jobs_submit(cmds, logfile, queue, bsub_flags, jobname_base=jobname_base, num_batches=num_batches) time.sleep(20) lsf_wait_for_jobs(jobids, logfile, namedict=namedict) cmds = unfinished_cmds(to_run_dict) print >> sys.stderr, 'DONE\n'
def run_until_done(to_run_dict,jobname_base,scriptdir, runtime,mem, num_batches, partition='general' ,force_source=False,MAX_RETRY=MAX_RETRY,**kwargs): '''given to-run dictionary as populated by run_safe.add_cmd (see run_safe.py in py_util) and scheduling parameters submits jobs that have not yet completed per run_safe .done files until all jobs finish or until identical job lists are submitted MAX_RETRY times see jobs_submit and wait_for_jobs in this module for more details kwargs go to jobs_submit; see jobs_submit and slurm_script for handling of additional arguments ''' from run_safe import unfinished_cmds cmds = unfinished_cmds(to_run_dict) retries = 0 last_cmds = [] while len(cmds) > 0: print >> sys.stderr, '%s: %s cmds to run in %s batches on queue %s, logs in %s' % (jobname_base,len(cmds),num_batches,partition,scriptdir) #code to halt execution on recurrent errors if set(last_cmds) == set(cmds): if retries > MAX_RETRY: errstr = 'maximum number of retry attempts (%s) exceeded with identical jobs lists. Check logs (%s) for recurrent errors' % (MAX_RETRY,scriptdir) raise IOError, errstr else: retries += 1 last_cmds = cmds jobsdict = jobs_submit(cmds,jobname_base,scriptdir, runtime,mem, num_batches,partition=partition ,force_source=force_source, **kwargs) time.sleep(20) wait_for_jobs(jobsdict,restart_partition=partition,sleeptime = 20) time.sleep(20) cmds = unfinished_cmds(to_run_dict) print >> sys.stderr, 'DONE\n'
# NOTE(review): this is a token-for-token duplicate of the
# lsf_run_until_done defined earlier in this file; since it appears later,
# this definition is the one bound at import time. One of the two copies
# should probably be removed -- confirm nothing depends on the duplication.
def lsf_run_until_done(to_run_dict, logfile, queue, bsub_flags, jobname_base, num_batches, MAX_RETRY):
    '''Resubmit unfinished commands (per run_safe .done files) to the LSF
    queue until all complete, or raise IOError after the identical job
    list has been submitted more than MAX_RETRY consecutive times
    (recurrent-error guard); logs accumulate under logfile.
    '''
    from run_safe import unfinished_cmds
    cmds = unfinished_cmds(to_run_dict)
    retries = 0
    last_cmds = []
    while len(cmds) > 0:
        print >> sys.stderr, '%s: %s cmds to run in %s batches on queue %s, logs in %s' % (jobname_base, len(cmds), num_batches, queue, logfile)
        # code to halt execution on recurrent errors: the same job set
        # reappearing each round indicates a deterministic failure
        if set(last_cmds) == set(cmds):
            if retries > MAX_RETRY:
                errstr = 'maximum number of retry attempts (%s) exceeded with identical jobs lists. Check logs (%s) for recurrent errors' % (MAX_RETRY, logfile)
                raise IOError, errstr
            else:
                retries += 1
        last_cmds = cmds
        jobids, namedict = lsf_jobs_submit(cmds, logfile, queue, bsub_flags, jobname_base=jobname_base, num_batches=num_batches)
        # pause so the scheduler registers the submission before polling
        time.sleep(20)
        lsf_wait_for_jobs(jobids, logfile, namedict=namedict)
        cmds = unfinished_cmds(to_run_dict)
    print >> sys.stderr, 'DONE\n'