def run(self, classname, funcname, hosts='all', **kwargs):
    """ Launches ``wrappers/run`` through ``srun``

    Checkpoints, stores ``kwargs`` with ``self.save_kwargs`` and then hands a
    single command string to ``call``.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param funcname: appended to the command line after ``classname``
    :param hosts: 'all' issues a plain ``srun --wait=0``; 'head' adds
        ``--ntasks=1 --nodes=1``
    :raises KeyError: if ``hosts`` is neither 'all' nor 'head'
    """
    self.checkpoint()
    self.save_kwargs(classname, funcname, kwargs)

    # only the srun flags differ between the two supported selections
    if hosts == 'all':
        srun_flags = '--wait=0 '
    elif hosts == 'head':
        srun_flags = '--wait=0 ' + '--ntasks=1 ' + '--nodes=1 '
    else:
        raise KeyError('Bad keyword argument: system.run: hosts')

    wrapper = join(findpath('seisflows.system'), 'wrappers/run ')
    call(''.join([
        'srun ', srun_flags, wrapper,
        PATH.OUTPUT, ' ', classname, ' ', funcname, ' ', PAR.ENVIRONS]))
def run(self, classname, method, hosts='all', **kwargs):
    """ Launches ``wrappers/run`` on every host via ``wrappers/dsh`` or on the
    first host via ``ssh``

    ``kwargs`` is accepted but not used by this method.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param hosts: 'all' runs through ``wrappers/dsh`` with the comma-joined
        result of ``self.hostlist()``; 'head' runs through ``ssh`` on
        ``self.hostlist()[0]`` with ``SEISFLOWS_TASK_ID=0`` exported
    :raises KeyError: if ``hosts`` is neither 'all' nor 'head'
    """
    self.checkpoint()

    if hosts not in ('all', 'head'):
        raise KeyError('Bad keyword argument: system.run: hosts')

    # Fixed: a stray comma after findpath('seisflows') used to end the first
    # argument of call() early; the remainder began with a unary '+' applied
    # to a string, which raises TypeError before anything is launched.
    task_args = (PATH.OUTPUT + ' '
                 + classname + ' '
                 + method + ' '
                 + 'PYTHONPATH=' + findpath('seisflows') + ','
                 + PAR.ENVIRONS)

    if hosts == 'all':
        # run all tasks
        call(findpath('seisflows.system') + '/' + 'wrappers/dsh '
             + ','.join(self.hostlist()) + ' '
             + findpath('seisflows.system') + '/' + 'wrappers/run '
             + task_args)
    else:
        # run a single task; the remote command is wrapped in double quotes
        call('ssh ' + self.hostlist()[0] + ' '
             + '"'
             + 'export SEISFLOWS_TASK_ID=0; '
             + join(findpath('seisflows.system'), 'wrappers/run ')
             + task_args
             + '"')
def run(self, classname, funcname, hosts='all', **kwargs):
    """ Launches ``wrappers/run`` through ``unix.run``

    Checkpoints and stores ``kwargs`` with ``self.save_kwargs`` first.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param funcname: appended to the command line after ``classname``
    :param hosts: 'all' prefixes the command with ``wrappers/dsh2`` and
        ``PAR.NTASK``; 'head' runs the wrapper directly.  Any other value
        launches nothing (no error is raised).
    """
    self.checkpoint()
    self.save_kwargs(classname, funcname, kwargs)

    if hosts == 'all':
        launcher = (findpath('seisflows.system') + '/' + 'wrappers/dsh2 '
                    + str(PAR.NTASK) + ' ')
    elif hosts == 'head':
        # run on head node: no launcher in front of the wrapper
        launcher = ''
    else:
        return

    task = ' '.join([PATH.OUTPUT, classname, funcname, PAR.ENVIRONS])
    unix.run(launcher
             + findpath('seisflows.system') + '/' + 'wrappers/run '
             + task)
def run(self, classname, method, hosts='all', **kwargs):
    """ Runs embarrassingly parallel tasks

    Launches ``wrappers/run`` on every host via ``wrappers/dsh`` or on the
    first host via ``ssh``.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param hosts: 'all' uses the comma-joined result of ``self.hostlist()``;
        'head' uses ``self.hostlist()[0]`` with ``SEISFLOWS_TASK_ID=0``
        exported
    :param kwargs: forwarded to ``self.checkpoint``
    :raises KeyError: if ``hosts`` is neither 'all' nor 'head'
    """
    # Fixed: the original passed an undefined name `args` here (NameError).
    # This signature takes no positional task arguments, so an empty tuple
    # is what there is to checkpoint.
    self.checkpoint(PATH.OUTPUT, classname, method, (), kwargs)

    if hosts == 'all':
        prefix = (findpath('seisflows.system') + '/' + 'wrappers/dsh '
                  + ','.join(self.hostlist()) + ' '
                  + findpath('seisflows.system') + '/' + 'wrappers/run ')
        suffix = ''
    elif hosts == 'head':
        # the remote command is wrapped in double quotes for ssh
        prefix = ('ssh ' + self.hostlist()[0] + ' '
                  + '"'
                  + 'export SEISFLOWS_TASK_ID=0; '
                  + join(findpath('seisflows.system'), 'wrappers/run '))
        suffix = '"'
    else:
        raise KeyError('Bad keyword argument: system.run: hosts')

    # Fixed: a stray comma after findpath('seisflows') split the command
    # into two call() arguments, the second starting with a unary '+' on a
    # string (TypeError).
    call(prefix
         + PATH.OUTPUT + ' '
         + classname + ' '
         + method + ' '
         + 'PYTHONPATH=' + findpath('seisflows') + ','
         + PAR.ENVIRONS
         + suffix)
def run(self, classname, funcname, hosts='all', **kwargs):
    """ Runs task(s) on specified hosts

    ``kwargs`` is accepted but not used by this method.

    :param classname: placed on the command line after ``PATH.OUTPUT``
    :param funcname: placed on the command line after ``classname``
    :param hosts: 'all' runs through ``wrappers/dsh`` with the comma-joined
        result of ``self.hostlist()``; 'head' runs through ``ssh`` on
        ``self.hostlist()[0]`` with ``SEISFLOWS_TASK_ID=0`` exported
    :raises KeyError: if ``hosts`` is neither 'all' nor 'head'
    """
    self.checkpoint()

    if hosts not in ('all', 'head'):
        raise KeyError('Bad keyword argument: system.run: hosts')

    system_path = findpath('seisflows.system')

    # Fixed (both branches): a stray comma after findpath(...) ended the
    # first call() argument early and left a unary '+' applied to a string,
    # which raises TypeError.
    environs = 'PYTHONPATH=' + system_path + ',' + PAR.ENVIRONS

    if hosts == 'all':
        # run all tasks
        # NOTE(review): here the task arguments come BEFORE wrappers/run,
        # unlike the 'head' branch below -- order kept as found; confirm
        # against what wrappers/dsh expects.
        call(system_path + '/' + 'wrappers/dsh '
             + ','.join(self.hostlist()) + ' '
             + PATH.OUTPUT + ' '
             + classname + ' '
             + funcname + ' '
             + system_path + '/' + 'wrappers/run '
             + environs)
    else:
        # run a single task
        call('ssh ' + self.hostlist()[0] + ' '
             + '"'
             + 'export SEISFLOWS_TASK_ID=0; '
             + join(system_path, 'wrappers/run ')
             + PATH.OUTPUT + ' '
             + classname + ' '
             + funcname + ' '
             + environs
             + '"')
def job_array_args(self, hosts):
    """ Returns the host-dependent part of a job array command line

    The result holds a ``-J`` range, a ``-o`` output path under
    ``PATH.WORKDIR/output.pbs/``, the ``--`` separator and the path of
    ``wrappers/run`` (with a trailing space).

    :param hosts: 'all' gives the range ``0-(PAR.NTASK-1)`` and an output
        file named ``$PBS_ARRAYID``; 'head' gives ``0-0`` and ``$PBS_JOBID``
    :raises KeyError: if ``hosts`` is neither 'all' nor 'head'
    """
    if hosts == 'all':
        array_range = '-J 0-%s ' % (PAR.NTASK-1)
        output_name = '$PBS_ARRAYID'
    elif hosts == 'head':
        array_range = '-J 0-0 '
        output_name = '$PBS_JOBID'
    else:
        # Fixed: an unknown value used to fall through to `return args` with
        # `args` never assigned (UnboundLocalError).
        raise KeyError('Bad keyword argument: system.job_array_args: hosts')

    return (array_range
            + '-o %s ' % (PATH.WORKDIR + '/' + 'output.pbs/' + output_name)
            + ' -- '
            + findpath('seisflows.system') + '/' + 'wrappers/run ')
def job_array_args(self, hosts):
    """ Builds the ``-J``/``-o`` options and wrapper path for a job array

    :param hosts: 'all' selects the range ``0-(PAR.NTASK-1)`` with output
        file ``$PBS_ARRAYID``; 'head' selects ``0-0`` with ``$PBS_JOBID``.
        Output files go under ``PATH.WORKDIR/output.pbs/``.
    :returns: option string ending with the ``wrappers/run`` path and a
        trailing space
    :raises KeyError: if ``hosts`` is neither 'all' nor 'head'
    """
    # (array range, output file name) for each supported selection
    if hosts == 'all':
        selection = ('0-%s' % (PAR.NTASK-1), '$PBS_ARRAYID')
    elif hosts == 'head':
        selection = ('0-0', '$PBS_JOBID')
    else:
        # Fixed: previously `args` stayed unassigned for any other value and
        # the final return raised UnboundLocalError.
        raise KeyError('Bad keyword argument: system.job_array_args: hosts')

    array_range, output_name = selection
    output_path = PATH.WORKDIR + '/' + 'output.pbs/' + output_name
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/run '
    return ('-J %s ' % array_range
            + '-o %s ' % output_path
            + ' -- '
            + wrapper)
def run_single(self, classname, method, hosts='all', **kwargs):
    """ Submits one ``bsub`` job array of size one and waits for it

    The job runs ``wrappers/run`` with ``PATH.OUTPUT``, ``classname``,
    ``method`` and ``PAR.ENVIRONS`` as arguments.  After submission the
    method polls ``self.job_status`` every 30 seconds and returns once it
    reports completion.

    :param classname: forwarded to the wrapper and to ``self.job_status``
    :param method: forwarded to the wrapper and to ``self.job_status``
    :param hosts: accepted but not used by this method
    :param kwargs: forwarded to ``self.checkpoint``
    """
    # Fixed: the original passed an undefined name `args` (NameError); this
    # signature takes no positional task arguments, so pass an empty tuple.
    self.checkpoint(PATH.OUTPUT, classname, method, (), kwargs)

    stdout = check_output(
        'bsub %s ' % PAR.LSFARGS
        + '-n %d ' % PAR.NPROC
        + '-R "span[ptile=%d]" ' % PAR.NODESIZE
        + '-W %d:00 ' % PAR.TASKTIME
        # Fixed: the double quote opened before the title was never closed
        # and no space separated the array range from the next option.
        + '-J "%s' % PAR.TITLE
        + '[%d-%d]" ' % (1, 1)
        + '-o %s ' % (PATH.WORKDIR + '/' + 'output.lsf/' + '%J')
        # Fixed: '%s ' % findpath(...) + '/' + ... put a space in the middle
        # of the path; the directory is also spelled 'wrappers' in every
        # other launcher.
        + '%s ' % (findpath('seisflows.system') + '/' + 'wrappers/run')
        + '%s ' % PATH.OUTPUT
        + '%s ' % classname
        + '%s ' % method
        + '%s ' % PAR.ENVIRONS,
        shell=True)

    # keep track of job ids
    jobs = self.job_id_list(stdout, ntask=1)

    while True:
        # wait 30 seconds before checking status again
        time.sleep(30)
        self.timestamp()
        isdone, jobs = self.job_status(classname, method, jobs)
        if isdone:
            return
def job_array_cmd(self, classname, funcname, hosts):
    """ Returns the ``qsub`` command line for a job array

    :param classname: appended after ``PATH.OUTPUT``
    :param funcname: appended after ``classname``
    :param hosts: forwarded to ``self.job_array_args``
    :returns: the complete command as a single string
    """
    nodes = math.ceil(PAR.NTASK/float(PAR.NODESIZE))
    ncpus = PAR.NPROC
    mpiprocs = PAR.NPROC

    # PAR.STEPTIME is split into hours and minutes for the walltime field
    hours = PAR.STEPTIME/60
    minutes = PAR.STEPTIME%60
    walltime = 'walltime=%02d:%02d:00 ' % (hours, minutes)

    # NOTE(review): '-J' and '-o' are emitted here and may be emitted again
    # by self.job_array_args(hosts) -- confirm which copy is wanted.
    return ('qsub '
            + '%s ' % PAR.PBSARGS
            # Fixed: the '%' operator was missing, so the format string was
            # *called* with the tuple (TypeError: 'str' is not callable).
            + '-l select=%d:ncpus=%d:mpiprocs=%d ' % (nodes, ncpus, mpiprocs)
            + '-l %s ' % walltime
            + '-J 0-%s ' % (PAR.NTASK-1)
            + '-N %s ' % PAR.TITLE
            + '-o %s ' % (PATH.WORKDIR + '/' + 'output.pbs/' + '$PBS_ARRAYID')
            + '-r y '
            + '-j oe '
            + '-V '
            + self.job_array_args(hosts)
            + PATH.OUTPUT + ' '
            + classname + ' '
            + funcname + ' '
            # Fixed: a stray comma here turned the return value into a tuple
            # whose second element applied unary '+' to a string (TypeError).
            + 'PYTHONPATH=' + findpath('seisflows.system') + ','
            + PAR.ENVIRONS)
def submit(self, workflow):
    """ Creates working directories and submits the workflow with ``sbatch``

    If ``PATH.SCRATCH`` does not exist, a fresh directory is created under
    ``/scratch/gpfs/<user>/seisflows/<uuid4>`` and linked to ``PATH.SCRATCH``.
    The sbatch request uses fixed sizes (28 tasks on one node, 4 GPUs).

    :param workflow: accepted but not used by this method
    """
    # create scratch directories
    if not exists(PATH.SCRATCH):
        scratch_target = '/'.join(
            ['/scratch/gpfs', getuser(), 'seisflows', str(uuid4())])
        unix.mkdir(scratch_target)
        unix.ln(scratch_target, PATH.SCRATCH)
    unix.mkdir(PATH.SYSTEM)

    # create output directories
    for directory in (PATH.OUTPUT, PATH.WORKDIR + '/' + 'output.slurm'):
        unix.mkdir(directory)

    self.checkpoint()

    # every option carries its own trailing space
    sbatch_options = [
        '%s ' % PAR.SLURMARGS,
        '--job-name=%s ' % PAR.TITLE,
        '--output %s ' % (PATH.WORKDIR + '/' + 'output.log'),
        '--ntasks-per-node=28 ',
        '--ntasks=28 ',
        '--gres=gpu:4 ',
        '--nodes=1 ',
        '--time=%d ' % PAR.WALLTIME,
    ]
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/submit '
    call('sbatch ' + ''.join(sbatch_options) + wrapper + PATH.OUTPUT)
def submit(self, workflow):
    """ Creates working directories and submits the workflow with ``qsub``

    One chunk is requested with ``PAR.NODESIZE`` cpus and mpi processes; the
    walltime is derived from ``PAR.WALLTIME``.

    :param workflow: accepted but not used by this method
    """
    # scratch directories first, then output directories
    for directory in (PATH.SCRATCH,
                      PATH.SYSTEM,
                      PATH.OUTPUT,
                      PATH.WORKDIR + '/' + 'output.pbs'):
        unix.mkdir(directory)

    self.checkpoint()

    # PAR.WALLTIME is split into hours and minutes for the walltime field
    walltime = 'walltime=%02d:%02d:00 ' % (PAR.WALLTIME/60, PAR.WALLTIME%60)
    resources = ('-l select=1:ncpus=%d:mpiprocs=%d '
                 % (PAR.NODESIZE, PAR.NODESIZE))

    # prepare qsub arguments
    qsub_options = [
        '%s ' % PAR.PBSARGS,
        resources,
        '-l %s ' % walltime,
        '-N %s ' % PAR.TITLE,
        '-j oe ',
        '-o %s ' % (PATH.WORKDIR + '/' + 'output.log'),
        '-V ',
        ' -- ',
    ]
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/submit '
    call('qsub ' + ''.join(qsub_options) + wrapper + PATH.OUTPUT)
def job_array_cmd(self, classname, funcname, hosts):
    """ Returns the ``bsub`` command line for a job array

    :param classname: appended after ``PATH.OUTPUT``
    :param funcname: appended after ``classname``
    :param hosts: forwarded to ``self.launch_args``
    :returns: the complete command as a single string
    """
    # NOTE(review): the double quote opened before the title is not closed
    # in this method; presumably self.launch_args(hosts) supplies the array
    # range and the closing quote -- confirm.
    return ('bsub '
            + '%s ' % PAR.LSFARGS
            + '-n %d ' % PAR.NPROC
            + '-R "span[ptile=%d]" ' % PAR.NODESIZE
            + '-W %d:00 ' % PAR.STEPTIME
            + '-J "%s' % PAR.TITLE
            + self.launch_args(hosts)
            # Fixed: the directory was spelled 'wrapper/'; every other
            # launcher in this code base uses 'wrappers/'.
            + findpath('seisflows.system') + '/' + 'wrappers/run '
            + PATH.OUTPUT + ' '
            + classname + ' '
            + funcname + ' '
            + PAR.ENVIRONS)
def run_single(self, classname, method, hosts='all', **kwargs):
    """ Submits one ``bsub`` job array of size one and blocks until done

    :param classname: forwarded to the wrapper and to ``self.job_status``
    :param method: forwarded to the wrapper and to ``self.job_status``
    :param hosts: accepted but not used by this method
    :param kwargs: forwarded to ``self.checkpoint``
    """
    # Fixed: `args` was never defined in this scope (NameError).  No
    # positional task arguments exist in this signature, hence the empty
    # tuple.
    self.checkpoint(PATH.OUTPUT, classname, method, (), kwargs)

    # Fixed: the wrapper path used to be assembled as
    # '<system path> /wrapper/run' (space inside the path, and 'wrapper'
    # where every other launcher uses 'wrappers').
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/run'

    bsub_options = [
        'bsub %s ' % PAR.LSFARGS,
        '-n %d ' % PAR.NPROC,
        '-R "span[ptile=%d]" ' % PAR.NODESIZE,
        '-W %d:00 ' % PAR.TASKTIME,
        # Fixed: close the double quote around the job name and separate
        # it from the following option.
        '-J "%s[%d-%d]" ' % (PAR.TITLE, 1, 1),
        '-o %s ' % (PATH.WORKDIR + '/' + 'output.lsf/' + '%J'),
    ]
    task_args = [
        '%s ' % wrapper,
        '%s ' % PATH.OUTPUT,
        '%s ' % classname,
        '%s ' % method,
        '%s ' % PAR.ENVIRONS,
    ]
    stdout = check_output(''.join(bsub_options + task_args), shell=True)

    # keep track of job ids
    jobs = self.job_id_list(stdout, ntask=1)

    isdone = False
    while not isdone:
        # wait 30 seconds before checking status again
        time.sleep(30)
        self.timestamp()
        isdone, jobs = self.job_status(classname, method, jobs)
def submit(self, workflow):
    """ Creates working directories, checkpoints the workflow and submits it
    with ``qsub``

    :param workflow: object whose ``checkpoint()`` is called before
        submission
    """
    # create scratch directories
    unix.mkdir(PATH.SCRATCH)
    unix.mkdir(PATH.SYSTEM)

    # create output directories
    unix.mkdir(PATH.OUTPUT)
    unix.mkdir('/'.join([PATH.WORKDIR, 'output.pbs']))

    workflow.checkpoint()

    # PAR.WALLTIME is split into hours and minutes for the walltime field
    hours = PAR.WALLTIME/60
    minutes = PAR.WALLTIME%60
    walltime = 'walltime=%02d:%02d:00 ' % (hours, minutes)

    # one chunk with PAR.NODESIZE cpus and mpi processes
    chunk = '-l select=1:ncpus=%d:mpiprocs=%d ' % (PAR.NODESIZE, PAR.NODESIZE)

    command = 'qsub '
    command += '%s ' % PAR.PBSARGS
    command += chunk
    command += '-l %s ' % walltime
    command += '-N %s ' % PAR.TITLE
    command += '-j oe '
    command += '-o %s ' % (PATH.WORKDIR + '/' + 'output.log')
    command += '-V '
    command += ' -- '
    command += findpath('seisflows.system') + '/' + 'wrappers/submit '
    command += PATH.OUTPUT
    call(command)
def job_array_cmd(self, classname, method, hosts):
    """ Returns the ``qsub`` command line for a job array

    :param classname: appended after ``PATH.OUTPUT``
    :param method: appended after ``classname``
    :param hosts: forwarded to ``self.job_array_args``
    :returns: the complete command as a single string
    """
    nodes = math.ceil(PAR.NTASK/float(PAR.NODESIZE))
    ncpus = PAR.NPROC
    mpiprocs = PAR.NPROC

    # PAR.TASKTIME is split into hours and minutes for the walltime field
    hours = PAR.TASKTIME/60
    minutes = PAR.TASKTIME%60
    walltime = 'walltime=%02d:%02d:00 ' % (hours, minutes)

    # Fixed: the '%' operator was missing after this format string, so the
    # string itself was called with the tuple (TypeError).
    select = '-l select=%d:ncpus=%d:mpiprocs=%d ' % (nodes, ncpus, mpiprocs)

    # Fixed: a stray comma after findpath(...) made the return value a tuple
    # and applied unary '+' to a string (TypeError).
    environs = 'PYTHONPATH=' + findpath('seisflows.system') + ',' + PAR.ENVIRONS

    # NOTE(review): '-J' and '-o' are emitted here and may be emitted again
    # by self.job_array_args(hosts) -- confirm which copy is wanted.
    return ('qsub '
            + '%s ' % PAR.PBSARGS
            + select
            + '-l %s ' % walltime
            + '-J 0-%s ' % (PAR.NTASK-1)
            + '-N %s ' % PAR.TITLE
            + '-o %s ' % (PATH.WORKDIR + '/' + 'output.pbs/' + '$PBS_ARRAYID')
            + '-r y '
            + '-j oe '
            + '-V '
            + self.job_array_args(hosts)
            + PATH.OUTPUT + ' '
            + classname + ' '
            + method + ' '
            + environs)
def run_single(self, classname, method, *args, **kwargs):
    """ Runs task a single time

    Submits a one-element ``sbatch`` job array (``--array=0-0``) that runs
    ``wrappers/run`` with ``SEISFLOWS_TASKID=0`` as its last argument, then
    polls ``self.job_array_status`` every 5 seconds until it reports
    completion.

    :param classname: forwarded to the wrapper and to the status query
    :param method: forwarded to the wrapper and to the status query
    :param args: forwarded to ``self.checkpoint``
    :param kwargs: forwarded to ``self.checkpoint``
    """
    self.checkpoint(PATH.OUTPUT, classname, method, args, kwargs)

    # every piece carries its own trailing space
    sbatch_options = [
        'sbatch %s ' % PAR.SLURMARGS,
        '--job-name=%s ' % PAR.TITLE,
        '--nodes=%d ' % math.ceil(PAR.NPROC / float(PAR.NODESIZE)),
        '--ntasks-per-node=%d ' % PAR.NODESIZE,
        '--ntasks=%d ' % PAR.NPROC,
        '--time=%d ' % PAR.TASKTIME,
        '--array=%d-%d ' % (0, 0),
        '--output %s ' % (PATH.WORKDIR + '/' + 'output.slurm/' + '%A_%a'),
    ]
    task_args = [
        '%s ' % (findpath('seisflows.system') + '/' + 'wrappers/run'),
        '%s ' % PATH.OUTPUT,
        '%s ' % classname,
        '%s ' % method,
        '%s ' % PAR.ENVIRONS,
        '%s ' % 'SEISFLOWS_TASKID=0',
    ]

    # submit job
    stdout = check_output(''.join(sbatch_options + task_args), shell=True)

    # keep track of job ids
    jobs = self.job_id_list(stdout, 1)

    # check job completion status
    isdone = False
    while not isdone:
        # wait a few seconds between queries
        time.sleep(5)
        isdone, jobs = self.job_array_status(classname, method, jobs)
def job_array_cmd(self, classname, funcname, hosts):
    """ Returns the ``sbatch`` command line for a job array

    The node count is ``ceil(PAR.NPROC / PAR.NODESIZE)`` and the time limit
    is ``PAR.STEPTIME``.

    :param classname: appended after ``PATH.OUTPUT``
    :param funcname: appended after ``classname``
    :param hosts: forwarded to ``self.job_array_args``
    :returns: the complete command as a single string
    """
    pieces = [
        'sbatch ',
        '%s ' % PAR.SLURMARGS,
        '--job-name=%s ' % PAR.TITLE,
        '--nodes=%d ' % math.ceil(PAR.NPROC / float(PAR.NODESIZE)),
        '--ntasks-per-node=%d ' % PAR.NODESIZE,
        '--ntasks=%d ' % PAR.NPROC,
        '--time=%d ' % PAR.STEPTIME,
        self.job_array_args(hosts),
        findpath('seisflows.system') + '/' + 'wrappers/run ',
        ' '.join([PATH.OUTPUT, classname, funcname, PAR.ENVIRONS]),
    ]
    return ''.join(pieces)
def run(self, classname, method, *args, **kwargs):
    """ Launches ``wrappers/run`` through ``srun --wait=0``

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param args: forwarded to ``self.checkpoint``
    :param kwargs: forwarded to ``self.checkpoint``
    """
    self.checkpoint(PATH.OUTPUT, classname, method, args, kwargs)

    wrapper = join(findpath('seisflows.system'), 'wrappers/run ')
    pieces = ['srun ', '--wait=0 ']
    # each remaining argument is followed by a single space
    for item in (wrapper, PATH.OUTPUT, classname, method, PAR.ENVIRONS):
        pieces.append('%s ' % item)
    call(''.join(pieces))
def run_single(self, classname, method, *args, **kwargs):
    """ Launches ``wrappers/run`` once through ``srun``

    Uses ``--wait=0 --ntasks=1 --nodes=1``.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param args: forwarded to ``self.checkpoint``
    :param kwargs: forwarded to ``self.checkpoint``
    """
    self.checkpoint(PATH.OUTPUT, classname, method, args, kwargs)

    srun_flags = 'srun ' + '--wait=0 ' + '--ntasks=1 ' + '--nodes=1 '
    wrapper = join(findpath('seisflows.system'), 'wrappers/run ')
    # each argument is followed by a single space
    task_args = ''.join(
        '%s ' % item
        for item in (wrapper, PATH.OUTPUT, classname, method, PAR.ENVIRONS))
    call(srun_flags + task_args)
def run_single(self, classname, method, scale_tasktime=1, *args, **kwargs):
    """ Runs task a single time

    Submits a one-element ``sbatch`` job array that runs ``wrappers/run``
    with ``SEISFLOWS_TASKID=0`` as its last argument, then polls
    ``self.job_array_status`` every 5 seconds until it reports completion.

    :param classname: forwarded to the wrapper and to the status query
    :param method: forwarded to the wrapper and to the status query
    :param scale_tasktime: multiplier applied to ``PAR.TASKTIME`` for the
        ``--time`` option
    :param args: forwarded to ``self.checkpoint``
    :param kwargs: forwarded to ``self.checkpoint``

    Exits the interpreter with status -1 after ten failed status queries.
    """
    self.checkpoint(PATH.OUTPUT, classname, method, args, kwargs)

    # submit job
    run_call = " ".join([
        'sbatch',
        '%s ' % PAR.SLURMARGS,
        '--account=%s' % PAR.ACCOUNT,
        '--job-name=%s' % PAR.TITLE,
        '--clusters=%s' % PAR.MAIN_CLUSTER,
        '--partition=%s' % PAR.MAIN_PARTITION,
        '--cpus-per-task=%s' % PAR.CPUS_PER_TASK,
        '--ntasks=%d' % PAR.NPROC,
        '--nodes=%d' % PAR.NODES,
        '--time=%d' % (PAR.TASKTIME * scale_tasktime),
        '--array=%d-%d' % (0, 0),
        '--output %s' % (PATH.WORKDIR + '/' + 'output.slurm/' + '%A_%a'),
        '%s' % (findpath('seisflows.system') + '/' + 'wrappers/run'),
        '%s' % PATH.OUTPUT,
        '%s' % classname,
        '%s' % method,
        '%s' % PAR.ENVIRONS,
        '%s' % 'SEISFLOWS_TASKID=0'
    ])

    if PAR.WITH_OPENMP:
        run_call = self.prep_openmp(run_call)

    stdout = check_output(run_call, shell=True)

    # keep track of job ids
    jobs = self.job_id_list(stdout, 1)

    # check job completion status
    check_status_error = 0
    while True:
        # wait a few seconds between queries
        time.sleep(5)

        # Occasionally the status query fails with CalledProcessError.
        # Allow a handful of failures in case it was a one-off problem.
        try:
            isdone, jobs = self.job_array_status(classname, method, jobs)
        except CalledProcessError:
            check_status_error += 1
            if check_status_error >= 10:
                # print() with a single string works on Python 2 and 3
                print("check job status with sacct failed 10 times")
                sys.exit(-1)
            # Fixed: `isdone` is unbound if the very first query fails, so
            # skip the completion test and query again.
            continue

        if isdone:
            return
def resubmit_cmd(self, classname, funcname, getnode):
    """ Returns the ``sbatch`` command line that resubmits one task

    :param classname: appended after ``PATH.OUTPUT``
    :param funcname: appended after ``classname``
    :param getnode: integer formatted into ``--export=TASKID=<getnode>``
    :returns: the complete command as a single string
    """
    nodes = math.ceil(PAR.NPROC / float(PAR.NODESIZE))
    output_file = PATH.WORKDIR + '/' + 'output.slurm/' + '%j'

    # every option carries its own trailing space
    sbatch_options = ''.join([
        '%s ' % PAR.SLURMARGS,
        '--job-name=%s ' % PAR.TITLE,
        '--nodes=%d ' % nodes,
        '--ntasks-per-node=%d ' % PAR.NODESIZE,
        '--ntasks=%d ' % PAR.NPROC,
        '--time=%d ' % PAR.TASKTIME,
        '--output=%s ' % output_file,
        '--export=TASKID=%d ' % getnode,
    ])
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/run '
    task = ' '.join([PATH.OUTPUT, classname, funcname, PAR.ENVIRONS])
    return 'sbatch ' + sbatch_options + wrapper + task
def _launch(self, classname, method, taskid=0):
    """ Starts ``wrappers/run`` in a subprocess for one task

    The child inherits a copy of the current environment with
    ``SEISFLOWS_TASKID`` set to ``str(taskid)``.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param taskid: value exported as ``SEISFLOWS_TASKID`` and passed to
        ``self.progress``
    :returns: the ``Popen`` object of the started process
    """
    child_env = dict(os.environ)
    child_env['SEISFLOWS_TASKID'] = str(taskid)

    self.progress(taskid)

    command = ' '.join([
        findpath('seisflows.system') + '/' + 'wrappers/run',
        PATH.OUTPUT,
        classname,
        method,
    ])
    return Popen(command, shell=True, env=child_env)
def run(self, classname, method, *args, **kwargs):
    """ Runs ``wrappers/run`` under ``srun --wait=0``

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param args: forwarded to ``self.checkpoint``
    :param kwargs: forwarded to ``self.checkpoint``
    """
    self.checkpoint(PATH.OUTPUT, classname, method, args, kwargs)

    command = 'srun ' + '--wait=0 '
    command += '%s ' % join(findpath('seisflows.system'), 'wrappers/run ')
    command += '%s ' % PATH.OUTPUT
    command += '%s ' % classname
    command += '%s ' % method
    command += '%s ' % PAR.ENVIRONS
    call(command)
def _launch(self, classname, funcname, itask=0):
    """ Starts ``wrappers/run`` in a subprocess for one task

    The child inherits a copy of the current environment with
    ``SEISFLOWS_TASKID`` set to ``str(itask)``.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param funcname: appended to the command line after ``classname``
    :param itask: value exported as ``SEISFLOWS_TASKID`` and passed to
        ``self.progress``
    :returns: the ``Popen`` object of the started process
    """
    self.progress(itask)

    # Fixed: the original extended `os.environ.copy().items()` with `+=`,
    # which only works where items() returns a list (Python 2); on Python 3
    # it is a view and the statement raises TypeError.  Copying into a dict
    # and setting the key gives the same environment on both.
    env = dict(os.environ)
    env['SEISFLOWS_TASKID'] = str(itask)

    p = Popen(
        findpath('seisflows.system') + '/' + 'wrappers/run '
        + PATH.OUTPUT + ' '
        + classname + ' '
        + funcname,
        shell=True,
        env=env)

    return p
def submit(self, workflow):
    """ Prepares directories and log files, then submits the workflow with
    ``sbatch`` to ``PAR.ANCIL_CLUSTER`` / ``PAR.ANCIL_PARTITION``

    Existing ``output.log`` / ``error.log`` files in ``PATH.WORKDIR`` are
    preserved: the first time a log is found it is renamed to
    ``<log>_prior``; on later runs the existing ``<log>_prior`` and the
    current log are concatenated into a new ``<log>_prior``.

    :param workflow: object whose ``checkpoint()`` is called before
        submission
    """
    # create scratch directories
    unix.mkdir(PATH.SCRATCH)
    unix.mkdir(PATH.SYSTEM)

    # create output directories
    unix.mkdir(PATH.OUTPUT)
    unix.mkdir(PATH.WORKDIR + '/' + 'output.slurm')
    if not exists('./scratch'):
        unix.ln(PATH.SCRATCH, PATH.WORKDIR + '/' + 'scratch')

    # if resuming, keep the old log files so they don't get overwritten
    output_log = os.path.join(PATH.WORKDIR, 'output.log')
    error_log = os.path.join(PATH.WORKDIR, 'error.log')
    for current in (output_log, error_log):
        if not os.path.exists(current):
            continue

        previous = current + '_prior'
        staging = current + '_temp'

        if not os.path.exists(previous):
            os.rename(current, previous)
            continue

        # A prior log exists: move it aside, then rebuild it as
        # old prior + current log
        os.rename(previous, staging)
        with open(previous, 'w') as merged:
            for part in (staging, current):
                with open(part) as chunk:
                    merged.write(chunk.read())
        unix.rm(staging)

    workflow.checkpoint()

    sbatch_args = ['sbatch', '%s' % PAR.SLURMARGS]
    sbatch_args.append('--account=%s' % PAR.ACCOUNT)
    sbatch_args.append('--clusters=%s' % PAR.ANCIL_CLUSTER)
    sbatch_args.append('--partition=%s' % PAR.ANCIL_PARTITION)
    sbatch_args.append('--job-name=M_' + PAR.TITLE)
    sbatch_args.append('--output=%s' % output_log)
    sbatch_args.append('--error=%s' % error_log)
    sbatch_args.append('--ntasks=1')
    sbatch_args.append('--cpus-per-task=1')
    sbatch_args.append('--time=%d' % PAR.WALLTIME)
    sbatch_args.append(findpath('seisflows.system') + '/' + 'wrappers/submit ')
    sbatch_args.append(PATH.OUTPUT)
    call(" ".join(sbatch_args))
def _run_task(self, classname, method, taskid=0):
    """ Starts ``wrappers/run`` in a subprocess for one task

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param taskid: exported to the child as ``SEISFLOWS_TASKID`` and passed
        to ``self.progress``
    :returns: the ``Popen`` object of the started process
    """
    # copy of the current environment plus the task identifier
    child_env = dict(os.environ, SEISFLOWS_TASKID=str(taskid))

    self.progress(taskid)

    wrapper = findpath('seisflows.system') + '/' + 'wrappers/run '
    command = wrapper + ' '.join([PATH.OUTPUT, classname, method])
    return Popen(command, shell=True, env=child_env)
def _run_task(self, classname, method, taskid=0):
    """ Starts ``wrappers/run`` in a subprocess for one task

    The child inherits a copy of the current environment with
    ``SEISFLOWS_TASKID`` set to ``str(taskid)``.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param taskid: value exported as ``SEISFLOWS_TASKID`` and passed to
        ``self.progress``
    :returns: the ``Popen`` object of the started process
    """
    # Fixed: `os.environ.copy().items()` followed by `+= [...]` raises
    # TypeError on Python 3, where items() is a view rather than a list.
    # Building the dict directly is equivalent and works on both versions.
    env = dict(os.environ)
    env['SEISFLOWS_TASKID'] = str(taskid)

    self.progress(taskid)

    p = Popen(
        findpath('seisflows.system') + '/' + 'wrappers/run '
        + PATH.OUTPUT + ' '
        + classname + ' '
        + method,
        shell=True,
        env=env)

    return p
def run(self, classname, method, hosts='all', **kwargs):
    """ Launches ``wrappers/run`` through ``srun``

    Checkpoints and stores ``kwargs`` with ``self.save_kwargs`` first.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param hosts: 'all' issues ``srun --wait=0``; 'head' additionally
        passes ``--ntasks=1 --nodes=1``
    :raises KeyError: if ``hosts`` is neither 'all' nor 'head'
    """
    self.checkpoint()
    self.save_kwargs(classname, method, kwargs)

    if hosts not in ('all', 'head'):
        raise KeyError('Bad keyword argument: system.run: hosts')

    command = 'srun ' + '--wait=0 '
    if hosts == 'head':
        # run on head node: restrict srun to a single task on one node
        command += '--ntasks=1 ' + '--nodes=1 '
    command += join(findpath('seisflows.system'), 'wrappers/run ')
    command += PATH.OUTPUT + ' '
    command += classname + ' '
    command += method + ' '
    command += PAR.ENVIRONS
    call(command)
def run_single(self, classname, method, *args, **kwargs):
    """ Runs ``wrappers/run`` once under ``srun``

    The ``srun`` flags are ``--wait=0 --ntasks=1 --nodes=1``.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param args: forwarded to ``self.checkpoint``
    :param kwargs: forwarded to ``self.checkpoint``
    """
    self.checkpoint(PATH.OUTPUT, classname, method, args, kwargs)

    pieces = ['srun ', '--wait=0 ', '--ntasks=1 ', '--nodes=1 ']
    pieces.append('%s ' % join(findpath('seisflows.system'), 'wrappers/run '))
    pieces.append('%s ' % PATH.OUTPUT)
    pieces.append('%s ' % classname)
    pieces.append('%s ' % method)
    pieces.append('%s ' % PAR.ENVIRONS)
    call(''.join(pieces))
def job_array_cmd(self, classname, funcname, hosts):
    """ Returns the ``sbatch`` command line for a job array

    Each array element requests one node, one task and one GPU
    (``--gres=gpu:1``); the time limit is ``PAR.TASKTIME``.

    :param classname: appended after ``PATH.OUTPUT``
    :param funcname: appended after ``classname``
    :param hosts: forwarded to ``self.job_array_args``
    :returns: the complete command as a single string
    """
    # fixed per-element resources
    resources = ('--nodes=1 '
                 '--ntasks-per-node=1 '
                 '--ntasks=1 '
                 '--gres=gpu:1 ')

    command = 'sbatch '
    command += '%s ' % PAR.SLURMARGS
    command += '--job-name=%s ' % PAR.TITLE
    command += resources
    command += '--time=%d ' % PAR.TASKTIME
    command += self.job_array_args(hosts)
    command += findpath('seisflows.system') + '/' + 'wrappers/run '
    command += ' '.join([PATH.OUTPUT, classname, funcname, PAR.ENVIRONS])
    return command
def run(self, classname, method, hosts='all', **kwargs):
    """ Runs embarrassingly parallel tasks

    Launches ``wrappers/run`` on every host via ``wrappers/dsh`` or on the
    first host via ``ssh``.

    :param classname: appended to the command line after ``PATH.OUTPUT``
    :param method: appended to the command line after ``classname``
    :param hosts: 'all' uses the comma-joined result of ``self.hostlist()``;
        'head' uses ``self.hostlist()[0]`` with ``SEISFLOWS_TASK_ID=0``
        exported
    :param kwargs: forwarded to ``self.checkpoint``
    :raises KeyError: if ``hosts`` is neither 'all' nor 'head'
    """
    # Fixed: `args` was referenced without being defined (NameError).  The
    # signature accepts no positional task arguments, so an empty tuple is
    # checkpointed instead.
    self.checkpoint(PATH.OUTPUT, classname, method, (), kwargs)

    if hosts == 'all':
        # run all tasks
        call(findpath('seisflows.system') + '/' + 'wrappers/dsh '
             + ','.join(self.hostlist()) + ' '
             + findpath('seisflows.system') + '/' + 'wrappers/run '
             + PATH.OUTPUT + ' '
             + classname + ' '
             + method + ' '
             # Fixed: stray comma after findpath('seisflows') removed; it
             # left a unary '+' applied to a string (TypeError).
             + 'PYTHONPATH=' + findpath('seisflows') + ','
             + PAR.ENVIRONS)

    elif hosts == 'head':
        # run a single task
        call('ssh ' + self.hostlist()[0] + ' '
             + '"'
             + 'export SEISFLOWS_TASK_ID=0; '
             + join(findpath('seisflows.system'), 'wrappers/run ')
             + PATH.OUTPUT + ' '
             + classname + ' '
             + method + ' '
             # Fixed: same stray comma as in the 'all' branch.
             + 'PYTHONPATH=' + findpath('seisflows') + ','
             + PAR.ENVIRONS
             + '"')

    else:
        raise KeyError('Bad keyword argument: system.run: hosts')
def run_ancil(self, classname, method, *args, **kwargs):
    """ Submits a job array with ``sbatch`` to ``PAR.ANCIL_CLUSTER`` /
    ``PAR.ANCIL_PARTITION`` and waits for it

    The array range is ``0-((PAR.NTASK - 1) % PAR.NTASKMAX)``.  After
    submission ``self.job_array_status`` is polled every 5 seconds until it
    reports completion.

    :param classname: forwarded to the wrapper and to the status query
    :param method: forwarded to the wrapper and to the status query
    :param args: forwarded to ``self.checkpoint``
    :param kwargs: forwarded to ``self.checkpoint``

    Exits the interpreter with status -1 after ten failed status queries.
    """
    self.checkpoint(PATH.OUTPUT, classname, method, args, kwargs)

    # submit job
    stdout = check_output(" ".join([
        'sbatch',
        '%s' % PAR.SLURMARGS,
        '--job-name=%s' % PAR.TITLE,
        '--tasks=%d' % 1,
        '--cpus-per-task=%d' % PAR.CPUS_PER_TASK,
        '--account=%s' % PAR.ACCOUNT,
        '--clusters=%s' % PAR.ANCIL_CLUSTER,
        '--partition=%s' % PAR.ANCIL_PARTITION,
        '--time=%d' % PAR.ANCIL_TASKTIME,
        '--array=%d-%d' % (0, (PAR.NTASK - 1) % PAR.NTASKMAX),
        '--output %s' % (PATH.WORKDIR + '/' + 'output.slurm/' + '%A_%a'),
        '%s' % (findpath('seisflows.system') + '/' + 'wrappers/run'),
        '%s' % PATH.OUTPUT,
        '%s' % classname,
        '%s' % method,
        '%s' % PAR.ENVIRONS
    ]), shell=True)

    # keep track of job ids
    jobs = self.job_id_list(stdout, 1)

    # check job completion status
    check_status_error = 0
    while True:
        # wait a few seconds between queries
        time.sleep(5)

        # Occasionally the status query fails with CalledProcessError.
        # Allow a handful of failures in case it was a one-off problem.
        try:
            isdone, jobs = self.job_array_status(classname, method, jobs)
        except CalledProcessError:
            check_status_error += 1
            if check_status_error >= 10:
                # print() with a single string works on Python 2 and 3
                print("check job status with sacct failed 10 times")
                sys.exit(-1)
            # Fixed: `isdone` is unbound if the very first query fails, so
            # skip the completion test and query again.
            continue

        if isdone:
            return
def submit(self, workflow):
    """ Changes into ``PATH.WORKDIR``, prepares directories, checkpoints the
    workflow and submits it with ``sbatch``

    A ``scratch`` link to ``PATH.SCRATCH`` is created inside
    ``PATH.WORKDIR`` when ``./scratch`` does not exist yet.

    :param workflow: object whose ``checkpoint()`` is called before
        submission
    """
    unix.cd(PATH.WORKDIR)
    if not exists('./scratch'):
        unix.ln(PATH.SCRATCH, PATH.WORKDIR + '/' + 'scratch')
    unix.mkdir(PATH.OUTPUT)

    workflow.checkpoint()

    # every option carries its own trailing space
    sbatch_options = ''.join([
        '%s ' % PAR.SLURMARGS,
        '--partition=%s ' % PAR.PARTITION,
        '--job-name=%s ' % PAR.TITLE,
        '--output %s ' % (PATH.WORKDIR + '/' + 'output.log'),
        '--cpus-per-task=%d ' % PAR.NPROC,
        '--ntasks=%d ' % PAR.NTASK,
        '--time=%d ' % PAR.WALLTIME,
    ])
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/submit '
    call('sbatch ' + sbatch_options + wrapper + PATH.OUTPUT)
def submit(self, workflow):
    """ Creates and enters ``PATH.OUTPUT``, checkpoints and submits the
    workflow with ``sbatch`` on a single node

    ``PAR.NGPU`` sets the tasks per node, tasks per socket and GPU count.

    :param workflow: accepted but not used by this method
    """
    unix.mkdir(PATH.OUTPUT)
    unix.cd(PATH.OUTPUT)

    self.checkpoint()

    # a single node is always requested
    nnodes = 1

    command = 'sbatch '
    command += '%s ' % PAR.SLURMARGS
    command += '--job-name=%s ' % PAR.TITLE
    command += '--output=%s ' % (PATH.WORKDIR + '/' + 'output.log')
    command += '--nodes %d ' % nnodes
    command += '--ntasks-per-node=%d ' % PAR.NGPU
    command += '--ntasks-per-socket=%d ' % PAR.NGPU
    command += '--gres=gpu:%d ' % PAR.NGPU
    command += '--time=%d ' % PAR.WALLTIME
    command += findpath('seisflows.system') + '/' + 'wrappers/submit '
    command += PATH.OUTPUT
    call(command)
def submit(self, workflow):
    """ Creates working directories, checkpoints and submits the workflow
    with ``sbatch``

    :param workflow: accepted but not used by this method
    """
    # scratch directories first, then the output directory
    for directory in (PATH.SCRATCH, PATH.SYSTEM, PATH.OUTPUT):
        unix.mkdir(directory)

    self.checkpoint()

    # submit workflow; every option carries its own trailing space
    sbatch_options = [
        '%s ' % PAR.SLURMARGS,
        '--job-name=%s ' % PAR.TITLE,
        '--output=%s ' % (PATH.WORKDIR + '/' + 'output.log'),
        '--cpus-per-task=%d ' % PAR.NPROC,
        '--ntasks=%d ' % PAR.NTASK,
        '--time=%d ' % PAR.WALLTIME,
    ]
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/submit '
    call('sbatch ' + ''.join(sbatch_options) + wrapper + PATH.OUTPUT)
def write_sources(PAR, h, path='.'):
    """ Writes source information to text file

    Copies the ``specfem3d/FORCESOLUTION`` template found under
    ``findpath('seisflows.plugins')`` to ``DATA/FORCESOURCE`` (relative to
    the current working directory) and then overwrites selected parameters
    with ``setpar``.

    :param PAR: mapping providing ``PAR['F0']`` (written as ``f0``)
    :param h: object providing ``sx[0]``, ``sz[0]`` and ``ts`` (written as
        ``xs``, ``zs`` and ``ts``)
    :param path: accepted but not used by this function
    """
    # Fixed: the package name was misspelled 'sesiflows.plugins', so the
    # template lookup could not refer to the intended package.
    template = findpath('seisflows.plugins') + '/' + 'specfem3d/FORCESOLUTION'
    with open(template, 'r') as f:
        lines = f.readlines()

    # `filename` instead of `file`, which shadowed a builtin name
    filename = 'DATA/FORCESOURCE'
    with open(filename, 'w') as f:
        f.writelines(lines)

    # adjust coordinates
    setpar('xs', h.sx[0], filename)
    setpar('zs', h.sz[0], filename)
    setpar('ts', h.ts, filename)

    # adjust wavelet
    setpar('f0', PAR['F0'], filename)
def write_sources(coords, path='.', ws=1., suffix=''):
    """ Writes source information to text file

    Copies the ``solver/specfem2d/SOURCE`` template found under
    ``findpath('seisflows.plugins')`` to ``DATA/SOURCE<suffix>`` (relative
    to the current working directory) and then adjusts coordinates,
    amplitude and wavelet type with ``setpar``.

    TODO this has to be adapted for new versions of specfem because the
    source file format has changed

    :param coords: sequence of three values ``(sx, sy, sz)``; ``sx`` is
        written as ``xs`` and ``sy`` as ``zs``, ``sz`` is not used
    :param path: accepted but not used by this function
    :param ws: factor multiplied into the ``factor`` parameter of the file
    :param suffix: appended to the output file name
    """
    sx, sy, sz = coords

    template = findpath('seisflows.plugins') + '/' + 'solver/specfem2d/SOURCE'
    with open(template, 'r') as f:
        lines = f.readlines()

    filename = 'DATA/SOURCE' + suffix
    with open(filename, 'w') as f:
        f.writelines(lines)

    # adjust source coordinates
    setpar('xs', sx, filename)
    setpar('zs', sy, filename)

    # adjust source amplitude (best effort: skipped when 'factor' cannot be
    # read or converted)
    try:
        fs = float(getpar('factor', filename))
        fs *= ws
        setpar('factor', str(fs), filename)
    except Exception:
        # Fixed: was a bare `except:`, which also swallowed
        # KeyboardInterrupt and SystemExit.
        pass

    # adjust source wavelet
    # time_function_type codes as listed in the original comments:
    #   1 Ricker, 2 first derivative of Gaussian, 3 Gaussian, 4 Dirac,
    #   5 Heaviside
    # Only the Ricker wavelet was ever selected (the other branches were
    # unreachable `elif 0:` cases).
    setpar('time_function_type', 1, filename)
def submit(self, workflow):
    """ Creates working directories, checkpoints the workflow and submits it
    with ``sbatch`` on one node

    :param workflow: object whose ``checkpoint()`` is called before
        submission
    """
    # scratch directories first, then output directories
    for directory in (PATH.SCRATCH,
                      PATH.SYSTEM,
                      PATH.OUTPUT,
                      PATH.WORKDIR + '/' + 'output.slurm'):
        unix.mkdir(directory)

    workflow.checkpoint()

    # prepare sbatch arguments; every option carries its own trailing space
    sbatch_options = [
        '%s ' % PAR.SLURMARGS,
        '--job-name=%s ' % PAR.TITLE,
        '--output %s ' % (PATH.WORKDIR + '/' + 'output.log'),
        '--ntasks-per-node=%d ' % PAR.NODESIZE,
        '--nodes=1 ',
        '--time=%d ' % PAR.WALLTIME,
    ]
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/submit '
    call('sbatch ' + ''.join(sbatch_options) + wrapper + PATH.OUTPUT)
def write_sources(coords, path='.', ws=1., suffix=''):
    """ Writes source information to text file

    The ``solver/specfem2d/SOURCE`` template under
    ``findpath('seisflows.plugins')`` is copied to ``DATA/SOURCE<suffix>``
    (relative to the current working directory); coordinates, amplitude and
    wavelet type are then set with ``setpar``.

    :param coords: sequence of three values ``(sx, sy, sz)``; ``sx`` is
        written as ``xs`` and ``sy`` as ``zs``, ``sz`` is not used
    :param path: accepted but not used by this function
    :param ws: factor multiplied into the ``factor`` parameter of the file
    :param suffix: appended to the output file name
    """
    sx, sy, sz = coords

    source_template = (findpath('seisflows.plugins') + '/'
                       + 'solver/specfem2d/SOURCE')
    with open(source_template, 'r') as f:
        lines = f.readlines()

    filename = 'DATA/SOURCE' + suffix
    with open(filename, 'w') as f:
        f.writelines(lines)

    # adjust source coordinates
    setpar('xs', sx, filename)
    setpar('zs', sy, filename)

    # adjust source amplitude; deliberately best effort, the file is left
    # untouched when 'factor' cannot be read or converted
    try:
        fs = float(getpar('factor', filename))
        fs *= ws
        setpar('factor', str(fs), filename)
    except Exception:
        # Fixed: narrowed from a bare `except:` so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        pass

    # adjust source wavelet: 1 selects the Ricker wavelet.  The original
    # also listed 2 (first derivative of Gaussian), 3 (Gaussian), 4 (Dirac)
    # and 5 (Heaviside) in unreachable `elif 0:` branches.
    setpar('time_function_type', 1, filename)
def submit(self, workflow):
    """ Creates working directories, checkpoints the workflow and submits it
    with ``sbatch``

    :param workflow: object whose ``checkpoint()`` is called before
        submission
    """
    # create scratch directories
    unix.mkdir(PATH.SCRATCH)
    unix.mkdir(PATH.SYSTEM)

    # create output directories
    unix.mkdir(PATH.OUTPUT)

    workflow.checkpoint()

    # submit workflow
    call('sbatch '
         + '%s ' % PAR.SLURMARGS
         + '--job-name=%s ' % PAR.TITLE
         + '--output=%s ' % (PATH.WORKDIR + '/' + 'output.log')
         + '--cpus-per-task=%d ' % PAR.NPROC
         + '--ntasks=%d ' % PAR.NTASK
         + '--time=%d ' % PAR.WALLTIME
         # Fixed: the original formatted the system path with '%s ' before
         # appending '/wrappers/submit', putting a space inside the path,
         # and left no space between 'wrappers/submit' and PATH.OUTPUT.
         + findpath('seisflows.system') + '/' + 'wrappers/submit '
         + PATH.OUTPUT)
def submit(self, workflow):
    """ Sets up directories, checkpoints the workflow and submits it with
    ``sbatch`` on one node

    :param workflow: object whose ``checkpoint()`` is called before
        submission
    """
    # create scratch directories
    unix.mkdir(PATH.SCRATCH)
    unix.mkdir(PATH.SYSTEM)

    # create output directories
    unix.mkdir(PATH.OUTPUT)
    unix.mkdir('/'.join([PATH.WORKDIR, 'output.slurm']))

    workflow.checkpoint()

    # prepare sbatch arguments
    command = 'sbatch '
    command += '%s ' % PAR.SLURMARGS
    command += '--job-name=%s ' % PAR.TITLE
    command += '--output %s ' % (PATH.WORKDIR + '/' + 'output.log')
    command += '--ntasks-per-node=%d ' % PAR.NODESIZE
    command += '--nodes=1 '
    command += '--time=%d ' % PAR.WALLTIME
    command += findpath('seisflows.system') + '/' + 'wrappers/submit '
    command += PATH.OUTPUT
    call(command)
def run_single(self, classname, method, *args, **kwargs):
    """ Runs task a single time

    A one-element ``sbatch`` job array (``--array=0-0``) runs
    ``wrappers/run`` with ``SEISFLOWS_TASKID=0`` as its final argument.
    ``self.job_array_status`` is then queried every 5 seconds until it
    reports completion.

    :param classname: forwarded to the wrapper and to the status query
    :param method: forwarded to the wrapper and to the status query
    :param args: forwarded to ``self.checkpoint``
    :param kwargs: forwarded to ``self.checkpoint``
    """
    self.checkpoint(PATH.OUTPUT, classname, method, args, kwargs)

    nodes = math.ceil(PAR.NPROC/float(PAR.NODESIZE))
    output_file = PATH.WORKDIR + '/' + 'output.slurm/' + '%A_%a'
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/run'

    # pieces are joined with single spaces; the trailing space reproduces
    # the one that follows the last argument
    command = ' '.join([
        'sbatch %s' % PAR.SLURMARGS,
        '--job-name=%s' % PAR.TITLE,
        '--nodes=%d' % nodes,
        '--ntasks-per-node=%d' % PAR.NODESIZE,
        '--ntasks=%d' % PAR.NPROC,
        '--time=%d' % PAR.TASKTIME,
        '--array=%d-%d' % (0, 0),
        '--output %s' % output_file,
        '%s' % wrapper,
        '%s' % PATH.OUTPUT,
        '%s' % classname,
        '%s' % method,
        '%s' % PAR.ENVIRONS,
        'SEISFLOWS_TASKID=0',
    ]) + ' '

    # submit job
    stdout = check_output(command, shell=True)

    # keep track of job ids
    jobs = self.job_id_list(stdout, 1)

    # check job completion status
    while True:
        # wait a few seconds between queries
        time.sleep(5)
        isdone, jobs = self.job_array_status(classname, method, jobs)
        if isdone:
            break
def submit(self, workflow):
    """ Creates working directories, checkpoints the workflow and submits it
    with ``bsub``

    Standard output goes to ``PATH.WORKDIR/output.log`` and standard error
    to ``PATH.WORKDIR/error.log``.

    :param workflow: object whose ``checkpoint()`` is called before
        submission
    """
    # scratch directories first, then output directories
    for directory in (PATH.SCRATCH,
                      PATH.SYSTEM,
                      PATH.OUTPUT,
                      PATH.WORKDIR + '/' + 'output.lsf'):
        unix.mkdir(directory)

    workflow.checkpoint()

    # prepare bsub arguments; every option carries its own trailing space
    bsub_options = [
        '%s ' % PAR.LSFARGS,
        '-J %s ' % PAR.TITLE,
        '-o %s ' % (PATH.WORKDIR + '/' + 'output.log'),
        '-n %d ' % PAR.NODESIZE,
        '-e %s ' % (PATH.WORKDIR + '/' + 'error.log'),
        '-R "span[ptile=%d]" ' % PAR.NODESIZE,
        '-W %d:00 ' % PAR.WALLTIME,
    ]
    wrapper = findpath('seisflows.system') + '/' + 'wrappers/submit '
    call('bsub ' + ''.join(bsub_options) + wrapper + PATH.OUTPUT)