def getstatusoutput(self, cmd):
    '''
    Execute cmd and hand back its (status, output) pair.

    When self.host is None the command goes through
    commands.getstatusoutput; otherwise it is passed to ssh()
    together with self.host.
    '''
    if self.host is not None:
        result = ssh(cmd, self.host)
    else:
        result = commands.getstatusoutput(cmd)
    # unpack explicitly so anything other than a 2-item result still fails
    status, output = result
    return status, output
def getstatusoutput(self, cmd):
    '''
    Execute cmd and hand back its (status, output) pair.

    When self.host is None the command goes through
    commands.getstatusoutput; otherwise it is passed to ssh()
    together with self.host.
    '''
    if self.host is not None:
        result = ssh(cmd, self.host)
    else:
        result = commands.getstatusoutput(cmd)
    # unpack explicitly so anything other than a 2-item result still fails
    status, output = result
    return status, output
def qsub(self, jobfiles, QSUB=None, remotedir=None):
    '''
    Submit a job through PBS, or collect the results of a job that
    was submitted by an earlier call.

    jobfiles is a sequence that contains all the files needed for
    the job. The job script is jobfiles[0].

    QSUB is the command to use to submit the job, including options.
    If it is None, 'qsub -j oe -l cput=24:00:00,mem=499mb' is used.

    remotedir is the directory to run the job in on the remote
    server (self.host, assumed to be the same place as the PBS
    server host). It should be a unique directory for this job: the
    job files are copied into it, all the files in it are copied
    back, AND it is deleted after the job is done. If it is None the
    name saved in <jobfile>.remotedir is reused, otherwise a fresh
    'tmp/<unique name>' is generated.

    Bookkeeping files written next to the job script:
    <jobfile>.remotedir, .jobid, .qsubcmd, .pullback and .pushback.

    This function is designed to raise exceptions unless the job is
    finished without PBS errors. Then it returns True. If the
    PBS_DRYRUN environment variable is set and the job has not been
    submitted yet, it returns None without submitting.

    Raises:
      JobDone            <jobfile>.done exists
      JobInQueue, JobRunning, JobHold, JobErrorStatus,
      UnknownJobStatus   the saved job id is still listed by the queue
      PBS_MemoryExceeded, PBS_CputExceeded, PBS_Terminated,
      PBS_UknownError, LAMMPI_Error, FORTRAN_Error
                         the copied-back job output contains the
                         corresponding marker
      PBS_UnknownError   the submit command returned non-zero status
      JobSubmitted       the job was submitted by this call
    '''
    def write_text(path, text):
        # write text to path, closing the file even if the write fails
        handle = open(path, 'w')
        try:
            handle.write(text)
        finally:
            handle.close()

    jobfile = jobfiles[0]
    if QSUB is None:
        QSUB = 'qsub -j oe -l cput=24:00:00,mem=499mb'
    server = self.host

    jobdonefile = jobfile + '.done'
    if os.path.exists(jobdonefile):
        raise JobDone('That job is done. delete %s to resubmit it.'
                      % jobdonefile)

    remotedirfile = jobfile + '.remotedir'
    if remotedir is None and os.path.exists(remotedirfile):
        # reuse the directory recorded by an earlier call
        f = open(remotedirfile, 'r')
        try:
            remotedir = f.readline().strip()
        finally:
            f.close()
    elif remotedir is None:
        import tempfile
        fd, tmppath = tempfile.mkstemp(dir='.')
        # mkstemp actually makes the file; only its unique name is
        # wanted, so we close and delete it
        os.close(fd)
        os.unlink(tmppath)
        remotedir = 'tmp/%s' % os.path.basename(tmppath)
    # otherwise the remotedir argument passed in is used as is

    write_text(remotedirfile, remotedir)

    pushbackfile = jobfile + '.pushback'
    pullbackfile = jobfile + '.pullback'
    qsubfile = jobfile + '.qsubcmd'

    # see if job has been submitted before
    jobid_file = jobfile + '.jobid'
    if os.path.exists(jobid_file):
        f = open(jobid_file, 'r')
        try:
            jobid = f.readline().strip()
        finally:
            f.close()

        self.fastpoll()

        # see if job is in the queue still
        for job in self:
            if job['Job Id'] != jobid:
                continue
            state = job['job_state']
            if state == 'Q':
                if self.verbosity > 1:
                    print('%s still in the queue' % jobid)
                raise JobInQueue('%s still in the queue' % jobid)
            elif state == 'R':
                if self.verbosity > 1:
                    print('%s is running' % jobid)
                raise JobRunning('%s is running' % jobid)
            elif state == 'H':
                raise JobHold('%s is in Hold status' % jobid)
            elif state == 'E':
                raise JobErrorStatus('%s is in Error status' % jobid)
            elif state == 'C':
                print('%s is done' % jobid)
            else:
                raise UnknownJobStatus('%s is in unknown state: %s'
                                       % (jobid, state))

        if self.verbosity > 1:
            print('%s is not in the queue anymore' % jobid)

        # if you get here, it was not in the queue anymore (or it is
        # marked complete); now we need to copy the results back
        src = '%s:%s/' % (server, remotedir)
        if self.verbosity > 1:
            print('copying back remote results:  %s' % src)
        # NOTE(review): the rsync status is not checked before the
        # remote directory is removed below - confirm this is intended
        status, output = rsync(src, '.')

        # we have made it this far, we should now remove the remote
        # directory.
        if self.verbosity > 1:
            print('removing remote directory:  %s' % remotedir)
        status, output = ssh('rm -fr %s' % remotedir, server)
        if self.verbosity > 1:
            print('removing remote directory status:  %s' % (status,))

        # now remove some files we don't need anymore
        os.unlink(jobid_file)
        leftovers = ('pbs.%s.nodes' % jobid, pushbackfile, pullbackfile,
                     remotedirfile, qsubfile)
        for leftover in leftovers:
            if os.path.exists(leftover):
                os.unlink(leftover)

        # now lets try to check for batch errors like memory
        # exceeded or cput exceeded. Only the part of the job id
        # before the first '.' is the job number; the server name
        # after it may itself contain dots.
        jobnumber = jobid.split('.', 1)[0]
        joboutputfile = jobfile[0:15] + '.o%s' % jobnumber

        # this may not exist if user killed job before it started
        # I also assume here that output and error have been joined
        if os.path.exists(joboutputfile):
            f = open(joboutputfile, 'r')
            try:
                # now lets hunt for errors in the output file
                for line in f:
                    if '=>> PBS: job killed: mem' in line:
                        raise PBS_MemoryExceeded(line)
                    elif '=>> PBS: job killed: cput' in line:
                        raise PBS_CputExceeded(line)
                    elif 'Terminated' in line:
                        raise PBS_Terminated(line)
                    elif '=>> PBS:' in line:
                        # NOTE(review): spelled 'Uknown' here but
                        # 'Unknown' where submission fails - confirm
                        # which name the module actually defines
                        raise PBS_UknownError(line)
                    elif 'ERROR: LAM/MPI' in line:
                        # show the remaining diagnostics before raising
                        for line2 in f:
                            if 'ERROR' in line2:
                                print(line2)
                            if 'ssh' in line2:
                                print(line2)
                        raise LAMMPI_Error(line)
                    elif 'forrtl' in line or 'SIGSEGV' in line:
                        raise FORTRAN_Error(line)
            finally:
                # close the file even when one of the errors is raised
                f.close()
        return True

    if os.environ.get('PBS_DRYRUN', None) is not None:
        print('Dry run detected. exiting')
        return

    # this job needs to be submitted if you get here
    if self.verbosity > 1:
        print('Submitting job:')

    destination = '%s:%s' % (server, remotedir)

    # make sure destination directory exists
    status, output = ssh('mkdir -p %s' % remotedir, server)

    # 1 copy files to remote system
    status, output = rsync(jobfiles, destination)

    # 2 submit job
    cmd = '; '.join(['cd %s' % remotedir, '%s %s' % (QSUB, jobfile)])
    status, output = ssh(cmd, server)
    if status != 0:
        print('===================================')
        print(output)
        print('===================================')
        raise PBS_UnknownError

    # we should save the jobid
    write_text(jobid_file, output)

    # copy jobid file to remotedir so we can tell on that end
    # what this temp dir is for.
    rsync(jobid_file, destination)

    # get user and hostname to copy results back to
    import platform
    import stat
    hostname = platform.uname()[1]
    status, user = commands.getstatusoutput('whoami')

    # rwxr-xr-x, i.e. octal 755
    executable = (stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

    write_text(qsubfile, '%s %s\n' % (QSUB, jobfile))

    # script that fetches the results and removes the remote directory
    write_text(pullbackfile,
               '#!/bin/tcsh -x\n'
               + 'rsync -avz %s:%s/ .\n' % (server, remotedir)
               + 'ssh %s@%s rm -fr %s\n' % (user, server, remotedir)
               + '#end')
    os.chmod(pullbackfile, executable)

    # script, copied to the remote directory, that sends its contents
    # back to the local working directory
    write_text(pushbackfile,
               '#!/bin/tcsh -x\n'
               + 'rsync -avz . %s@%s:%s\n' % (user, hostname, os.getcwd()))
    os.chmod(pushbackfile, executable)
    rsync(pushbackfile, destination)

    raise JobSubmitted(output)
def qsub(self, jobfiles, QSUB=None, remotedir=None):
    '''
    Submit a job through PBS, or collect the results of a job that
    was submitted by an earlier call.

    jobfiles is a sequence that contains all the files needed for
    the job. The job script is jobfiles[0].

    QSUB is the command to use to submit the job, including options.
    If it is None, 'qsub -j oe -l cput=24:00:00,mem=499mb' is used.

    remotedir is the directory to run the job in on the remote
    server (self.host, assumed to be the same place as the PBS
    server host). It should be a unique directory for this job: the
    job files are copied into it, all the files in it are copied
    back, AND it is deleted after the job is done. If it is None the
    name saved in <jobfile>.remotedir is reused, otherwise a fresh
    'tmp/<unique name>' is generated.

    Bookkeeping files written next to the job script:
    <jobfile>.remotedir, .jobid, .qsubcmd, .pullback and .pushback.

    This function is designed to raise exceptions unless the job is
    finished without PBS errors. Then it returns True. If the
    PBS_DRYRUN environment variable is set and the job has not been
    submitted yet, it returns None without submitting.

    Raises:
      JobDone            <jobfile>.done exists
      JobInQueue, JobRunning, JobHold, JobErrorStatus,
      UnknownJobStatus   the saved job id is still listed by the queue
      PBS_MemoryExceeded, PBS_CputExceeded, PBS_Terminated,
      PBS_UknownError, LAMMPI_Error, FORTRAN_Error
                         the copied-back job output contains the
                         corresponding marker
      PBS_UnknownError   the submit command returned non-zero status
      JobSubmitted       the job was submitted by this call
    '''
    def write_text(path, text):
        # write text to path, closing the file even if the write fails
        handle = open(path, 'w')
        try:
            handle.write(text)
        finally:
            handle.close()

    jobfile = jobfiles[0]
    if QSUB is None:
        QSUB = 'qsub -j oe -l cput=24:00:00,mem=499mb'
    server = self.host

    jobdonefile = jobfile + '.done'
    if os.path.exists(jobdonefile):
        raise JobDone('That job is done. delete %s to resubmit it.'
                      % jobdonefile)

    remotedirfile = jobfile + '.remotedir'
    if remotedir is None and os.path.exists(remotedirfile):
        # reuse the directory recorded by an earlier call
        f = open(remotedirfile, 'r')
        try:
            remotedir = f.readline().strip()
        finally:
            f.close()
    elif remotedir is None:
        import tempfile
        fd, tmppath = tempfile.mkstemp(dir='.')
        # mkstemp actually makes the file; only its unique name is
        # wanted, so we close and delete it
        os.close(fd)
        os.unlink(tmppath)
        remotedir = 'tmp/%s' % os.path.basename(tmppath)
    # otherwise the remotedir argument passed in is used as is

    write_text(remotedirfile, remotedir)

    pushbackfile = jobfile + '.pushback'
    pullbackfile = jobfile + '.pullback'
    qsubfile = jobfile + '.qsubcmd'

    # see if job has been submitted before
    jobid_file = jobfile + '.jobid'
    if os.path.exists(jobid_file):
        f = open(jobid_file, 'r')
        try:
            jobid = f.readline().strip()
        finally:
            f.close()

        self.fastpoll()

        # see if job is in the queue still
        for job in self:
            if job['Job Id'] != jobid:
                continue
            state = job['job_state']
            if state == 'Q':
                if self.verbosity > 1:
                    print('%s still in the queue' % jobid)
                raise JobInQueue('%s still in the queue' % jobid)
            elif state == 'R':
                if self.verbosity > 1:
                    print('%s is running' % jobid)
                raise JobRunning('%s is running' % jobid)
            elif state == 'H':
                raise JobHold('%s is in Hold status' % jobid)
            elif state == 'E':
                raise JobErrorStatus('%s is in Error status' % jobid)
            elif state == 'C':
                print('%s is done' % jobid)
            else:
                raise UnknownJobStatus('%s is in unknown state: %s'
                                       % (jobid, state))

        if self.verbosity > 1:
            print('%s is not in the queue anymore' % jobid)

        # if you get here, it was not in the queue anymore (or it is
        # marked complete); now we need to copy the results back
        src = '%s:%s/' % (server, remotedir)
        if self.verbosity > 1:
            print('copying back remote results:  %s' % src)
        # NOTE(review): the rsync status is not checked before the
        # remote directory is removed below - confirm this is intended
        status, output = rsync(src, '.')

        # we have made it this far, we should now remove the remote
        # directory.
        if self.verbosity > 1:
            print('removing remote directory:  %s' % remotedir)
        status, output = ssh('rm -fr %s' % remotedir, server)
        if self.verbosity > 1:
            print('removing remote directory status:  %s' % (status,))

        # now remove some files we don't need anymore
        os.unlink(jobid_file)
        leftovers = ('pbs.%s.nodes' % jobid, pushbackfile, pullbackfile,
                     remotedirfile, qsubfile)
        for leftover in leftovers:
            if os.path.exists(leftover):
                os.unlink(leftover)

        # now lets try to check for batch errors like memory
        # exceeded or cput exceeded. Only the part of the job id
        # before the first '.' is the job number; the server name
        # after it may itself contain dots.
        jobnumber = jobid.split('.', 1)[0]
        joboutputfile = jobfile[0:15] + '.o%s' % jobnumber

        # this may not exist if user killed job before it started
        # I also assume here that output and error have been joined
        if os.path.exists(joboutputfile):
            f = open(joboutputfile, 'r')
            try:
                # now lets hunt for errors in the output file
                for line in f:
                    if '=>> PBS: job killed: mem' in line:
                        raise PBS_MemoryExceeded(line)
                    elif '=>> PBS: job killed: cput' in line:
                        raise PBS_CputExceeded(line)
                    elif 'Terminated' in line:
                        raise PBS_Terminated(line)
                    elif '=>> PBS:' in line:
                        # NOTE(review): spelled 'Uknown' here but
                        # 'Unknown' where submission fails - confirm
                        # which name the module actually defines
                        raise PBS_UknownError(line)
                    elif 'ERROR: LAM/MPI' in line:
                        # show the remaining diagnostics before raising
                        for line2 in f:
                            if 'ERROR' in line2:
                                print(line2)
                            if 'ssh' in line2:
                                print(line2)
                        raise LAMMPI_Error(line)
                    elif 'forrtl' in line or 'SIGSEGV' in line:
                        raise FORTRAN_Error(line)
            finally:
                # close the file even when one of the errors is raised
                f.close()
        return True

    if os.environ.get('PBS_DRYRUN', None) is not None:
        print('Dry run detected. exiting')
        return

    # this job needs to be submitted if you get here
    if self.verbosity > 1:
        print('Submitting job:')

    destination = '%s:%s' % (server, remotedir)

    # make sure destination directory exists
    status, output = ssh('mkdir -p %s' % remotedir, server)

    # 1 copy files to remote system
    status, output = rsync(jobfiles, destination)

    # 2 submit job
    cmd = '; '.join(['cd %s' % remotedir, '%s %s' % (QSUB, jobfile)])
    status, output = ssh(cmd, server)
    if status != 0:
        print('===================================')
        print(output)
        print('===================================')
        raise PBS_UnknownError

    # we should save the jobid
    write_text(jobid_file, output)

    # copy jobid file to remotedir so we can tell on that end
    # what this temp dir is for.
    rsync(jobid_file, destination)

    # get user and hostname to copy results back to
    import platform
    import stat
    hostname = platform.uname()[1]
    status, user = commands.getstatusoutput('whoami')

    # rwxr-xr-x, i.e. octal 755
    executable = (stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

    write_text(qsubfile, '%s %s\n' % (QSUB, jobfile))

    # script that fetches the results and removes the remote directory
    write_text(pullbackfile,
               '#!/bin/tcsh -x\n'
               + 'rsync -avz %s:%s/ .\n' % (server, remotedir)
               + 'ssh %s@%s rm -fr %s\n' % (user, server, remotedir)
               + '#end')
    os.chmod(pullbackfile, executable)

    # script, copied to the remote directory, that sends its contents
    # back to the local working directory
    write_text(pushbackfile,
               '#!/bin/tcsh -x\n'
               + 'rsync -avz . %s@%s:%s\n' % (user, hostname, os.getcwd()))
    os.chmod(pushbackfile, executable)
    rsync(pushbackfile, destination)

    raise JobSubmitted(output)