def stagedir_name(self, stagedir=None):
    """Return the staging directory for this job.

    :Arguments:
      stagedir     explicit stage directory; if ``None``, the canonical
                   ``/scratch/<USER>/<jobdir>`` location is built with
                   :func:`pathjoin`.
    :Returns: the stage directory path, or ``None`` when the queuing
              system is 'Local' (no staging required).
    """
    # '==' not 'is': 'is' compares object identity and only matched the
    # literal 'Local' by CPython string-interning accident.
    if self.queuingsystem == 'Local':
        return None
    if stagedir is None:
        # use canonical setup
        stagedir = pathjoin('/scratch', os.environ['USER'], self.jobdir_name)
    return stagedir
def stagedir_name(self, stagedir=None):
    """Return the stage directory, or ``None`` for the 'Local' queuing system.

    When *stagedir* is not supplied, the canonical
    ``/scratch/<USER>/<jobdir>`` path is constructed.
    """
    # String equality must use '==': the original 'is' comparison relied
    # on CPython interning of the 'Local' literal and is not guaranteed.
    if self.queuingsystem == 'Local':
        return None
    if stagedir is None:
        # use canonical setup
        stagedir = pathjoin('/scratch', os.environ['USER'], self.jobdir_name)
    return stagedir
def unstage(self):
    """Copy results back from the stage dir to the start dir.

    Shell-style glob patterns are allowed in the output file values.
    Destination files are silently replaced.  A no-op for the 'Local'
    queuing system.
    """
    self.__MODE__ = "unstage"
    # '==' not 'is': identity comparison with a string literal is unreliable
    if self.queuingsystem == 'Local':
        return
    import glob
    # make result directories; paths may be absolute, hence sanitize=False
    self._make_all_dirs(self.startdir, self.output, sanitize=False)
    for key, p in self.output.items():
        src = self.filenames[key]        # always relative to stagedir
        srcdir = os.path.dirname(p)
        destdir = pathjoin(self.startdir, srcdir, sanitize=False)  # may be absolute
        self.msg("item=%(key)s: looking for %(p)s [=%(src)s]..." % locals())
        for srcpath in glob.glob(src):
            srcname = os.path.basename(srcpath)
            destpath = pathjoin(destdir, srcname, sanitize=False)
            self.msg("item=%(key)s: copying %(srcpath)s" % locals(), newline=False)
            shutil.copyfile(srcpath, destpath)   # silently replaces files!
            self.msg(" --> %(destpath)s" % locals())
def _make_all_dirs(self, topdir, filedict, **kwargs):
    """Create directories under *topdir*, based on paths in *filedict*.

    *kwargs* are passed through to :func:`pathjoin` (e.g. ``sanitize``,
    ``refdir``).  Pre-existing directories are accepted silently; any
    other :exc:`os.error` propagates.
    """
    for key, p in filedict.items():
        srcdir = os.path.dirname(p)
        destdir = pathjoin(topdir, srcdir, **kwargs)
        try:
            os.makedirs(destdir)  # recursive
            self.msg("item=%(key)s: created dir %(destdir)s" % locals())
        # 'except ... as ...' (PEP 3110) instead of the Python-2-only
        # 'except os.error, e' comma syntax
        except os.error as e:
            if e.errno != errno.EEXIST:
                raise
def _make_all_dirs(self, topdir, filedict, **kwargs):
    """Create directories under *topdir*, based on paths in *filedict*.

    Keyword arguments are forwarded to :func:`pathjoin`.  An already
    existing directory is not an error (EEXIST is ignored); all other
    OS errors are re-raised.
    """
    for key, p in filedict.items():
        srcdir = os.path.dirname(p)
        destdir = pathjoin(topdir, srcdir, **kwargs)
        try:
            os.makedirs(destdir)  # recursive
            self.msg("item=%(key)s: created dir %(destdir)s" % locals())
        # modern exception syntax: 'except X as e' works on Python 2.6+
        # and 3.x, unlike the original 'except os.error, e'
        except os.error as e:
            if e.errno != errno.EEXIST:
                raise
def unstage(self):
    """Copy result files back to the start directory.

    Values in ``self.output`` may be shell-style glob patterns.  Files
    already present at the destination are silently overwritten.  Does
    nothing when the queuing system is 'Local'.
    """
    self.__MODE__ = "unstage"
    # use '==' for string comparison; 'is' tests identity, not equality
    if self.queuingsystem == 'Local':
        return
    import glob
    # make result directories, may be absolute!
    self._make_all_dirs(self.startdir, self.output, sanitize=False)
    for key, p in self.output.items():
        src = self.filenames[key]        # always relative to stagedir
        srcdir = os.path.dirname(p)
        destdir = pathjoin(self.startdir, srcdir, sanitize=False)  # may be absolute
        self.msg("item=%(key)s: looking for %(p)s [=%(src)s]..." % locals())
        for srcpath in glob.glob(src):
            srcname = os.path.basename(srcpath)
            destpath = pathjoin(destdir, srcname, sanitize=False)
            self.msg("item=%(key)s: copying %(srcpath)s" % locals(), newline=False)
            shutil.copyfile(srcpath, destpath)   # silently replaces files!
            self.msg(" --> %(destpath)s" % locals())
def __init__(self, *args, **kwargs):
    """Set up SGE job.

    :Arguments:
      inputfiles    dict of input files (with relative path to startdir);
                    globs are not supported.
      outputfiles   dict of result files or glob patterns (relative to
                    stagedir == relative to startdir)
      variables     key/value pairs that can be used in the script as
                    Job.variables[key]
      startdir      path to the directory where the input can be found
                    (must be nfs-mounted on node)
      stagedir      local scratch directory on node; all input files are
                    copied there.  The default should be ok.
      JOB_NAME      unique identifier (only set this if this NOT submitted
                    through the Gridengine queuing system AND if the files
                    should be copied to a scratch disk (i.e. staging
                    proceeds as it would for a SGE-submitted job).)
      SGE_TASK_ID   fake a task id (use with JOB_NAME)
    """
    self.__MODE__ = "init"  # current state, for self.msg
    super(Job, self).__init__(*args, **kwargs)
    # file dicts and script variables (setdefault also normalizes kwargs)
    self.input = kwargs.setdefault('inputfiles', {})
    self.output = kwargs.setdefault('outputfiles', {})
    self.variables = kwargs.setdefault('variables', {})
    # where we find input files and copy back results
    self.startdir = self.startdir_name(kwargs.setdefault('startdir', None))
    # local directory on node
    self.stagedir = self.stagedir_name(kwargs.setdefault('stagedir', None))
    # normalized filenames (always under stagedir)
    merged = joindicts(self.input, self.output)
    self.filenames = {
        key: pathjoin(self.stagedir, relpath, refdir=self.startdir)
        for key, relpath in merged.items()
    }
    self.statusmessage()
def __init__(self, *args, **kwargs):
    """Set up SGE job.

    :Arguments:
      inputfiles    dict of input files (with relative path to startdir);
                    globs are not supported.
      outputfiles   dict of result files or glob patterns (relative to
                    stagedir == relative to startdir)
      variables     key/value pairs that can be used in the script as
                    Job.variables[key]
      startdir      path to the directory where the input can be found
                    (must be nfs-mounted on node)
      stagedir      local scratch directory on node; all input files are
                    copied there.  The default should be ok.
      JOB_NAME      unique identifier (only set this if this NOT submitted
                    through the Gridengine queuing system AND if the files
                    should be copied to a scratch disk (i.e. staging
                    proceeds as it would for a SGE-submitted job).)
      SGE_TASK_ID   fake a task id (use with JOB_NAME)
    """
    self.__MODE__ = "init"  # current state, for self.msg
    super(Job, self).__init__(*args, **kwargs)
    self.input = kwargs.setdefault('inputfiles', {})
    self.output = kwargs.setdefault('outputfiles', {})
    self.variables = kwargs.setdefault('variables', {})
    # where we find input files and copy back results
    self.startdir = self.startdir_name(kwargs.setdefault('startdir', None))
    # local directory on node
    self.stagedir = self.stagedir_name(kwargs.setdefault('stagedir', None))
    # normalized filenames (always under stagedir); dict comprehension
    # instead of dict([...]) for idiom and consistency with the other
    # __init__ variant in this file
    self.filenames = {
        k: pathjoin(self.stagedir, path, refdir=self.startdir)
        for k, path in joindicts(self.input, self.output).items()
    }
    self.statusmessage()
def __init__(self, *args, **kwargs):
    """Set up the Job:

    job = Job(inputfiles=dict(...),outputfiles=dict(...),variables=dict(...),**kwargs)

    inputfiles and outputfiles are dictionaries with arbitrary keys; each
    item is a path to a file relative to the startdir (which by default is
    the directory from which the SGE job starts --- use the #$ -cwd flag!).
    If the files are not relative to the start dir then new directories are
    constructed under the stage dir; in this instance it is important that
    the user script ONLY uses the filenames in self.filenames: These have
    the proper paths of the local (staged) files for the script to operate
    on.

    With job.stage() inputfiles are copied to the stagedir on the node's
    scratch dir and sub directories are created as necessary; directories
    mentioned as part of the outputfiles are created, too.

    job.unstage() copies back all files mentioned in output files (again,
    use directories as part of the path as necessary) and create the
    directories in the startdir if needed.  For the outputfiles one can
    also use shell-style glob patterns, e.g.

      outfiles = {'all_dcd': '*.dcd', 'last_data':'*[5-9].dat'}

    Sensible defaults are automatically selected for startdir (cwd) and
    stagedir (/scratch/USER/JOB_NAME.JOB_ID).

    If the script is not run through SGE (i.e. the environment variable
    JOB_NAME is not set) then the script is run without staging; this is
    pretty much equivalent to using

      from staging.Local import Job

    :Input:
      inputfiles    dict of input files (with relative path to startdir);
                    globs are not supported.
      outputfiles   dict of result files or glob patterns (relative to
                    stagedir == relative to startdir)
      variables     key/value pairs that can be used in the script as
                    Job.variables[key]
      startdir      path to the directory where the input can be found
                    (must be nfs-mounted on node)
      stagedir      local scratch directory on node; all input files are
                    copied there.  The default should be ok.
      JOB_NAME      unique identifier (only set this if this NOT submitted
                    through the Gridengine queuing system AND if the files
                    should be copied to a scratch disk (i.e. staging
                    proceeds as it would for a SGE-submitted job).)
      SGE_TASK_ID   fake a task id (use with JOB_NAME)

    :Attributes:
      input         inputfiles dict (relative to startdir or absolute)
      output        outputfiles dict (relative to startdir or absolute,
                    can contain globs)
      filenames     merged dict of input and output, pointing to *staged*
                    files
      variables     variables dict

    :Methods:
      stage()       setup job on the nodes in stagedir
      unstage()     retrieve results to startdir
      cleanup()     remove all files on the node (rm -rf stagedir)
    """
    self.__MODE__ = "init"  # current state, for self.msg
    super(Job, self).__init__(*args, **kwargs)
    self.input = kwargs.setdefault('inputfiles', {})
    self.output = kwargs.setdefault('outputfiles', {})
    self.variables = kwargs.setdefault('variables', {})
    # where we find input files and copy back results
    self.startdir = self.startdir_name(kwargs.setdefault('startdir', None))
    # local directory on node
    self.stagedir = self.stagedir_name(kwargs.setdefault('stagedir', None))
    # normalized filenames (always under stagedir); dict comprehension
    # replaces the legacy dict([(k, v) ...]) construction
    self.filenames = {
        k: pathjoin(self.stagedir, path, refdir=self.startdir)
        for k, path in joindicts(self.input, self.output).items()
    }
    self.statusmessage()
class Job(SGE_job):
    """The Job class encapsulates the SGE job and allows for clean
    staging and unstaging."""

    def __init__(self, *args, **kwargs):
        """Set up the Job:

        job = Job(inputfiles=dict(...),outputfiles=dict(...),variables=dict(...),**kwargs)

        inputfiles and outputfiles are dictionaries with arbitrary keys;
        each item is a path to a file relative to the startdir (which by
        default is the directory from which the SGE job starts --- use the
        #$ -cwd flag!).  If the files are not relative to the start dir
        then new directories are constructed under the stage dir; in this
        instance it is important that the user script ONLY uses the
        filenames in self.filenames: These have the proper paths of the
        local (staged) files for the script to operate on.

        With job.stage() inputfiles are copied to the stagedir on the
        node's scratch dir and sub directories are created as necessary;
        directories mentioned as part of the outputfiles are created, too.

        job.unstage() copies back all files mentioned in output files
        (again, use directories as part of the path as necessary) and
        create the directories in the startdir if needed.  For the
        outputfiles one can also use shell-style glob patterns, e.g.

          outfiles = {'all_dcd': '*.dcd', 'last_data':'*[5-9].dat'}

        Sensible defaults are automatically selected for startdir (cwd)
        and stagedir (/scratch/USER/JOB_NAME.JOB_ID).

        If the script is not run through SGE (i.e. the environment
        variable JOB_NAME is not set) then the script is run without
        staging; this is pretty much equivalent to using

          from staging.Local import Job

        :Input:
          inputfiles    dict of input files (with relative path to
                        startdir); globs are not supported.
          outputfiles   dict of result files or glob patterns (relative
                        to stagedir == relative to startdir)
          variables     key/value pairs that can be used in the script as
                        Job.variables[key]
          startdir      path to the directory where the input can be
                        found (must be nfs-mounted on node)
          stagedir      local scratch directory on node; all input files
                        are copied there.  The default should be ok.
          JOB_NAME      unique identifier (only set this if this NOT
                        submitted through the Gridengine queuing system
                        AND if the files should be copied to a scratch
                        disk (i.e. staging proceeds as it would for a
                        SGE-submitted job).)
          SGE_TASK_ID   fake a task id (use with JOB_NAME)

        :Attributes:
          input         inputfiles dict (relative to startdir or absolute)
          output        outputfiles dict (relative to startdir or
                        absolute, can contain globs)
          filenames     merged dict of input and output, pointing to
                        *staged* files
          variables     variables dict

        :Methods:
          stage()       setup job on the nodes in stagedir
          unstage()     retrieve results to startdir
          cleanup()     remove all files on the node (rm -rf stagedir)
        """
        self.__MODE__ = "init"  # current state, for self.msg
        super(Job, self).__init__(*args, **kwargs)
        self.input = kwargs.setdefault('inputfiles', {})
        self.output = kwargs.setdefault('outputfiles', {})
        self.variables = kwargs.setdefault('variables', {})
        # where we find input files and copy back results
        self.startdir = self.startdir_name(kwargs.setdefault('startdir', None))
        # local directory on node
        self.stagedir = self.stagedir_name(kwargs.setdefault('stagedir', None))
        # normalized filenames (always under stagedir); dict comprehension
        # replaces the legacy dict([(k, v) ...]) construction
        self.filenames = {
            k: pathjoin(self.stagedir, path, refdir=self.startdir)
            for k, path in joindicts(self.input, self.output).items()
        }
        self.statusmessage()

    def statusmessage(self):
        """Print the base status plus start and stage directories."""
        super(Job, self).statusmessage()
        self.msg("startdir: %s" % self.startdir)
        self.msg("stagedir: %s" % self.stagedir)

    def startdir_name(self, startdir=None):
        """Return *startdir*, defaulting to the real path of the cwd
        (relies on the -cwd SGE flag)."""
        if startdir is None:
            startdir = os.path.realpath(os.path.curdir)
        return startdir

    def stagedir_name(self, stagedir=None):
        """Return the stage dir, or None for the 'Local' queuing system.

        Defaults to the canonical /scratch/<USER>/<jobdir> location.
        """
        # '==' not 'is': identity comparison with a string literal is
        # unreliable (only worked via CPython interning)
        if self.queuingsystem == 'Local':
            return None
        if stagedir is None:
            # use canonical setup
            stagedir = pathjoin('/scratch', os.environ['USER'], self.jobdir_name)
        return stagedir

    def stage(self):
        """Copy all input files to the scratch directory.

        Creates the stage dir (warning if it already exists), recreates
        the input/output directory structure there, copies the input
        files, and finally chdirs into the stage dir so that subsequent
        commands can assume staging is complete.
        """
        self.__MODE__ = "stage"
        # string equality, not identity
        if self.queuingsystem == 'Local':
            return
        stagedir = self.stagedir
        try:
            os.makedirs(stagedir)
            self.msg("Created stage dir %(stagedir)s." % locals())
        # PEP 3110 syntax instead of Python-2-only 'except os.error, e'
        except os.error as e:
            if e.errno == errno.EEXIST:
                self.msg("WARNING %(stagedir)s already exists." % locals())
            else:
                raise
        # copy input and preserve directory structure
        self._make_all_dirs(stagedir, self.input, refdir=self.startdir)
        # also create directories for the output files
        self._make_all_dirs(stagedir, self.output, refdir=self.startdir)
        for key, p in self.input.items():
            # copy input files; srcpath may be absolute (and ignores startdir!)
            srcpath = pathjoin(self.startdir, p, sanitize=False)
            destpath = self.filenames[key]   # ALWAYS under stagedir
            self.msg("item=%(key)s: copying %(srcpath)s" % locals(), newline=False)
            shutil.copyfile(srcpath, destpath)
            self.msg(" --> %(destpath)s" % locals())
        # finally, change current directory to the stage dir: all further
        # commands can assume that staging has been completed
        os.chdir(stagedir)
        self.msg("chdir to %(stagedir)s successful." % locals())