Example #1
def explore(self, cmdl): 
  """ Starts exploration of a pickled job folder. 
  
      Usage: 
      The most standard form is to simply load a job folder. All other
      job-dictionary magic functions will then use it.
      
      >>> explore path/to/job_folder_pickle

      If you have created a job-folder directly (rather than save it to
      disk), you can also load it as

      >>> explore jobfolder_variable 

      In case of conflict between a pathname and a variable name, you can use
      the more explicit version.

      >>> explore --file jobfolder
      >>> explore --expression jobfolder

      You can also load a job folder and keep only its successful or
      unsuccessful runs. To explore errors only, use:
     
      >>> explore errors path/to/job_pickle

      To explore only successful results, use:

      >>> explore results path/to/job_pickle
  """

  import argparse
  from os.path import join, dirname
  from pylada import interactive
  from pylada.misc import bugLev

  # options supported by all.
  parser = argparse.ArgumentParser(prog='%explore',
                     description='Opens a job-folder from file on disk.')
  group = parser.add_mutually_exclusive_group()
  group.add_argument( '--file', action="store_true", dest="is_file",
        help='JOBFOLDER is a path to a job-dictionary stored on disk.' )
  group.add_argument( '--expression', action="store_true",
        dest="is_expression", help='JOBFOLDER is a python expression.' )
  parser.add_argument( 'type', metavar='TYPE', type=str, default="", nargs='?',
         help="Optional. Specifies what kind of job folders will be explored. "\
              "Can be one of results, errors, all, running. "                  \
              "\"results\" are those job folders which have completed. "       \
              "\"errors\" are those job folders which are not \"running\" "    \
              "at the time of invokation and failed somehow. \"all\" means "   \
              "all job folders. By default, the dictionary is read as it was " \
              "saved. The modified job-folder is not saved to disk." )
  parser.add_argument( 'jobfolder', metavar='JOBFOLDER', type=str, default="",
         nargs='?',
         help='Job-dictionary variable or path to job folder saved to disk.' )


  # parse arguments
  try: args = parser.parse_args(cmdl.split())
  except SystemExit: return None
  else:
    if len(args.jobfolder) == 0                                                \
       and (args.type not in ["results", "errors", "all", "running"]):
      args.jobfolder = args.type
      args.type = ""

  if     len(args.jobfolder) == 0 \
     and (not args.is_file) \
     and (not args.is_expression) \
     and len(args.type) == 0:
    if interactive.jobfolder is None:
      print "No current job folders."
    elif interactive.jobfolder_path is None:
      print "Current position in job folder:", interactive.jobfolder.name
    else:
      print "Current position in job folder:", interactive.jobfolder.name
      print "Path to job folder: ", interactive.jobfolder_path
    return

  options = ['', "errors", "results", "all", 'running']
  if hasattr(self, "magic_qstat"): options.append("running")
  if args.type not in options: 
    print "Unknown TYPE argument {0}.\nTYPE can be one of {1}."                \
          .format(args.type, options)
    return

  # tries to open dictionary
  try: _explore_impl(self, args)
  except: return

  # now does special stuff if requested.
  # First checks for errors. Errors are jobs which cannot be determined as
  # running and have failed.
  if args.type == "errors": 
    if interactive.jobfolder_path is None: 
      print "No known path/file for current job-folder.\n"\
            "Please save to file first."
      return
    for name, job in interactive.jobfolder.iteritems():
      if job.is_tagged: continue
      directory = join(dirname(interactive.jobfolder_path), name)
      extract = job.functional.Extract(directory)
      # successful jobs are not errors.
      if extract.success: job.tag()
      else:
        # running jobs are not errors either.
        is_run = getattr(extract, 'is_running', False)

        # Jobs still waiting in the queue are not errors either
        # (change by Vladan, Jun 23, 2014).
        from pylada.ipython import qstat
        qstuff = qstat(self, name)
        is_inqueue = len(qstuff) > 0

        if is_run or is_inqueue: job.tag()
        # what's left is an error.
        else: job.untag()
        if bugLev >= 5:
          print 'ipython/explore errors: dir: %s  is_run: %s' \
            % (directory, is_run,)

  # Look only for jobs which are successful.
  elif args.type == "results": 
    if interactive.jobfolder_path is None: 
      print "No known path/file for current job-folder.\n"\
            "Please save to file first."
      return
    directory = dirname(interactive.jobfolder_path)
    for name, job in interactive.jobfolder.iteritems():
      if not job.functional.Extract(join(directory,name)).success: job.tag()
      else: job.untag()

  # Look only for jobs which are running (and can be determined as such).
  elif args.type == "running": 
    if interactive.jobfolder_path is None: 
      print "No known path/file for current job-folder.\n"\
            "Please save to file first."
      return
    for name, job in interactive.jobfolder.iteritems():
      directory = join(dirname(interactive.jobfolder_path), name)
      extract = job.functional.Extract(directory)
      is_run = getattr(extract, 'is_running', False)
      if is_run:
        # exploremod:
        #   import subprocess
        #   print job.jobNumber, job.jobId
        #   proc = subprocess.Popen(
        #     ['checkjob', str(job.jobNumber)],
        #     shell=False,
        #     cwd=wkDir,
        #     stdin=subprocess.PIPE,
        #     stdout=subprocess.PIPE,
        #     stderr=subprocess.PIPE,
        #     bufsize=10*1000*1000)
        #   (stdout, stderr) = proc.communicate()
        #   parse stdout to get status.  May be 'not found'.
        #   if idle or active: job.untag()
        #   else: job.tag()

        job.untag()
      else: job.tag()
      if bugLev >= 5: print 'ipython/explore running: dir: %s  is_run: %s' \
        % (directory, is_run,)

  # All jobs without restriction.
  elif args.type == "all": 
    if interactive.jobfolder_path is None: return
    for job in interactive.jobfolder.itervalues(): job.untag()
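
Note: the commented-out "exploremod" block in the "running" branch above only sketches how a direct scheduler query could refine the running/error decision. The helper below is a minimal standalone sketch of that idea. It assumes a Moab-style checkjob command whose output contains a "State:" line, and a job number such as the job.jobNumber the pseudocode mentions; neither is part of the pylada API shown here, so treat it purely as an illustration.

def checkjob_state(job_number, work_dir='.'):
  """ Returns the scheduler state of a job, or None if it cannot be found.

      Sketch only: assumes a Moab-style ``checkjob`` whose output contains
      a line of the form ``State: Idle`` or ``State: Running``.
  """
  import subprocess
  proc = subprocess.Popen( ['checkjob', str(job_number)],
                           shell=False, cwd=work_dir,
                           stdin=subprocess.PIPE,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE )
  stdout, stderr = proc.communicate()
  for line in stdout.split('\n'):
    line = line.strip()
    if line.startswith('State:'):
      # e.g. 'Idle', 'Running', 'Completed'
      return line.split(':', 1)[1].strip()
  return None   # job unknown to the scheduler, or the output format changed

With such a helper, the "running" branch could untag a job whose reported state is 'Idle' or 'Running' and tag everything else, which is what the commented pseudocode suggests.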
Example #2
def launch(self, event, jobfolders):
  """ Launch scattered jobs: one job = one pbs script. """
  from copy import deepcopy
  import os, re
  import subprocess
  from os.path import join, dirname, exists, basename
  from os import remove
  from .. import get_shell
  from ...misc import Changedir
  from ... import pbs_string, default_pbs, qsub_exe, default_comm
  from . import get_walltime, get_mppalloc, get_queues, scattered_script
  from pylada.misc import bugLev
  from pylada.misc import testValidProgram

  if bugLev >= 1: print "launch/scattered: event: %s" % (event,)
  shell = get_shell(self)

  pbsargs = deepcopy(dict(default_comm))
  pbsargs.update(default_pbs)
  pbsargs['ppn'] = event.ppn

  mppalloc = get_mppalloc(shell, event)
  if mppalloc is None: return

  # Set pbsargs['walltime'] to a string like '03:59:59'
  if not get_walltime(shell, event, pbsargs): return

  # Set pbsargs['queue'], pbsargs['account']
  if not get_queues(shell, event, pbsargs): return
  if bugLev >= 1: print "launch/scattered: pbsargs: %s" % (pbsargs,)
 

  # gets python script to launch in pbs.
  pyscript = scattered_script.__file__
  if bugLev >= 1: print "launch/scattered: pyscript: %s" % (pyscript,)
  if pyscript[-1] == 'c': pyscript = pyscript[:-1]   # change .pyc to .py

  # creates file names.
  hasprefix = getattr(event, "prefix", None)                               
  def pbspaths(directory, jobname, suffix):
    """ creates filename paths. """
    return join( join(directory,jobname),
                 '{0}-pbs{1}'.format(event.prefix, suffix) if hasprefix        \
                 else 'pbs{0}'.format(suffix) ) 
  # now  loop over jobfolders
  pbsscripts = []
  for current, path in jobfolders:
    if bugLev >= 1: print "launch/scattered: current: %s  path: %s" \
      % (current, path,)
    # creates directory.
    directory = dirname(path)
    with Changedir(directory) as pwd: pass
    # loop over executable folders in current jobfolder
    for name, job in current.root.iteritems():
      if bugLev >= 1:
        print 'launch/scattered: current: %s' % (current,)
        print 'launch/scattered: current.root: %s' % (current.root,)
        print 'launch/scattered: name: %s' % (name,)
        print 'launch/scattered: job: %s' % (job,)
        print 'launch/scattered: job.is_tagged: %s' % (job.is_tagged,)

      # skip job folders which are switched off (tagged)
      if job.is_tagged: continue

      ###### added by Peter Graf
      # avoid jobfolder which is already in the queue:
      from pylada.ipython import qstat
      qstuff = qstat(self, name)
      if (len(qstuff) > 0 and not event.force):
        status = [x.split()[2] for x in qstuff]
        # status is a list like ['Q'], ['R'], ['H'], ['C'], ['R', 'C'], etc.
        # 'R', 'H' and 'Q' mean the job is still in the queue; 'C' means it
        # has completed and is being dropped from the queue.
        # If needed, a prefix can be used to distinguish two jobs with the
        # same name.
        if len(set(status)&set('RHQ')) > 0:
          print "Job %s is in the queue, will not be re-queued" % name
          continue
      #######

      # avoid successful jobs, unless relaunching is specifically requested
      if hasattr(job.functional, 'Extract') and not event.force: 
        p = join(directory, name)
        extract = job.functional.Extract(p)
        if extract.success:
          print "Job {0} completed successfully. "                             \
                "It will not be relaunched.".format(name)                     
          continue                                                            

      # setup parameters for launching/running jobs
      pbsargs['n'] = mppalloc(job) if hasattr(mppalloc, "__call__")            \
                     else mppalloc                                            
      pbsargs['nnodes'] = (pbsargs['n'] + pbsargs['ppn'] - 1)                  \
                          // pbsargs['ppn']                                   
      pbsargs['err'] = pbspaths(directory, name, 'err')
      pbsargs['out'] = pbspaths(directory, name, 'out')
      pbsargs['name'] = name if len(name)                                      \
                        else "{0}-root".format(basename(path))
      pbsargs['directory'] = directory
      pbsargs['bugLev'] = bugLev
      pbsargs['testValidProgram'] = testValidProgram
      
      pbsargs['scriptcommand'] =                                               \
          "{0} --bugLev {bugLev} --testValidProgram {testValidProgram} "       \
          "--nbprocs {n} --ppn {ppn} --jobid={1} {2}"                          \
          .format(pyscript, name, path, **pbsargs)
      ppath = pbspaths(directory, name, 'script')
      if bugLev >=1:
        print "launch/scattered: ppath: \"%s\"" % (ppath,)
        print "launch/scattered: pbsargs: \"%s\"" % (pbsargs,)
      pbsscripts.append( ppath)

      # write pbs scripts
      with Changedir(join(directory, name)) as pwd: pass
      if exists(pbsscripts[-1]): remove(pbsscripts[-1])
      with open(pbsscripts[-1], "w") as file:
        string = pbs_string(**pbsargs) if hasattr(pbs_string, '__call__')      \
                 else pbs_string.format(**pbsargs) 
        # Peregrine no longer supports the "anynode" feature, so comment that
        # directive out.
        string = string.replace("#PBS -l feature=anynode",
                                "##PBS -l feature=anynode")
        if bugLev >= 1:
          print "launch/scattered: ===== start pbsscripts[-1]: %s =====" \
            % (pbsscripts[-1],)
          print '%s' % (string,)
          print "launch/scattered: ===== end pbsscripts[-1]: %s =====" \
            % (pbsscripts[-1],)
        lines = string.split('\n')
        omitTag = '# omitted for testValidProgram: '
        for line in lines:
          if testValidProgram is not None \
             and (re.match(r'^ *module ', line) \
                  or re.match(r'^\. .*/bin/activate$', line)):
            line = omitTag + line
          file.write(line + '\n')
      assert exists(pbsscripts[-1])

      # exploremod
      #   import subprocess
      #   if not event.nolaunch:
      #   move launch here:
      #
      #   if bugLev >= 1:
      #     print ...
      #
      #   proc = subprocess.Popen(
      #     [qsub_exe, pbsscripts[-1]],
      #     shell=False,
      #     cwd=wkDir,
      #     stdin=subprocess.PIPE,
      #     stdout=subprocess.PIPE,
      #     stderr=subprocess.PIPE,
      #     bufsize=10*1000*1000)
      #   (stdout, stderr) = proc.communicate()
      #   parse stdout to get jobNumber
      #   job.jobNumber = jobNumber
      #
      #   if bugLev >= 1:
      #     print ...

    print "Created {0} scattered jobs from {1}.".format(len(pbsscripts), path)

  if event.nolaunch: return
  # otherwise, launch.
  for script in pbsscripts:
    if bugLev >= 1:
      print "launch/scattered: launch: shell: %s" % (shell,)
      print "launch/scattered: launch: qsub_exe: %s" % (qsub_exe,)
      print "launch/scattered: launch: script: \"%s\"" % (script,)

    if testValidProgram is not None:
      cmdLine = '/bin/bash ' + script
    else:
      # qsub pbsscript (template is in config/mpi.py: pbs_string),
      # which sets up modules and invokes: python {scriptcommand}
      cmdLine = "{0} {1}".format(qsub_exe, script)

    nmerr = script + '.stderr'
    nmout = script + '.stdout'
    with open( nmerr, 'w') as ferr:
      with open( nmout, 'w') as fout:
        subprocess.call( cmdLine, shell=True, stderr=ferr, stdout=fout)
        # TODO: redirect stderr and stdout like this for every subprocess call.
    if os.path.getsize( nmerr) != 0:
      with open( nmerr) as fin:
        print 'launch/scattered: stderr: %s' % (fin.read(),)
    with open( nmout) as fin:
      print 'launch/scattered: stdout: %s' % (fin.read(),)
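
Note: the commented-out "exploremod" block near the end of the folder loop above sketches submitting each script as soon as it is written and recording the scheduler's job number on the job. The helper below is a minimal standalone sketch of that submission step; the parsing is an assumption, since qsub output differs between schedulers (TORQUE/PBS-style qsub typically prints an identifier like "12345.server" as the first token of stdout).

def submit_pbs_script(script, qsub_exe='qsub', work_dir='.'):
  """ Submits a PBS script and returns the scheduler's job identifier.

      Sketch only: assumes the qsub executable prints the job identifier
      as the first whitespace-separated token on stdout.
  """
  import subprocess
  proc = subprocess.Popen( [qsub_exe, script],
                           shell=False, cwd=work_dir,
                           stdin=subprocess.PIPE,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE )
  stdout, stderr = proc.communicate()
  if proc.returncode != 0:
    raise RuntimeError('qsub failed for %s: %s' % (script, stderr))
  tokens = stdout.split()
  return tokens[0] if tokens else None

A launcher built along these lines could store the returned identifier on the job (the pseudocode calls it job.jobNumber) and later pass it to a status query such as checkjob, instead of writing stdout and stderr to the .stdout/.stderr files as the loop above does.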