def run_example(self):
    """Run this example from start to finish.

    Prepares the project's working directory and queues, switches into the
    example directory (``self.ex_dir``), recreates an empty ``logfiles``
    directory there and then runs, one after another, the example's job
    creation script, the recruiter (hiring ``self.num_workers`` workers)
    and the dispatcher (which ends once all jobs are done).
    """
    project = self.project
    ensure_wdir(project=project)
    empty_queues(project=project)

    os.chdir(self.ex_dir)

    # always start from a fresh, empty log directory
    if os.path.exists('logfiles'):
        rmtree('logfiles')
    os.mkdir('logfiles')

    create_cmd = './create_jobs.py {}'.format(project)
    call(create_cmd, shell=True)

    hire_cmd = 'fjd-recruiter --project {} hire {}'.format(
        project, self.num_workers)
    call(hire_cmd, shell=True)

    dispatch_cmd = 'fjd-dispatcher --project {} --end_when_jobs_are_done'.format(
        project)
    call(dispatch_cmd, shell=True)
def __init__(self, num_workers=1, project=None, local_only=False, curdir=''):
    """Set up the recruiter and work out which hosts it should use.

    :param num_workers: number of workers for the default ``localhost``
                        entry (converted with ``int``).
    :param project: project name; a falsy value means ``'default'``.
    :param local_only: if true, a ``remote.conf`` in the working directory
                       is ignored and only ``localhost`` is used.
    :param curdir: stored on the instance as ``self.curdir``.

    If ``<wdir>/remote.conf`` exists (and ``local_only`` is not set), the
    host list is replaced by the consecutively numbered sections
    ``host1``, ``host2``, ... found in it. Each needs a ``name`` and a
    ``workers`` option; sections lacking either are reported and skipped.
    """
    project = project or 'default'
    self.project = project
    self.curdir = curdir
    self.wdir = ensure_wdir(project)

    # default: everything runs on this machine
    self.hosts = [dict(name='localhost', workers=int(num_workers))]

    rc_loc = "{}/remote.conf".format(self.wdir)
    use_remote_conf = osp.exists(rc_loc) and not local_only
    if use_remote_conf:
        self.hosts = []
        remote_conf = configparser.ConfigParser()
        remote_conf.read(rc_loc)

        # walk host1, host2, ... until the first missing section
        host_no = 1
        while True:
            hid = "host{}".format(host_no)
            if not remote_conf.has_section(hid):
                break
            complete = (remote_conf.has_option(hid, 'name')
                        and remote_conf.has_option(hid, 'workers'))
            if complete:
                host = dict(name=remote_conf.get(hid, "name"),
                            workers=remote_conf.getint(hid, "workers"))
                self.hosts.append(host)
            else:
                print("[fjd-recruiter] Host section for {} is missing"
                      " name or workers option!".format(hid))
            host_no += 1

    if debug:
        host_names = ','.join(h['name'] for h in self.hosts)
        print("[fjd-recruiter] I am configured with hosts {}."
              .format(host_names))
def __init__(self, interval=.1, project=None):
    """Start a worker: announce it in the worker queue, then poll for jobs.

    The worker registers itself by touching
    ``<wdir>/workerqueue/<id>.worker`` and then loops forever: whenever
    ``self.next_job_on_pod()`` returns a job, that job is executed (with
    ``nice -n 9``), removed from the job pod and the worker re-announces
    itself. The loop has no exit condition, so this constructor does not
    return normally.

    A job file is either an ini-style config file with an ``executable``
    option in its ``[fjd]`` section (the executable is then called with the
    job file as its argument), or - if it has no section headers at all -
    a script that is run directly.

    :param interval: seconds to sleep between two looks at the job pod.
    :param project: project name; a falsy value means ``'default'``.
    """
    if not project:
        project = 'default'
    self.wdir = ensure_wdir(project)
    self.start_up()

    # announce my presence
    self.id = self.mk_id()
    print('[fjd-worker] Started with ID {id}.'.format(id=self.id))
    subprocess.call('touch {wdir}/workerqueue/{id}.worker'
                    .format(wdir=self.wdir, id=self.id), shell=True)

    # look for jobs
    while True:
        job = self.next_job_on_pod()
        if job:
            print('[fjd-worker] Worker {}: I found a job.'.format(self.id))
            # Check if job file is a config file (ini-style).
            # If it is, get executable from there, call it and pass it the
            # config file. If it is not, run the job file as a script.
            # We read the file first and close it, so no stale handles will
            # exist in case ConfigParser exits.
            with open('{}/jobpod/{}'.format(self.wdir, job), 'r') as jobfile:
                jobtxt = jobfile.read()
            # Compare the numeric major version; comparing the sys.version
            # string is a lexical comparison.
            if sys.version_info[0] < 3:
                jobtxt = unicode(jobtxt)  # noqa: F821 (Python 2 only)
            ini_fp = io.StringIO(jobtxt)
            conf = configparser.RawConfigParser()
            # readfp() was deprecated in Python 3.2 and removed in 3.12;
            # prefer read_file() and only fall back where it is missing.
            read_ini = getattr(conf, 'read_file', None) or conf.readfp
            try:
                read_ini(ini_fp)  # this raises in case it is not an .ini file
                exe = conf.get('fjd', 'executable')
                cmd = 'nice -n {nice} {exe} {wdir}/jobpod/{job}; '\
                    .format(nice=9, exe=exe, wdir=self.wdir, job=job)
            except configparser.MissingSectionHeaderError:
                cmd = 'nice -n {nice} {wdir}/jobpod/{job}'\
                    .format(nice=9, wdir=self.wdir, job=job)
            subprocess.call(cmd, shell=True)
            print('[fjd-worker] Worker {}: Finished my job.'.format(self.id))
            # remove the job from pod (signaling it is done) + re-announce myself
            subprocess.call('rm {wdir}/jobpod/{job}; touch {wdir}/workerqueue/{id}.worker'
                            .format(wdir=self.wdir, job=job, id=self.id), shell=True)
        time.sleep(interval)
def __init__(self, exe, repeat=1, parameters=None, project=None, num_workers=0,
             callback=None, curdir=''):
    """Create job scripts for a command, hire workers and dispatch the jobs.

    Exits the process with status 2 if no executable is given or if both
    ``repeat`` and ``parameters`` are used.

    :param exe: the command each job should run. It may contain the
                placeholders ``$1``, ``$2``, ... which are replaced by the
                ``#``-separated parts of a parameter set.
    :param repeat: number of identical jobs to create (used only when no
                   parameters are given).
    :param parameters: list of parameter sets, one job per entry. Each
                       entry is converted to ``str`` and split on ``#``;
                       parts without a matching placeholder in ``exe`` are
                       appended to the command.
    :param project: project name, passed on to the queue helpers, the
                    recruiter and the dispatcher.
    :param num_workers: number of local workers to hire; ``0`` means
                        "all CPUs but one" (at least one). Never more than
                        the number of CPUs.
    :param callback: passed on to the dispatcher.
    :param curdir: passed on to the recruiter.
    """
    # None instead of a mutable [] default; behaves the same for callers.
    if parameters is None:
        parameters = []

    if not exe or exe == '':
        print('[fjd] Please specify an executable command (--exe).')
        sys.exit(2)
    if repeat > 1 and len(parameters) > 0:
        print('[fjd] Only one of --repeat and --parameters can be set at a time.')
        sys.exit(2)

    empty_queues(project=project)
    self.wdir = ensure_wdir(project)

    # Any non-empty parameter list creates parameterised jobs; a single
    # parameter set used to be ignored silently.
    if len(parameters) > 0:
        for i, p in enumerate(parameters):
            job = '{}/jobqueue/job{}'.format(self.wdir, i)
            with open(job, 'w') as f:
                f.write('#!/bin/bash\n')
                cur_exe = exe
                ext_params = []
                for j, param in enumerate(str(p).split('#')):
                    placeholder = '${}'.format(j + 1)
                    if placeholder in cur_exe:
                        cur_exe = cur_exe.replace(placeholder, str(param))
                    else:
                        # append only this part, not the whole parameter set
                        ext_params.append(str(param))
                f.write('{exe} {params}'.format(exe=cur_exe,
                                                params=' '.join(ext_params)))
            os.chmod(job, 0o777)
    else:
        for i in range(repeat):
            job = '{}/jobqueue/job{}'.format(self.wdir, i)
            with open(job, 'w') as f:
                f.write('#!/bin/bash\n')
                f.write(exe)
            os.chmod(job, 0o777)

    if num_workers == 0:
        # leave one CPU free, but never end up with zero workers
        # (cpu_count() - 1 is 0 on a single-core machine)
        num_workers = max(1, cpu_count() - 1)
    num_workers = min(num_workers, cpu_count())

    recruiter = Recruiter(num_workers=num_workers, project=project, curdir=curdir)
    recruiter.hire()
    Dispatcher(project=project, callback=callback)
def __init__(self, interval=.1, project=None, end_when_jobs_are_done=True,
             callback=None, status_only=False):
    """Hand queued jobs to free workers until told (or allowed) to stop.

    Every ``interval`` seconds the job queue, job pod and worker queue
    directories below the project's working directory are listed. While
    jobs are waiting, each free worker gets one: the job file is moved
    into the job pod under the worker's file name and the worker's queue
    entry is removed. A one-line status is written to stdout on each pass.

    :param interval: seconds to sleep before each pass.
    :param project: project name; a falsy value means ``'default'``.
    :param end_when_jobs_are_done: if true, fire all workers, run the
                                   callback and stop once the job queue
                                   and the job pod are both empty.
    :param callback: called once when all jobs are done and the dispatcher
                     ends: any callable is called without arguments, a
                     string is run as a shell command. Anything else is
                     reported and ignored.
    :param status_only: if true, only show the status once; no jobs are
                        dispatched.
    """
    if not project:
        project = 'default'
    self.wdir = ensure_wdir(project)
    self.start_up()
    if not status_only:
        print('[fjd-dispatcher] Started on project "{}".'.format(project))

    def signal_handler(signum, frame):
        ''' gently exiting, e.g. when CTRL-C was pressed.  '''
        sys.stdout.write('\n[fjd-dispatcher] Received Exit signal. Exiting ...\n')
        print('[fjd-dispatcher] Should I fire all workers in project {}? [y|N]'
              .format(project))
        if input().lower() in ["y", "yes"]:
            Recruiter(project=project).fire()
        sys.exit(0)
    signal.signal(signal.SIGINT, signal_handler)

    do_work = True
    while do_work:
        time.sleep(interval)
        if status_only:
            # just show info once, don't do anything else
            do_work = False
        jq = os.listdir('{}/jobqueue'.format(self.wdir))
        jp = os.listdir('{}/jobpod'.format(self.wdir))
        wq = os.listdir('{}/workerqueue'.format(self.wdir))
        self.sort_jobqueue(jq)
        num_workers = len(os.listdir('{}/screenrcs'.format(self.wdir)))
        if len(jq) > 0:
            # more jobs waiting for workers
            sys.stdout.write("\r[fjd-dispatcher] {} job(s) waiting in the queue."
                             " Currently {} worker(s) out of {} are free ... "
                             .format(len(jq), len(wq), num_workers))
            sys.stdout.flush()
            if not status_only:
                # pair up as many jobs and free workers as possible
                for _ in range(min(len(jq), len(wq))):
                    worker = wq.pop()
                    job = jq.pop()
                    # the job lands on the pod under the worker's file name
                    os.rename('{wdir}/jobqueue/{j}'.format(wdir=self.wdir, j=job),
                              '{wdir}/jobpod/{w}'.format(wdir=self.wdir, w=worker))
                    os.remove('{wdir}/workerqueue/{w}'.format(wdir=self.wdir, w=worker))
        elif len(jp) > 0:
            # some jobs are still running
            sys.stdout.write("\r[fjd-dispatcher] Job queue is empty. Waiting for remaining {} job(s) to finish ... ".format(len(jp)))
            sys.stdout.flush()
        else:
            # all jobs are done
            # (single-line literal: a string split over a physical line
            # break does not parse)
            sys.stdout.write("\r[fjd-dispatcher] Job queue is empty and all jobs have finished. ")
            sys.stdout.flush()
            if end_when_jobs_are_done:
                sys.stdout.write("\n")
                Recruiter(project=project).fire()
                do_work = False
                if callback:
                    # callable() also accepts bound methods, partials and
                    # builtins, which are not types.FunctionType instances
                    if callable(callback):
                        callback()
                    elif isinstance(callback, str):
                        subprocess.call(callback, shell=True)
                    else:
                        print('[fjd-dispatcher] Cannot use callback function, as it is neither function nor string, but {}'.format(type(callback)))
    if status_only:
        print('')
    self.wrap_up()
I make parameter configurations from four shuffled lists and let one job run 1000 parameter configurations (otherwise, the job queue becomes too large and it takes too long for fjd to regularly inspect it). ''' import sys import os import itertools import numpy as np import random from subprocess import call from fjd import Dispatcher from fjd.utils import ensure_wdir, empty_queues # clean up call('rm pbsjobs/pbsjob*', shell=True) call('rm brute*.log;', shell=True) ensure_wdir(project='brute') empty_queues(project='brute') # start 80 workers on 10 PBS nodes (8 on each) for node in range(1, 11, 1): pbsjob = '''# Shell for the job: #PBS -S /bin/bash # request 1 node, 8 cores #PBS -lnodes=1:cores8 # job requires at most n hours wallclock time #PBS -lwalltime=08:00:00 cd /home/nicolas/brute fjd-recruiter --project brute hire 8 python -c "import time; time.sleep(16*60*60)" # keep PBS job alive '''