def simple_get_cancel_test(self):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    state = pidantic.get_state()
    self.assertEquals(state, PIDanticState.STATE_PENDING)
    pidantic.cancel_request()

def imediately_terminate_facorty_with_running_pgm_test(self):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/cat", process_name="cat", directory=tempdir)
    pidantic.start()
    factory.terminate()

def simple_get_state_start_test(self):
    name = "cat" + str(uuid.uuid4()).split("-")[0]
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/cat", process_name=name, directory=tempdir)
    pidantic.start()
    state = pidantic.get_state()
    self.assertEquals(state, PIDanticState.STATE_STARTING)
    factory.terminate()

def _setup_factory(self):
    if self.factory:
        return

    try:
        self.factory = SupDPidanticFactory(directory=self.pidantic_dir,
                                           name="epu-harness")
    except Exception:
        log.debug("Problem Connecting to SupervisorD", exc_info=True)
        raise HarnessException(
            "Could not connect to supervisord. Was epu-harness started?")

def simple_terminate_test(self):
    process_name = str(uuid.uuid4()).split("-")[0]
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 5000", process_name=process_name, directory=tempdir)
    pidantic.start()
    factory.poll()
    pidantic.terminate()
    while not pidantic.is_done():
        factory.poll()
    rc = pidantic.get_result_code()
    self.assertNotEqual(rc, 0)
    factory.terminate()

def __init__(self, eeagent_cfg, log=logging):
    self.log = log
    self.log.debug("Starting SupDExe")
    self._working_dir = eeagent_cfg.launch_type.supd_directory
    self._eename = eeagent_cfg.name
    supdexe = _set_param_or_default(eeagent_cfg.launch_type, 'supdexe', None)
    self._slots = int(eeagent_cfg.slots)
    self._factory = SupDPidanticFactory(directory=self._working_dir,
                                        name=self._eename,
                                        supdexe=supdexe)
    pidantic_instances = self._factory.reload_instances()
    self._known_pws = {}
    for name in pidantic_instances:
        pidantic = pidantic_instances[name]
        pw = PidWrapper(self, name)
        pw.set_pidantic(pidantic)
        self._known_pws[name] = pw
    self._state_change_cb = None
    self._state_change_cb_arg = None

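# A minimal sketch of the configuration object the constructor above expects
# (hypothetical names, not from the source): only the attributes actually read
# there are needed -- launch_type.supd_directory, launch_type.supdexe
# (optional), name, and slots.
class _ExampleLaunchType(object):
    supd_directory = "/tmp/supd-work"  # directory assumed for illustration
    supdexe = None                     # let the factory locate supervisord itself

class _ExampleEEAgentCfg(object):
    launch_type = _ExampleLaunchType()
    name = "eeagent-1"
    slots = 4

# exe = SupDExe(_ExampleEEAgentCfg())
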
def simple_api_walk_through_test(self):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    pidantic.start()
    state = pidantic.get_state()
    while not pidantic.is_done():
        factory.poll()
    factory.terminate()

def simple_get_state_exit_test(self):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    pidantic.start()
    while not pidantic.is_done():
        factory.poll()
    state = pidantic.get_state()
    self.assertEquals(state, PIDanticState.STATE_EXITED)
    factory.terminate()

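# The tests above all follow the same lifecycle: build a factory rooted in a
# scratch directory, ask it for a pidantic wrapping a command, start it, poll
# the factory until the process is done, and finally terminate the factory.
# A standalone sketch of that loop (import path assumed, not shown in the
# snippets above):
import tempfile

from pidantic.supd.pidsupd import SupDPidanticFactory  # assumed import path

def run_once(command, process_name="job"):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="example")
    pid = factory.get_pidantic(command=command, process_name=process_name,
                               directory=tempdir)
    pid.start()
    while not pid.is_done():
        factory.poll()
    rc = pid.get_result_code()
    factory.terminate()
    return rc

# run_once("/bin/sleep 1") should return 0; run_once("false") a non-zero code.
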
def simple_double_terminate_kill_test(self):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 5000", process_name="longnap", directory=tempdir)
    pidantic.start()
    factory.poll()
    pidantic.terminate()
    # The second terminate must raise; the else clause keeps the test failure
    # from being swallowed by the exception handler.
    try:
        pidantic.terminate()
    except Exception:
        pass
    else:
        self.fail("The terminate call should raise an error")
    while not pidantic.is_done():
        factory.poll()
    rc = pidantic.get_result_code()
    self.assertNotEqual(rc, 0)
    factory.terminate()

def simple_return_code_success_test(self):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="false", process_name="false", directory=tempdir)
    pidantic.start()
    while not pidantic.is_done():
        factory.poll()
    rc = pidantic.get_result_code()
    self.assertNotEqual(rc, 0)
    factory.terminate()

def __init__(self, eeagent_cfg):
    self._working_dir = eeagent_cfg.launch_type.supd_directory
    self._eename = eeagent_cfg.name
    supdexe = _set_param_or_default(eeagent_cfg.launch_type, 'supdexe', None)
    self._slots = int(eeagent_cfg.slots)
    self._factory = SupDPidanticFactory(directory=self._working_dir,
                                        name=self._eename,
                                        supdexe=supdexe)
    pidantic_instances = self._factory.reload_instances()
    self._known_pws = {}
    for name in pidantic_instances:
        pidantic = pidantic_instances[name]
        pw = PidWrapper(self, name)
        pw.set_pidantic(pidantic)
        self._known_pws[name] = pw

def two_processes_one_sup_test(self):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    true_pid = factory.get_pidantic(command="true", process_name="true", directory=tempdir)
    true_pid.start()
    false_pid = factory.get_pidantic(command="false", process_name="false", directory=tempdir)
    false_pid.start()
    while not false_pid.is_done() or not true_pid.is_done():
        factory.poll()
    rc = false_pid.get_result_code()
    self.assertNotEqual(rc, 0)
    rc = true_pid.get_result_code()
    self.assertEqual(rc, 0)
    factory.terminate()

def terminate_done_test(self):
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    pidantic.start()
    while not pidantic.is_done():
        factory.poll()
    try:
        pidantic.terminate()
        self.assertFalse(True, "should not get here")
    except PIDanticStateException:
        pass
    factory.terminate()

def _configure_workers(self):
    # TODO: if num_workers == 1, simply run one in-line (runs in a greenlet anyhow)
    if self.is_single_worker:
        from brick_worker import run_worker
        worker = run_worker(self.prov_port, self.resp_port)
        self.workers.append(worker)
    else:
        if os.path.exists(self.pidantic_dir):
            bdp = os.path.join(self.pidantic_dir, 'brick_dispatch')
            if os.path.exists(bdp):
                import zipfile, zlib
                with zipfile.ZipFile(os.path.join(bdp, 'archived_worker_logs.zip'), 'a', zipfile.ZIP_DEFLATED) as f:
                    names = f.namelist()
                    for x in [x for x in os.listdir(bdp) if x.startswith('worker_') and x not in names]:
                        fn = os.path.join(bdp, x)
                        f.write(filename=fn, arcname=x)
                        os.remove(fn)
        else:
            os.makedirs(self.pidantic_dir)

        self.factory = SupDPidanticFactory(name='brick_dispatch', directory=self.pidantic_dir)
        # Check for old workers - FOR NOW, TERMINATE THEM  TODO: These should be reusable...
        old_workers = self.factory.reload_instances()
        for x in old_workers:
            old_workers[x].cleanup()

        worker_cmd = 'bin/python coverage_model/brick_worker.py {0} {1}'.format(self.prov_port, self.resp_port)
        for x in xrange(self.num_workers):
            w = self.factory.get_pidantic(command=worker_cmd, process_name='worker_{0}'.format(x),
                                          directory=os.path.realpath(self.working_dir))
            w.start()
            self.workers.append(w)

        ready = False
        while not ready:
            self.factory.poll()
            for x in self.workers:
                s = x.get_state()
                if s is PIDanticState.STATE_STARTING:
                    break
                elif s is PIDanticState.STATE_RUNNING:
                    continue
                elif s is PIDanticState.STATE_EXITED:
                    self.shutdown()
                    raise SystemError('Error starting worker - cannot continue')
                else:
                    raise SystemError('Problem starting worker - cannot continue')
            ready = True

def restart_test(self):
    from time import sleep
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/cat", process_name="cat", directory=tempdir)
    pidantic.start()
    while not pidantic.get_state() == PIDanticState.STATE_RUNNING:
        factory.poll()
        sleep(1)
    original_pid = pidantic._supd.get_all_state()[0]['pid']
    pidantic.restart()
    while not pidantic.get_state() == PIDanticState.STATE_RUNNING:
        factory.poll()
        sleep(1)
    new_pid = pidantic._supd.get_all_state()[0]['pid']
    assert int(new_pid) != 0
    assert new_pid != original_pid

def state_change_callback_test(self):
    global cb_called
    cb_called = False

    def my_callback(arg):
        print "callback"
        global cb_called
        cb_called = True

    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    pidantic.set_state_change_callback(my_callback, None)
    pidantic.start()
    state = pidantic.get_state()
    while not pidantic.is_done():
        factory.poll()
    factory.terminate()
    assert cb_called

class SupDExe(object):

    def __init__(self, eeagent_cfg, log=logging):
        self.log = log
        self.log.debug("Starting SupDExe")
        self._working_dir = eeagent_cfg.launch_type.supd_directory
        self._eename = eeagent_cfg.name
        supdexe = _set_param_or_default(eeagent_cfg.launch_type, 'supdexe', None)
        self._slots = int(eeagent_cfg.slots)
        self._factory = SupDPidanticFactory(directory=self._working_dir,
                                            name=self._eename,
                                            supdexe=supdexe)
        pidantic_instances = self._factory.reload_instances()
        self._known_pws = {}
        for name in pidantic_instances:
            pidantic = pidantic_instances[name]
            pw = PidWrapper(self, name)
            pw.set_pidantic(pidantic)
            self._known_pws[name] = pw
        self._state_change_cb = None
        self._state_change_cb_arg = None

    def set_state_change_callback(self, cb, user_arg):
        self._state_change_cb = cb
        self._state_change_cb_arg = user_arg

        for name in self._known_pws:
            pw = self._known_pws[name]
            pw.set_state_change_callback(self._state_change_cb, self._state_change_cb_arg)

    def run(self, name, parameters):
        pw = PidWrapper(self, name)
        self._known_pws[name] = pw
        command = parameters['exec'] + " " + " ".join(parameters['argv'])

        dir = self._working_dir
        if "working_directory" in parameters:
            dir = parameters["working_directory"]

        pid = self._factory.get_pidantic(command=command, process_name=name, directory=dir)
        pw.set_pidantic(pid)
        if self._state_change_cb:
            pw.set_state_change_callback(self._state_change_cb, self._state_change_cb_arg)

        running_jobs = self._get_running()
        x = len(running_jobs)
        if x <= self._slots:
            pid.start()
        else:
            pid.cancel_request()
        return pw

    def get_known_pws(self):
        return self._known_pws

    def _remove_proc(self, proc_name):
        del self._known_pws[proc_name]

    def lookup_id(self, process_name, ignore_round=False):
        if ignore_round:
            process_upid, process_round = unmake_id(process_name)
            for name, proc in self._known_pws.iteritems():
                upid, round = unmake_id(name)
                if process_upid == upid:
                    return proc
            else:
                return None
        else:
            if process_name not in self._known_pws:
                return None
            return self._known_pws[process_name]

    def get_all(self):
        return self._known_pws

    def _get_running(self):
        running_states = [PidWrapper.RUNNING, PidWrapper.TERMINATING, PidWrapper.PENDING]
        a = self.get_all().values()
        running = [i for i in a if i.get_state() in running_states]
        return running

    def poll(self):
        return self._factory.poll()

    def terminate(self):
        self._factory.terminate()

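# Usage sketch for the class above (example values are hypothetical, not from
# the source). run() expects a parameters dict with 'exec' and 'argv' keys and
# an optional "working_directory"; it returns a PidWrapper whose state can be
# polled through the executor.
def _example_run(eeagent_cfg):
    exe = SupDExe(eeagent_cfg)
    pw = exe.run("example-job", {"exec": "/bin/sleep", "argv": ["10"]})
    exe.poll()
    print pw.get_state()
    exe.terminate()
    return pw
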
class BrickWriterDispatcher(object):

    def __init__(self, failure_callback, num_workers=1, pidantic_dir=None, working_dir=None):
        self.guid = create_guid()
        self.prep_queue = queue.Queue()
        self.work_queue = queue.Queue()
        self._pending_work = {}
        self._stashed_work = {}
        self._active_work = {}
        self._failures = {}
        self._do_stop = False
        self._count = -1
        self._shutdown = False
        self._failure_callback = failure_callback

        self.context = zmq.Context(1)
        self.prov_sock = self.context.socket(zmq.REP)
        self.prov_port = self._get_port(self.prov_sock)
        log.info('Provisioning url: tcp://*:{0}'.format(self.prov_port))

        self.resp_sock = self.context.socket(zmq.SUB)
        self.resp_port = self._get_port(self.resp_sock)
        self.resp_sock.setsockopt(zmq.SUBSCRIBE, '')
        log.info('Response url: tcp://*:{0}'.format(self.resp_port))

        self.num_workers = num_workers if num_workers > 0 else 1
        self.is_single_worker = self.num_workers == 1
        self.working_dir = working_dir or '.'
        self.pidantic_dir = pidantic_dir or './pid_dir'
        self.workers = []

        self._configure_workers()

    def _get_port(self, socket):
        for x in xrange(PORT_RANGE[0], PORT_RANGE[1]):
            try:
                socket.bind('tcp://*:{0}'.format(x))
                return x
            except ZMQError:
                continue

    def _configure_workers(self):
        # TODO: if num_workers == 1, simply run one in-line (runs in a greenlet anyhow)
        if self.is_single_worker:
            from brick_worker import run_worker
            worker = run_worker(self.prov_port, self.resp_port)
            self.workers.append(worker)
        else:
            if os.path.exists(self.pidantic_dir):
                bdp = os.path.join(self.pidantic_dir, 'brick_dispatch')
                if os.path.exists(bdp):
                    import zipfile, zlib
                    with zipfile.ZipFile(os.path.join(bdp, 'archived_worker_logs.zip'), 'a', zipfile.ZIP_DEFLATED) as f:
                        names = f.namelist()
                        for x in [x for x in os.listdir(bdp) if x.startswith('worker_') and x not in names]:
                            fn = os.path.join(bdp, x)
                            f.write(filename=fn, arcname=x)
                            os.remove(fn)
            else:
                os.makedirs(self.pidantic_dir)

            self.factory = SupDPidanticFactory(name='brick_dispatch', directory=self.pidantic_dir)
            # Check for old workers - FOR NOW, TERMINATE THEM  TODO: These should be reusable...
            old_workers = self.factory.reload_instances()
            for x in old_workers:
                old_workers[x].cleanup()

            worker_cmd = 'bin/python coverage_model/brick_worker.py {0} {1}'.format(self.prov_port, self.resp_port)
            for x in xrange(self.num_workers):
                w = self.factory.get_pidantic(command=worker_cmd, process_name='worker_{0}'.format(x),
                                              directory=os.path.realpath(self.working_dir))
                w.start()
                self.workers.append(w)

            ready = False
            while not ready:
                self.factory.poll()
                for x in self.workers:
                    s = x.get_state()
                    if s is PIDanticState.STATE_STARTING:
                        break
                    elif s is PIDanticState.STATE_RUNNING:
                        continue
                    elif s is PIDanticState.STATE_EXITED:
                        self.shutdown()
                        raise SystemError('Error starting worker - cannot continue')
                    else:
                        raise SystemError('Problem starting worker - cannot continue')
                ready = True

    def has_pending_work(self):
        return len(self._pending_work) > 0

    def has_active_work(self):
        return len(self._active_work) > 0

    def has_stashed_work(self):
        return len(self._stashed_work) > 0

    def is_dirty(self):
        if not self.has_active_work():
            if not self.has_stashed_work():
                if not self.has_pending_work():
                    return False
        return True

    def get_dirty_values_async_result(self):
        dirty_async_res = AsyncResult()

        def dirty_check(self, res):
            while True:
                if self.is_dirty():
                    time.sleep(0.1)
                else:
                    res.set(True)
                    break

        spawn(dirty_check, self, dirty_async_res)
        return dirty_async_res

    def run(self):
        self._do_stop = False
        self._org_g = spawn(self.organize_work)
        self._prov_g = spawn(self.provisioner)
        self._rec_g = spawn(self.receiver)

    def shutdown(self, force=False, timeout=None):
        if self._shutdown:
            return
        # CBM TODO: Revisit to ensure this won't strand work or terminate workers before they complete their work...!!
        self._do_stop = True
        try:
            log.debug('Force == %s', force)
            if not force:
                log.debug('Waiting for organizer; timeout == %s', timeout)
                # Wait for the organizer to finish - ensures the prep_queue is empty
                self._org_g.join(timeout=timeout)
                log.debug('Waiting for provisioner; timeout == %s', timeout)
                # Wait for the provisioner to finish - ensures work_queue is empty
                self._prov_g.join(timeout=timeout)
                log.debug('Waiting for receiver; timeout == %s', timeout)
                # Wait for the receiver to finish - allows workers to finish their work
                self._rec_g.join(timeout=timeout)

            log.debug('Killing organizer, provisioner, and receiver greenlets')
            # Terminate the greenlets
            self._org_g.kill()
            self._prov_g.kill()
            self._rec_g.kill()
            log.debug('Greenlets killed')

            log.debug('Shutdown workers')
            # Shutdown workers - work should be completed by now...
            if self.is_single_worker:
                # Current work will be finished
                self.workers[0].stop()
            else:
                self.workers = self.factory.reload_instances()
                # CBM TODO: THIS DOES NOT ALLOW CURRENT WORK TO FINISH!!!
                for x in self.workers:
                    self.workers[x].cleanup()
                self.factory.terminate()
            log.debug('Workers shutdown')
        except:
            raise
        finally:
            log.debug('Closing provisioner and receiver sockets')
            # Close sockets
            self.prov_sock.close()
            self.resp_sock.close()
            log.debug('Sockets closed')
            log.debug('Terminating the context')
            self.context.term()
            log.debug('Context terminated')
            self._shutdown = True

    def organize_work(self):
        while True:
            if self._do_stop and self.prep_queue.empty():
                break
            try:
                # Timeout after 1 second to allow stopage and _stashed_work cleanup
                wd = self.prep_queue.get(timeout=1)
            except queue.Empty:
                # No new work added - see if there's anything on the stash to cleanup...
                for k in self._stashed_work:
                    log.debug('Cleanup _stashed_work...')
                    # Just want to trigger cleanup of the _stashed_work, pass an empty list of 'work', gets discarded
                    self.put_work(k, self._stashed_work[k][0], [])
                continue

            try:
                k, wm, w = wd
                is_list = isinstance(w, list)
                if k not in self._stashed_work and len(w) == 0:
                    log.debug('Discarding empty work')
                    continue

                log.debug('Work: %s', w)

                if k in self._active_work:
                    log.debug('Do Stash')
                    # The work_key is being worked on
                    if k not in self._stashed_work:
                        # Create the stash for this work_key
                        self._stashed_work[k] = (wm, [])

                    # Add the work to the stash
                    if is_list:
                        self._stashed_work[k][1].extend(w[:])
                    else:
                        self._stashed_work[k][1].append(w)
                else:
                    # If there is a stash for this work_key, prepend it to work
                    if k in self._stashed_work:
                        log.debug('Was a stash, prepend: %s, %s', self._stashed_work[k], w)
                        _, sv = self._stashed_work.pop(k)
                        if is_list:
                            sv.extend(w[:])
                        else:
                            sv.append(w)
                        w = sv
                        is_list = True  # Work is a list going forward!!

                    log.debug('Work: %s', w)

                    # The work_key is not yet pending
                    not_in_pend = k not in self._pending_work

                    if not_in_pend:
                        # Create the pending for this work_key
                        log.debug('-> new pointer \'%s\'', k)
                        self._pending_work[k] = (wm, [])

                    # Add the work to the pending
                    log.debug('-> adding work to \'%s\': %s', k, w)
                    if is_list:
                        self._pending_work[k][1].extend(w[:])
                    else:
                        self._pending_work[k][1].append(w)

                    if not_in_pend:
                        # Add the not-yet-pending work to the work_queue
                        self.work_queue.put(k)
            except:
                raise

    def put_work(self, work_key, work_metrics, work):
        if self._shutdown:
            raise SystemError('This BrickDispatcher has been shutdown and cannot process more work!')
        self.prep_queue.put((work_key, work_metrics, work))

    def _add_failure(self, wp):
        pwp = pack(wp)
        log.warn('Adding to _failures: %s', pwp)
        if pwp in self._failures:
            self._failures[pwp] += 1
        else:
            self._failures[pwp] = 1

        if self._failures[pwp] > WORK_FAILURE_RETRIES:
            raise ValueError('Maximum failure retries exceeded')

    def receiver(self):
        while True:
            try:
                if self.resp_sock.closed:
                    break
                if self._do_stop and len(self._active_work) == 0:
                    break

                log.debug('Receive response message (loop)')
                msg = None
                while msg is None:
                    try:
                        msg = self.resp_sock.recv(zmq.NOBLOCK)
                    except zmq.ZMQError, e:
                        if e.errno == zmq.EAGAIN:
                            if self._do_stop:
                                break
                            else:
                                time.sleep(0.1)
                        else:
                            raise

                if msg is not None:
                    resp_type, worker_guid, work_key, work = unpack(msg)
                    work = list(work) if work is not None else work
                    if resp_type == SUCCESS:
                        log.debug('Worker %s was successful', worker_guid)
                        wguid, pw = self._active_work.pop(work_key)
                        if pw in self._failures:
                            self._failures.pop(pw)
                    elif resp_type == FAILURE:
                        log.debug('Failure reported for work on %s by worker %s', work_key, worker_guid)
                        if work_key is None:
                            # Worker failed before it did anything, put all work back on the prep queue to be reorganized by the organizer
                            # Because it failed so miserably, need to find the work_key based on guid
                            for k, v in self._active_work.iteritems():
                                if v[0] == worker_guid:
                                    work_key = k
                                    break

                            if work_key is not None:
                                wguid, pw = self._active_work.pop(work_key)
                                try:
                                    self._add_failure(pw)
                                except ValueError, e:
                                    self._failure_callback(e.message, unpack(pw))
                                    continue
                                self.put_work(*unpack(pw))
                        else:
                            # Normal failure
                            # Pop the work from active work, and queue the work returned by the worker
                            wguid, pw = self._active_work.pop(work_key)
                            try:
                                self._add_failure(pw)
                            except ValueError, e:
                                self._failure_callback(e.message, unpack(pw))
                                continue
                            _, wm, wk = unpack(pw)
                            self.put_work(work_key, wm, work)

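# Usage sketch for the dispatcher above (values are hypothetical, not from the
# source). put_work() takes a work key, a work-metrics object, and a list of
# work items; run() spawns the organizer/provisioner/receiver greenlets, and
# shutdown() tears down the workers and the ZMQ sockets.
def _example_dispatch():
    def on_failure(message, work):
        log.error('Work failed permanently: %s %s', message, work)

    dispatcher = BrickWriterDispatcher(on_failure, num_workers=2,
                                       pidantic_dir='/tmp/pid_dir',  # paths assumed
                                       working_dir='.')
    dispatcher.run()
    dispatcher.put_work('brick-0', {'origin': 'example'}, [('slice-0', 'value-0')])
    dispatcher.get_dirty_values_async_result().get()  # block until the work drains
    dispatcher.shutdown()
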
class SupDExe(object):

    def __init__(self, eeagent_cfg):
        self._working_dir = eeagent_cfg.launch_type.supd_directory
        self._eename = eeagent_cfg.name
        supdexe = _set_param_or_default(eeagent_cfg.launch_type, 'supdexe', None)
        self._slots = int(eeagent_cfg.slots)
        self._factory = SupDPidanticFactory(directory=self._working_dir,
                                            name=self._eename,
                                            supdexe=supdexe)
        pidantic_instances = self._factory.reload_instances()
        self._known_pws = {}
        for name in pidantic_instances:
            pidantic = pidantic_instances[name]
            pw = PidWrapper(self, name)
            pw.set_pidantic(pidantic)
            self._known_pws[name] = pw

    def run(self, name, parameters):
        pw = PidWrapper(self, name)
        self._known_pws[name] = pw
        command = parameters['exec'] + " " + ' '.join(parameters['argv'])

        dir = self._working_dir
        if "working_directory" in parameters:
            dir = parameters["working_directory"]

        pid = self._factory.get_pidantic(command=command, process_name=name, directory=dir)
        pw.set_pidantic(pid)

        running_jobs = self._get_running()
        x = len(running_jobs)
        if x <= self._slots:
            pid.start()
        else:
            pid.cancel_request()
        return pw

    def get_known_pws(self):
        return self._known_pws

    def _remove_proc(self, proc_name):
        del self._known_pws[proc_name]

    def lookup_id(self, name):
        if name not in self._known_pws:
            return None
        return self._known_pws[name]

    def get_all(self):
        return self._known_pws

    def _get_running(self):
        running_states = [PidWrapper.RUNNING, PidWrapper.TERMINATING, PidWrapper.REQUESTING]
        a = self.get_all().values()
        running = [i for i in a if i.get_state() in running_states]
        return running

    def poll(self):
        return self._factory.poll()

    def terminate(self):
        self._factory.terminate()

class EPUHarness(object):
    """EPUHarness. Sets up Process Dispatchers and EEAgents for testing.
    """

    def __init__(self, exchange=None, pidantic_dir=None, amqp_uri=None, config=None, sysname=None):
        configs = ["epuharness"]
        config_files = get_config_paths(configs)
        if config:
            config_files.append(config)
        self.CFG = bootstrap.configure(config_files)
        self.sysname = sysname

        self.logdir = self.CFG.epuharness.logdir
        self.pidantic_dir = (pidantic_dir or
                             os.environ.get('EPUHARNESS_PERSISTENCE_DIR') or
                             self.CFG.epuharness.pidantic_dir)
        self.exchange = exchange or self.CFG.server.amqp.get('exchange', None) or str(uuid.uuid4())
        self.CFG.server.amqp.exchange = self.exchange
        self.CFG.dashi.sysname = sysname
        self.dashi = bootstrap.dashi_connect(self.CFG.dashi.topic, self.CFG,
                                             amqp_uri=amqp_uri, sysname=sysname)
        self.amqp_cfg = dict(self.CFG.server.amqp)

        self.factory = None
        self.savelogs_dir = None

    def _setup_factory(self):
        if self.factory:
            return

        try:
            self.factory = SupDPidanticFactory(directory=self.pidantic_dir,
                                               name="epu-harness")
        except Exception:
            log.debug("Problem Connecting to SupervisorD", exc_info=True)
            raise HarnessException(
                "Could not connect to supervisord. Was epu-harness started?")

    def status(self, exit=True):
        """Get status of services that were previously started by epuharness
        """
        self._setup_factory()
        instances = self.factory.reload_instances()
        self.factory.poll()
        return_code = 0
        status = []
        for name, instance in instances.iteritems():
            state = instance.get_state()
            status.append((name, state))
            if state != PIDanticState.STATE_RUNNING:
                return_code = 1
            log.info("%s is %s" % (name, instance.get_state()))
        if exit:
            sys.exit(return_code)
        else:
            return status

    def get_logfiles(self):
        """Returns a list of logfile paths relevant to epuharness instance
        """
        # pretty hacky. we could get these over the supd API instead.
        # but that's certainly slower and not really better.
        pidantic_dir = os.path.abspath(self.pidantic_dir)
        epuharness_dir = os.path.join(pidantic_dir, "epu-harness")
        logfiles = []
        for f in os.listdir(epuharness_dir):
            if os.path.splitext(f)[1].lower() == ".log":
                logfiles.append(os.path.join(epuharness_dir, f))
        return logfiles

    def stop(self, services=None, force=False, remove_dir=True):
        """Stop services that were previously started by epuharness

        @param force: When False raises an exception when there is something
                      that can't be killed.
        """
        cleanup = False
        self._setup_factory()
        instances = self.factory.reload_instances()

        # If we're killing everything, perform cleanup
        if services == instances.keys():
            cleanup = True
        elif not services:
            cleanup = True
            services = instances.keys()

        log.info("Stopping %s" % ", ".join(services))
        for service in services:
            instances_to_kill = filter(lambda x: x.startswith(service), instances.keys())
            for instance_name in instances_to_kill:
                instance = instances[instance_name]
                self._clean_instance_config(instance)
                if not cleanup:
                    instance.cleanup()

        if cleanup:
            if self.savelogs_dir:
                try:
                    self._save_logs(self.savelogs_dir)
                except Exception:
                    log.exception("Problem saving logs. Proceeding.")
            try:
                self.factory.terminate()
            except Exception as e:
                log.warning("Problem terminating factory, continuing : %s" % e)
            if remove_dir:
                careful_rmtree(self.pidantic_dir)

        self.dashi.cancel()
        self.dashi.disconnect()

    def _save_logs(self, output_dir):
        for logfile in self.get_logfiles():
            basename = os.path.basename(logfile)
            dest_path = os.path.join(output_dir, basename)
            try:
                shutil.copy2(logfile, dest_path)
            except Exception:
                log.exception("Error copying logfile %s", logfile)

    def _clean_instance_config(self, instance):
        try:
            # Clean up config files
            command = instance._program_object.command
            command = command.split()
            for config in command:
                if config.endswith('.yml'):
                    _cf = yaml.load(config)
                    with open(_cf) as cf:
                        cfg = yaml.load(cf)
                        try:
                            persistence = cfg['apps'][0]['config']['eeagent']['launch_type']['persistence_directory']
                            careful_rmtree(persistence)
                        except Exception:
                            pass
                    os.remove(config)
        except Exception, e:
            # Perhaps instance internals have changed
            log.warning("Couldn't delete temporary config files: %s" % e)

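# Usage sketch for the harness above (arguments are hypothetical, not from the
# source). status(exit=False) returns a list of (name, state) tuples; stop()
# tears down everything the harness previously started.
def _example_harness_check():
    harness = EPUHarness(pidantic_dir="/tmp/epu-pidantic")  # directory assumed
    for name, state in harness.status(exit=False):
        print "%s -> %s" % (name, state)
    harness.stop()
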