class SandboxTaskStateAwaiter(object):
    DEFAULT_UPDATE_INTERVAL = 1.0 # FIXME

    def __init__(self, sandbox, update_interval=DEFAULT_UPDATE_INTERVAL):
        self._sandbox = sandbox
        self._should_stop = False
        self._lock = threading.Lock()
        self._something_happend = threading.Condition(self._lock)
        self._worker_thread = None
        self._update_interval = update_interval
        self._running = {}
        self._incoming = set()

    def start(self):
        self._worker_thread = ProfiledThread(target=self._loop, name_prefix='SbxStateMon')
        self._worker_thread.start()

    def stop(self):
        with self._lock:
            self._should_stop = True
            self._something_happend.notify()

        self._worker_thread.join()

    def await(self, task_id):
        with self._lock:
            was_empty = not self._incoming

            assert task_id not in self._incoming and task_id not in self._running

            self._incoming.add(task_id)

            if was_empty:
                self._something_happend.notify()
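A minimal driver sketch for the class above (not from the original source; `sandbox_client` and the task id are hypothetical stand-ins, and the worker body `_loop` lives elsewhere in the module). Note that `await` is an ordinary method name here: valid in the Python 2 this code targets, but a reserved word in Python 3.7+.

    awaiter = SandboxTaskStateAwaiter(sandbox_client, update_interval=5.0)
    awaiter.start()        # spawns the 'SbxStateMon' worker thread
    awaiter.await(12345)   # track a task; asserts the id is not already known
    # ... the worker (not shown) polls Sandbox for the registered tasks ...
    awaiter.stop()         # raise the stop flag, wake the worker, join it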
class Packet(object):
    _MAX_TIME_WAIT = 60.0 # FIXME

    def __init__(self, pck_id, graph):
        self.id = pck_id
        self.name = '_TODO_packet_name_for_%s' % pck_id # TODO
        self.history = []

        self._init_non_persistent()

        self.state = None
        #self._update_state_if_need()

        self._graph_executor = rem.job_graph.JobGraphExecutor(
            _ExecutorOps(self),
            self.id,
            graph,
        )

        # TODO Better
        with self._lock:
            self._graph_executor.init()

    def _get_vaults_for(self, job_id):
        vaults = self._vaults_setup
        if not vaults:
            return None

        env = copy.copy(vaults['global']) or {}
        env.update(vaults['jobs'].get(job_id, {}))

        logging.debug('Vaults for %d: %s' % (job_id, env))

        return env

    def _mark_as_finished_if_need(self):
        graph = self._graph_executor

        self._finished = \
            graph.state in [GraphState.SUCCESSFULL, GraphState.ERROR] \
            or graph.state == GraphState.TIME_WAIT \
                and graph.get_nearest_retry_deadline() - time.time() > self._MAX_TIME_WAIT \
            or (self._do_not_run or self._cancelled) and graph.is_null()

    def _update_state(self):
        new_state = self._calc_state()
        if new_state == self.state:
            return

        self.state = new_state
        self.history.append((new_state, time.time()))
        logging.info("new state %s" % new_state)

    def _init_non_persistent(self):
        self._lock = threading.RLock()
        self._something_changed = threading.Condition(self._lock)
        self._main_thread = None
        self._job_threads = []
        self._proc_runner = None
        self._do_not_run = False
        self._finished = False
        self._cancelled = False
        self._has_updates = False

    def vivify_jobs_waiting_stoppers(self):
        with self._lock:
            self._graph_executor.vivify_jobs_waiting_stoppers()

    def __getstate__(self):
        sdict = self.__dict__.copy()

        sdict.pop('_lock', None)
        sdict.pop('_something_changed', None)
        sdict.pop('_working_directory', None)
        sdict.pop('_io_directory', None)
        sdict.pop('_main_thread', None)
        sdict.pop('_proc_runner', None)
        sdict.pop('_job_threads', None)
        sdict.pop('_on_update', None)
        sdict.pop('_do_not_run', None)
        sdict.pop('_finished', None)
        sdict.pop('_cancelled', None)
        sdict.pop('_has_updates', None)

        return sdict

    def __setstate__(self, sdict):
        self.__dict__.update(sdict)
        self._init_non_persistent()

    def start(self, working_directory, io_directory, on_update,
              reset_tries=False, vaults_setup=None):
        self._on_update = on_update
        self._working_directory = working_directory
        self._io_directory = io_directory
        #self._init_non_persistent()
        self._proc_runner = rem.job.create_job_runner(None, None)

        if vaults_setup:
            def dictify(pairs):
                if pairs is None:
                    return None
                return dict(pairs)

            vaults_setup = {
                'global': dictify(vaults_setup['global']),
                'jobs': {
                    int(job_id): dictify(setup)
                    for job_id, setup in vaults_setup['jobs'].items()
                }
            }

        self._vaults_setup = vaults_setup

        with self._lock:
            if reset_tries:
                self._graph_executor.reset_tries()

            #self.resume()
            #self._do_not_run = False

        self._main_thread = ProfiledThread(target=self._main_loop, name_prefix='PckLoop')
        self._main_thread.start()

    def join(self):
        self._main_thread.join()

    def get_working_directory(self):
        return self._working_directory

    def get_io_directory(self):
        return self._io_directory

    def _start_one_another_job(self):
        logging.debug('+ Packet._start_one_another_job')
        job_runner = self._graph_executor.get_job_to_run()
        t = ProfiledThread(target=job_runner.run, name_prefix='Job')
        self._job_threads.append(t)
        t.start()

    def stop(self, kill_jobs):
        with self._lock:
            #if self._do_not_run: # XXX May be called with different kill_jobs
                #return

            if self._finished: # FIXME
                #raise RuntimeError("Already finished")
                return

            if self._cancelled:
                raise RuntimeError("Already cancelled")

            self._do_not_run = True

            if kill_jobs:
                self._graph_executor.cancel()

            self._mark_as_finished_if_need()

            # For those who changed their minds after a call to stop(kill_jobs=False)
            self._something_changed.notify()

    def resume(self):
        with self._lock:
            if self._finished: # FIXME
                raise RuntimeError("Already finished")

            if self._cancelled:
                raise RuntimeError("Already cancelled")

            if self._do_not_run:
                self._do_not_run = False
                self._graph_executor.reset_tries()
                self._mark_as_finished_if_need()
                self._something_changed.notify()

    def cancel(self):
        with self._lock:
            if self._finished:
                raise RuntimeError("Already finished")

            self._cancelled = True
            self._graph_executor.cancel()
            self._mark_as_finished_if_need()
            self._something_changed.notify()

    def is_cancelled(self):
        return self._cancelled

    def restart(self):
        with self._lock:
            if self._finished:
                raise RuntimeError("Already finished")

            if self._cancelled:
                raise RuntimeError("Already cancelled")

            self._do_not_run = False # was any
            self._graph_executor.reset()
            self._something_changed.notify()

    def produce_rem_update_message(self):
        graph = self._graph_executor

        state = {
            #'history': list(self.history), # TODO FIXME
            'state': self.state,
            'detailed_status': graph.produce_detailed_status(),
            'succeed_jobs': map(str, graph.get_succeeded_jobs()),
        }

        if graph.state == GraphState.TIME_WAIT:
            state['nearest_retry_deadline'] = graph.get_nearest_retry_deadline()

        return state

    def _send_update(self):
        self._on_update(self.produce_rem_update_message())

    def _main_loop(self):
        logging.debug('+ Packet.run')

        while True:
            with self._lock:
                logging.debug('_before_job_start_loop')

                if not (self._do_not_run or self._cancelled):
                    logging.debug('_graph_executor.state == %s' \
                        % GraphState.str(self._graph_executor.state))
                    while self._graph_executor.state & GraphState.PENDING_JOBS:
                        self._start_one_another_job()

                logging.debug('_before_send_update_check: %s' \
                    % ((self._has_updates, self._finished),))
                if self._has_updates and not self._finished:
                    logging.debug('_before_send_update')
                    self._send_update()
                    self._has_updates = False

                if self._finished:
                    break

                logging.debug('_before_cond_wait')
                self._something_changed.wait()
                logging.debug('_after_cond_wait')

        logging.debug('+ exiting Packet.run')

    def _calc_state(self):
        graph = self._graph_executor

        return graph.state # FIXME

        # NOTE: unreachable until the FIXME above is resolved
        if graph.is_null():
            if self._do_not_run:
                return GraphState.SUSPENDED
            elif self._cancelled:
                return GraphState.CANCELLED

        return graph.state

    def _stop_waiting(self, stop_id):
        with self._lock:
            if self._cancelled or self._finished: # FIXME _do_not_run
                return

            self._graph_executor.stop_waiting(stop_id)

    # OPS for rem.job.Job

    def start_process(self, *args, **kwargs):
        return self._proc_runner(*args, **kwargs)

    def notify_long_execution(self, job):
        raise NotImplementedError()

    def _create_job_file_handles(self, job):
        return self._graph_executor.create_job_file_handles(job)

    def on_job_done(self, job_runner):
        self._graph_executor.on_job_done(job_runner)

        with self._lock:
            self._graph_executor.apply_jobs_results()

    def create_file_handles(self, job):
        return self._graph_executor.create_job_file_handles(job)
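A hypothetical lifecycle sketch for Packet (not from the original source; the graph value, the directories, and the callback are stand-ins, and building a real rem.job_graph graph is out of scope here):

    def on_update(msg):
        # receives the dicts built by produce_rem_update_message()
        print msg['state'], msg['detailed_status']

    pck = Packet('pck-0', graph)                  # graph: a prepared job graph
    pck.start('/tmp/pck/work', '/tmp/pck/io', on_update)
    pck.stop(kill_jobs=False)                     # suspend; running jobs may finish
    pck.resume()                                  # clears _do_not_run, resets tries
    pck.join()                                    # wait for the 'PckLoop' thread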
class ConnectionManager(Unpickable(topologyInfo=TopologyInfo,
                                   lock=PickableLock,
                                   alive=(bool, False),
                                   tags_file=str),
                        ICallbackAcceptor):

    def InitXMLRPCServer(self):
        self.rpcserver = SimpleXMLRPCServer(("", self.port), allow_none=True)
        self.rpcserver.register_function(self.set_client_version, "set_client_version")
        self.rpcserver.register_function(self.get_client_version, "get_client_version")
        self.rpcserver.register_function(self.set_tags, "set_tags")
        self.rpcserver.register_function(self.register_tags_events, "register_tags_events")
        self.rpcserver.register_function(self.list_clients, "list_clients")
        self.rpcserver.register_function(self.list_tags, "list_tags")
        self.rpcserver.register_function(self.suspend_client, "suspend_client")
        self.rpcserver.register_function(self.resume_client, "resume_client")
        self.rpcserver.register_function(self.reload_config, "reload_config")
        self.rpcserver.register_function(self.register_share, "register_share")
        self.rpcserver.register_function(self.unregister_share, "unregister_share")
        self.rpcserver.register_function(self.get_client_info, "get_client_info")
        self.rpcserver.register_function(self.list_shares, "list_shares")
        self.rpcserver.register_function(self.list_shared_events, "list_shared_events")
        self.rpcserver.register_function(self.list_subscriptions, "list_subscriptions")
        self.rpcserver.register_function(self.check_connection, "check_connection")
        self.rpcserver.register_function(self.ping, "ping")

    def UpdateContext(self, context):
        self.scheduler = context.Scheduler
        self.network_name = context.network_name
        self.tags_file = context.remote_tags_db_file
        self.port = context.system_port
        if self.tags_file:
            self.acceptors = MapSetDB(self.tags_file)
        self.topologyInfo.UpdateContext(context)
        self.max_remotetags_resend_delay = context.max_remotetags_resend_delay

    def Start(self):
        if not self.network_name or not self.tags_file or not self.port:
            logging.warning("ConnectionManager couldn't start: wrong configuration. "
                            "network_name: %s, remote_tags_db_file: %s, system_port: %r",
                            self.network_name, self.tags_file, self.port)
            return
        self.ReloadConfig()
        logging.debug("after_reload_config")
        for client in self.topologyInfo.servers.values():
            if client.active and client.name != self.network_name:
                client.TryInitializePeersVersions(self.network_name)
        logging.debug("after_clients_versions_init")
        self.alive = True
        self.InitXMLRPCServer()
        self._accept_loop_thread = ProfiledThread(target=self.ServerLoop, name_prefix='ConnManager')
        self._accept_loop_thread.start()
        logging.debug("after_connection_manager_loop_start")
        for client in self.topologyInfo.servers.values():
            self.scheduler.ScheduleTaskT(0, self.SendData, client, skip_logging=True)

    def Stop(self):
        self.alive = False
        self._accept_loop_thread.join()
        self.rpcserver = None # shutdown listening socket

    def ServerLoop(self):
        rpc_fd = self.rpcserver.fileno()
        while self.alive:
            rout, _, _ = select.select((rpc_fd,), (), (), 0.01)
            if rpc_fd in rout:
                self.rpcserver.handle_request()

    def SendData(self, client):
        if self.alive and client.active:
            client.SendDataIfNeed(self.network_name)
        if hasattr(self, "scheduler"):
            self.scheduler.ScheduleTaskT(
                min(client.PENALTY_FACTOR ** client.errorsCnt, self.max_remotetags_resend_delay),
                self.SendData,
                client,
                skip_logging=True
            )

    def RegisterTagEvent(self, tag, event, message=None):
        if not isinstance(tag, TagBase):
            raise RuntimeError("%s is not a Tag class instance" % tag.GetName())
        if tag.IsRemote():
            return
        tagname = tag.GetName()
        with self.lock: # see register_share
            acceptors = self.acceptors.get(tagname)
        if acceptors:
            logging.debug("on %s connmanager %s with acceptors list %s",
                          TagEventName[event], tagname, acceptors)
            for clientname in acceptors:
                self.RegisterTagEventForClient(clientname, tagname, event, message)

    def RegisterTagEventForClient(self, clientname, tagname, event, message=None):
        logging.debug("%s remote tag %s on host %s", TagEventName[event], tagname, clientname)
        client = self.topologyInfo.GetClient(clientname, checkname=False)
        if client is None:
            logging.error("unknown client %s appeared", clientname)
            return False
        client.RegisterTagEvent("%s:%s" % (self.network_name, tagname), event, message)

    def ReloadConfig(self, filename=None):
        old_servers = set(self.topologyInfo.servers.keys())
        self.topologyInfo.ReloadConfig()
        new_servers = set(self.topologyInfo.servers.keys())
        new_servers -= old_servers
        if self.alive:
            for client in new_servers:
                self.scheduler.ScheduleTaskT(0, self.SendData,
                                             self.topologyInfo.servers[client],
                                             skip_logging=True)

    def Subscribe(self, tag):
        if tag.IsRemote():
            client = self.topologyInfo.GetClient(tag.GetRemoteHost(), checkname=True)
            client.Subscribe(tag.GetName())
            return True
        return False

    @traced_rpc_method()
    def set_tags(self, tags): # obsolete
        logging.debug("set %d remote tags", len(tags))
        for tagname in tags:
            self.scheduler.tagRef.AcquireTag(tagname).CheckRemote().Set()
        return True

    @traced_rpc_method()
    def set_client_version(self, clientname, version):
        self.topologyInfo.GetClient(clientname, checkname=True).SetVersion(int(version))
        logging.debug("set client version for %s to %s", clientname, version)
        return True

    @traced_rpc_method()
    def get_client_version(self):
        return PROTOCOL_VERSION

    @traced_rpc_method()
    def register_tags_events(self, updates):
        tagRef = self.scheduler.tagRef
        logging.debug("register_tags_events %d: %s", len(updates), updates)
        for update in updates:
            tagRef.AcquireTag(update[0]).CheckRemote().Modify(*update[1:])
            logging.debug("done with: %s", update)
        logging.debug("register_tags_events %d: done", len(updates))
        return True

    @traced_rpc_method()
    def list_clients(self):
        return [{"name": client.name,
                 "url": client.url,
                 "systemUrl": client.systemUrl,
                 "active": client.active,
                 "version": client.version,
                 "errorsCount": client.errorsCnt,
                 "tagsCount": len(client.events),
                 "subscriptionsCount": len(client.subscriptions),
                 "lastError": str(client.lastError)}
                for client in self.topologyInfo.servers.values()]

    @traced_rpc_method()
    def list_tags(self, name_prefix):
        data = set()
        for server in self.topologyInfo.servers.values():
            if name_prefix is None or server.name.startswith(name_prefix):
                data.update(server.GetEventsAsTagsToSet())
        return list(data)

    @traced_rpc_method()
    def suspend_client(self, name):
        client = self.topologyInfo.GetClient(name)
        return client.Suspend()

    @traced_rpc_method()
    def resume_client(self, name):
        client = self.topologyInfo.GetClient(name)
        return client.Resume()

    @traced_rpc_method()
    def reload_config(self, location=None):
        self.ReloadConfig(location)

    @traced_rpc_method()
    def register_share(self, tags, clientname):
        tagRef = self.scheduler.tagRef
        logging.debug("register_share %d tags for %s: %s", len(tags), clientname, tags)
        for tagname in tags:
            # XXX
            # 1. this lock only guarantees eventual consistency of the tag's history
            # 2. clients of self may see duplicates of events (even Reset)
            # 3. also guards self.acceptors
            with self.lock:
                self.acceptors.add(tagname, clientname)
                if tagRef._RawTag(tagname).IsLocallySet():
                    self.RegisterTagEventForClient(clientname, tagname, ETagEvent.Set)
        logging.debug("register_share %d tags for %s: done", len(tags), clientname)

    @traced_rpc_method()
    def unregister_share(self, tagname, clientname):
        with self.lock:
            return self.acceptors.remove(tagname, clientname)

    @traced_rpc_method()
    def get_client_info(self, clientname):
        client = self.topologyInfo.GetClient(clientname)
        res = {"name": client.name,
               "url": client.url,
               "systemUrl": client.systemUrl,
               "active": client.active,
               "version": client.version,
               "errorsCount": client.errorsCnt,
               "deferedTagsCount": len(client.events),
               "subscriptionsCount": len(client.subscriptions),
               "lastError": str(client.lastError)}
        return res

    @traced_rpc_method()
    def list_shares(self, clientname):
        client = self.topologyInfo.GetClient(clientname)
        return _get_tags_to_set(client.GetEventsAsList())

    @traced_rpc_method()
    def list_shared_events(self, clientname):
        client = self.topologyInfo.GetClient(clientname)
        return client.GetEventsAsList()

    @traced_rpc_method()
    def list_subscriptions(self, clientname):
        client = self.topologyInfo.GetClient(clientname)
        return list(client.subscriptions)

    @traced_rpc_method()
    def check_connection(self, clientname):
        client = self.topologyInfo.GetClient(clientname)
        return client.connection.ping()

    @traced_rpc_method()
    def ping(self):
        return True

    def __getstate__(self):
        sdict = self.__dict__.copy()
        sdict.pop("scheduler", None)
        sdict.pop("rpcserver", None)
        sdict.pop("acceptors", None)
        sdict.pop("_accept_loop_thread", None)
        sdict["alive"] = False
        return getattr(super(ConnectionManager, self), "__getstate__", lambda: sdict)()
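All of the @traced_rpc_method endpoints above are registered on a plain SimpleXMLRPCServer, so a peer can be queried with the standard library client. A minimal sketch, assuming a ConnectionManager is listening on localhost and a made-up system port:

    import xmlrpclib  # Python 2 standard library XML-RPC client

    proxy = xmlrpclib.ServerProxy('http://localhost:8104', allow_none=True)
    print proxy.ping()                  # -> True
    print proxy.get_client_version()    # -> PROTOCOL_VERSION
    for info in proxy.list_clients():   # one dict per peer in the topology
        print info['name'], info['active'], info['errorsCount']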
class RemNotifier(object):
    _RETRY_DELAY = 10.0

    class RetriableError(RuntimeError):
        pass

    def __init__(self, send_update):
        self._send_update = send_update
        self._pending_update = None
        self._pck_finished = False
        self._should_stop_max_time = None

        self._lock = threading.Lock()
        self._changed = threading.Condition(self._lock)

        self._worker_thread = ProfiledThread(target=self._the_loop, name_prefix='RemNotifier')
        self._worker_thread.daemon = True # FIXME See failed[0]
        self._worker_thread.start()

    def stop(self, timeout=0):
        with self._lock:
            if self._should_stop_max_time:
                raise RuntimeError()

            self._should_stop_max_time = time.time() + timeout
            self._changed.notify()

        self._worker_thread.join()

    def send_update(self, update, is_final=False):
        with self._lock:
            self._pending_update = (update, is_final)
            self._changed.notify()

    def _the_loop(self):
        next_try_min_time = 0

        while True:
            with self._lock:
                while True:
                    now = time.time()

                    if self._should_stop_max_time:
                        if now > self._should_stop_max_time \
                                or next_try_min_time > self._should_stop_max_time:
                            return

                    if self._pending_update:
                        deadline = next_try_min_time
                        if now > deadline:
                            break
                    else:
                        deadline = None

                    self._changed.wait(deadline - now if deadline is not None else None)

                update, is_final = self._pending_update
                self._pending_update = None

            logging.debug('sending_update: %s' % ((update, is_final),))

            try:
                self._send_update(update, is_final)
            except self.RetriableError:
                logging.exception('Failed to send update')

                with self._lock:
                    if not self._pending_update:
                        self._pending_update = (update, is_final)

                next_try_min_time = time.time() + self._RETRY_DELAY
            else:
                if is_final:
                    return
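A wiring sketch for RemNotifier (not from the original source; the transport below is a stand-in, and only RetriableError, send_update, and stop come from the class contract). Raising RetriableError makes the worker re-queue the update and retry after _RETRY_DELAY seconds; any other exception escapes _the_loop and kills the daemon worker thread:

    def push_to_rem(update, is_final):
        try:
            rem_client.send(update, is_final)  # hypothetical transport call
        except IOError as e:
            raise RemNotifier.RetriableError(str(e))

    notifier = RemNotifier(push_to_rem)
    notifier.send_update({'state': 'WORKING'})
    notifier.send_update({'state': 'SUCCESSFULL'}, is_final=True)  # worker exits once delivered
    notifier.stop(timeout=30)  # let a pending send retry for up to 30s, then join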