def test_send_many():

    # Sends 200 transactions of five consecutive numbers each through jms_1,
    # collects every number handed to process_request and checks that all
    # 1000 arrived and that each group of five stayed together.

    echo_queue = InterlockedQueue()

    def process_request(request, response):
        echo_queue.push(int(request["message_text"]))

    with active_interface("jms_1", **interface_config(process_request = process_request)):

        for batch in range(200):
            fake_request(30.0)
            xa = pmnc.transaction.create()
            for offset in range(5): # five sends per transaction
                xa.jms_1.send(str(batch * 5 + offset))
            xa.execute()

        # keep popping until the queue stays empty for 10 seconds

        received = []
        while True:
            number = echo_queue.pop(10.0)
            if number is None:
                break
            received.append(number)

    assert len(received) == 1000

    # every 5 should have been sent atomically, therefore only the order
    # within each consecutive group of five is normalized before comparing

    received_sorted = [ number
                        for batch in range(200)
                        for number in sorted(received[batch * 5:(batch + 1) * 5]) ]

    assert received_sorted == list(range(1000))
def test_interface_ordering():

    # Writes 100 numbered files in shuffled order before the interface starts
    # and checks that they are delivered in ascending number order; after
    # every tenth delivery one more low-numbered file is written, and those
    # ten files are expected afterwards, again in ascending order.

    def msg_name(number):
        return "{0:08d}.msg".format(number)

    initial_names = list(map(msg_name, range(100)))
    shuffle(initial_names)
    for name in initial_names:
        write_file(name, b"data")

    loopback_queue = InterlockedQueue()

    def process_request(request, response):
        sleep(0.1)
        loopback_queue.push(request)

    def assert_next_is(number): # pops one request and checks which file it is for
        file_name = loopback_queue.pop(3.0)["file_name"]
        assert os_path.basename(file_name).endswith(msg_name(number))

    with active_interface("file_1", **interface_config(process_request = process_request,
                          filename_regex = write_prefix + "[0-9a-f]{8}\\.msg")):

        for i in range(100):
            assert_next_is(i)
            if i % 10 == 9:
                write_file(msg_name(i // 10), b"data")

        for i in range(10):
            assert_next_is(i)

        assert loopback_queue.pop(3.0) is None
def test_process_timeout():

    # While the "stall" event is set the handler sleeps past the request
    # deadline, so nothing may show up in the queue within 3 seconds; once the
    # event is cleared the same message is expected to arrive.

    delivered = InterlockedQueue()

    stall = Event()
    stall.set()

    def process_request(request, response):
        if stall.is_set():
            sleep(pmnc.request.remain + 1.0) # outlive the request
        delivered.push(request)

    with active_interface("jms_1", **interface_config(process_request = process_request,
                          request_timeout = 3.0)):

        fake_request(10.0)

        xa = pmnc.transaction.create()
        xa.jms_1.send("<xml/>")
        results = xa.execute()
        message_id = results[0]

        nothing = delivered.pop(3.0)
        assert nothing is None

        stall.clear()

        request = delivered.pop(10.0)

    assert request["message_id"] == message_id
    assert request["message_text"] == "<xml/>"
def test_process_failure():

    # While the "fail" event is set the handler raises after a one second
    # pause, so nothing may be queued; after the event is cleared the same
    # message is expected to be processed successfully.

    delivered = InterlockedQueue()

    fail = Event()
    fail.set()

    def process_request(request, response):
        if not fail.is_set():
            delivered.push(request)
            return
        sleep(1.0)
        raise Exception("processing failure")

    with active_interface("jms_1", **interface_config(process_request = process_request)):

        fake_request(10.0)

        xa = pmnc.transaction.create()
        xa.jms_1.send("<xml/>")
        results = xa.execute()
        message_id = results[0]

        nothing = delivered.pop(3.0)
        assert nothing is None

        fail.clear()

        request = delivered.pop(10.0)

    assert request["message_id"] == message_id
    assert request["message_text"] == "<xml/>"

    # no correlation id was supplied when sending

    headers = request["headers"]
    assert not headers.get("JMSCorrelationID")
class FakeRpcInterface:

    # Test double which serves a scripted sequence of cage maps, one map per
    # call to get_cages, and records the events passed to process_event.

    @typecheck
    def __init__(self, cages_list):
        self._cages_list = cages_list
        self._queue = InterlockedQueue()
        self._pass = 0
        self._stopped = Event()

    # returns the names of the cages of the current pass and advances to the
    # next pass, once the script is exhausted no cages are reported and the
    # _stopped event is set

    def get_cages(self):
        try:
            self._cages = self._cages_list[self._pass]
        except IndexError:
            self._cages = {}
            self._stopped.set()
        self._pass += 1
        return list(self._cages.keys())

    # returns whatever the current pass has stored for the cage

    def get_nodes(self, cage):
        return self._cages[cage]

    # records a single event for later extraction

    def process_event(self, node, cage, up_down, probe_result):
        self._queue.push((node, cage, up_down, probe_result))

    # drains the recorded events until the queue stays empty for one second,
    # returns { "node.cage": [ probe_result for "up" or None otherwise, ... ] }

    def extract_events(self):
        collected = {}
        while True:
            event = self._queue.pop(1.0)
            if event is None:
                return collected
            node, cage, up_down, probe_result = event
            key = "{0:s}.{1:s}".format(node, cage)
            outcome = probe_result if up_down == "up" else None
            collected.setdefault(key, []).append(outcome)
# Initializes the interface state: ping timers, the in/out queues, in-flight
# request tracking and a factory producing _SMPPConnection instances bound
# to the given server and ESME parameters.
#
# name               - stored as self._name and passed to each connection
# response_timeout   - stored and also used for the ping response timeout
# ping_interval      - if falsy, both ping timeouts are set to None
# esme_type          - selects which bind PDU class the connections use
# request_timeout    - if falsy, taken from pmnc.config_interfaces
# remaining keywords - forwarded unchanged to _SMPPConnection

def __init__(self, name: str, *,
             server_address: (str, int),
             connect_timeout: float,
             response_timeout: float,
             ping_interval: optional(float),
             system_id: str,
             password: str,
             system_type: str,
             esme_ton: byte,
             esme_npi: byte,
             esme_addr: str,
             esme_type: one_of("rcvr", "xmit", "xcvr"),
             request_timeout: optional(float) = None,
             **kwargs): # this kwargs allows for extra application-specific
                        # settings in config_interface_smpp_X.py

    self._name = name
    self._response_timeout = response_timeout

    # pinging is enabled only with a truthy ping_interval

    if ping_interval:
        self._ping_timeout = Timeout(ping_interval)
        self._ping_response_timeout = Timeout(response_timeout)
    else:
        self._ping_timeout = self._ping_response_timeout = None
    self._ping_request = None

    self._in_q = InterlockedQueue()
    self._out_q = InterlockedQueue()
    self._inflight = InflightRequests()
    self._ceased = Event()

    # NOTE(review): bind_pdu stays unbound for any other esme_type value,
    # presumably the one_of annotation is enforced by a decorator - confirm

    if esme_type == "rcvr":
        bind_pdu = BindReceiverPDU
    elif esme_type == "xmit":
        bind_pdu = BindTransmitterPDU
    elif esme_type == "xcvr":
        bind_pdu = BindTransceiverPDU

    # the connection is not created here, only a factory capturing
    # the constructor arguments is stored

    self._create_connection = \
        lambda: _SMPPConnection(name, self._in_q, self._out_q, self._inflight,
                                server_address = server_address,
                                connect_timeout = connect_timeout,
                                response_timeout = response_timeout,
                                system_id = system_id,
                                password = password,
                                system_type = system_type,
                                esme_ton = esme_ton,
                                esme_npi = esme_npi,
                                esme_addr = esme_addr,
                                bind_pdu = bind_pdu)

    self._request_timeout = request_timeout or \
                            pmnc.config_interfaces.get("request_timeout") # this is now static

    # under self-test the request handler is taken from the keyword arguments

    if pmnc.request.self_test == __name__: # self-test
        self._process_request = kwargs["process_request"]
def drain_queue():

    # Runs the jms_1 interface with a handler that only collects requests and
    # keeps popping them until nothing has arrived for 10 seconds.

    sink = InterlockedQueue()

    def process_request(request, response):
        sink.push(request)

    with active_interface("jms_1", **interface_config(process_request = process_request)):
        while True:
            leftover = sink.pop(10.0)
            if leftover is None:
                break
def test_post():

    # posting to a request id with no registered response queue must fail

    with expected(Exception("the request is no longer pending for response")):
        pmnc.__getattr__(__name__).post("RQ-ABC", "RESULT")

    # with a response queue registered, the posted result lands in that queue

    pending = InterlockedQueue()
    _rs_queues["RQ-ABC"] = pending

    pmnc.__getattr__(__name__).post("RQ-ABC", "RESULT")

    posted = pending.pop()
    assert posted == "RESULT"
def pop(self, queue: InterlockedQueue): # respects wall-time timeout, see issue9892

    # Pops an item from the queue, waiting no longer than this object's
    # remaining time; returns None if the time runs out first.

    if self.infinite:
        return queue.pop()

    # there is no special handling for case remain == 0.0, because that
    # would mean request deadline hence performance is no longer an issue

    time_left = self.remain
    while time_left > 0.0:
        item = queue.pop(time_left)
        if item is not None:
            return item
        time_left = self.remain # the pop may have returned early, wait again

    return None
def test_interface_failure():

    # The handler fails on every invocation by referencing an undefined name
    # and never pushes anything, so the queue must stay empty.

    results = InterlockedQueue()

    def process_request(request, response):
        not_defined

    with active_interface(
            "schedule_1",
            **interface_config(process_request=process_request)):
        popped = results.pop(4.0)
        assert popped is None
def test_post():

    # posting to a request id with no registered response queue must fail

    with expected(
            Exception("the request is no longer pending for response")):
        pmnc.__getattr__(__name__).post("RQ-ABC", "RESULT")

    # with a response queue registered, the posted result lands in that queue

    pending = InterlockedQueue()
    _rs_queues["RQ-ABC"] = pending

    pmnc.__getattr__(__name__).post("RQ-ABC", "RESULT")

    posted = pending.pop()
    assert posted == "RESULT"
def pop(self, queue: InterlockedQueue
        ):  # respects wall-time timeout, see issue9892

    # Pops an item from the queue, waiting no longer than this object's
    # remaining time; returns None if the time runs out first.

    if self.infinite:
        return queue.pop()

    # there is no special handling for case remain == 0.0, because that
    # would mean request deadline hence performance is no longer an issue

    time_left = self.remain
    while time_left > 0.0:
        item = queue.pop(time_left)
        if item is not None:
            return item
        time_left = self.remain  # the pop may have returned early, wait again

    return None
def test_too_large():

    # a 60000 byte packet is sent and is expected never to reach the handler

    fake_request(10.0)

    packets = InterlockedQueue()

    def process_request(request, response):
        packets.push(request["packet"])

    oversized = b"x" * 60000

    with active_interface("udp", **interface_config(process_request = process_request)) as ifc:
        pmnc.transaction.udp_1.send(oversized)
        received = packets.pop(3.0)
        assert received is None
def test_success():

    # a small packet sent through udp_1 must reach the handler unchanged

    fake_request(10.0)

    packets = InterlockedQueue()

    def process_request(request, response):
        packets.push(request["packet"])

    with active_interface("udp", **interface_config(process_request = process_request)) as ifc:
        payload = b"foo"
        pmnc.transaction.udp_1.send(payload)
        received = packets.pop(3.0)
        assert received == payload
def test_timeout():

    # the handler only sleeps for 5 seconds and never pushes anything,
    # hence nothing can be popped within 3 seconds

    fake_request(10.0)

    packets = InterlockedQueue()

    def process_request(request, response):
        sleep(5.0)

    with active_interface("udp", **interface_config(process_request = process_request)) as ifc:
        payload = b"foo"
        pmnc.transaction.udp_1.send(payload)
        received = packets.pop(3.0)
        assert received is None
def __init__(self, **kwargs):

    # the rpc interface is mandatory, it is the source of known cages

    self._rpc_interface = pmnc.interfaces.get_interface("rpc")
    rpc_missing = self._rpc_interface is None
    if rpc_missing:
        raise Exception("health monitor requires enabled rpc interface")

    self._probe_thread_pool = pmnc.shared_pools.get_private_thread_pool()

    # { cage: { node: { location: ..., probe_result: ... } } }
    self._up_cages = {}

    self._up_down_queue = InterlockedQueue()

    # this is now static
    self._request_timeout = pmnc.config_interfaces.get("request_timeout")

    # under self-test both hooks are supplied through the keyword arguments

    running_self_test = pmnc.request.self_test == __name__
    if running_self_test:
        self._process_event = kwargs["process_event"]
        self._probe_cage = kwargs["probe_cage"]
def test_interface_success():

    # The handler pushes "ok" on every invocation; the queue is expected to
    # alternate between delivering "ok" and staying empty for the given waits.

    results = InterlockedQueue()

    def process_request(request, response):
        sleep(0.1)
        results.push("ok")

    # ( how long to wait, what is expected to be popped )
    expectations = (
        (4.0, "ok"),
        (1.0, None),
        (3.0, "ok"),
        (1.0, None),
        (3.0, "ok"),
    )

    with active_interface(
            "schedule_1",
            **interface_config(process_request=process_request)):
        for wait, expected in expectations:
            popped = results.pop(wait)
            if expected is None:
                assert popped is None
            else:
                assert popped == expected
def test_interface_failure():

    # The handler fails on every invocation by referencing an undefined name;
    # the file written while the interface is active must therefore stay on
    # disk and nothing may be queued.

    results = InterlockedQueue()

    def process_request(request, response):
        not_defined

    pattern = write_prefix + "[0-9a-f]{8}\\.msg"

    with active_interface("file_1", **interface_config(process_request = process_request,
                          filename_regex = pattern)):

        assert results.pop(3.0) is None

        file_name = write_file(random_filename() + ".msg", b"data")
        assert os_path.isfile(file_name)

        assert results.pop(3.0) is None
        assert os_path.isfile(file_name) # still there after the failed processing

        remove(file_name)
def test_interface_no_wait():

    # Each invocation takes 5 seconds to complete, yet consecutive invocation
    # times are expected to be exactly 3 seconds apart.

    stamps_queue = InterlockedQueue()

    def process_request(request, response):
        sleep(5.0)
        stamps_queue.push(request["invocation_time"])

    with active_interface(
            "schedule_1",
            **interface_config(process_request=process_request)):
        first, second, third = [stamps_queue.pop(wait)
                                for wait in (9.0, 6.0, 6.0)]

    later_gap = (third - second).seconds
    earlier_gap = (second - first).seconds

    assert later_gap == earlier_gap and earlier_gap == 3
def test_interface_remove():

    # The handler removes the file itself before reporting the request; the
    # request must be delivered exactly once and the file must be gone.

    delivered = InterlockedQueue()

    def process_request(request, response):
        remove(request["file_name"])
        delivered.push(request)

    pattern = write_prefix + "[0-9a-f]{8}\\.msg"

    with active_interface("file_1", **interface_config(process_request = process_request,
                          filename_regex = pattern)):

        file_name = write_file(random_filename() + ".msg", b"data")

        first = delivered.pop(3.0)
        assert first == { "file_name": file_name }

        second = delivered.pop(3.0)
        assert second is None

        assert not os_path.exists(file_name)
def __init__(self, name, release):

    Resource.__init__(self, name)

    self._release = release
    self._ready = Event()
    self._queue = InterlockedQueue()

    # a much shorter timeout is used when this module runs as a script

    running_as_script = __name__ == "__main__"
    self._timeout = Timeout(3.0) if running_as_script else Timeout(60.0)

    self._count = 0
def get(name: str) -> InterlockedQueue:

    # Returns the shared queue registered under the given name, creating and
    # registering a new one if there is none; lookup and registration both
    # happen with _shared_queues_lock held.

    with _shared_queues_lock:

        existing = _shared_queues.get(name)
        if existing is not None:
            return existing

        created = InterlockedQueue()
        _shared_queues[name] = created
        return created
def _get_rq_queue(cage):

    # Returns the request queue registered for the cage, creating and
    # registering a new one on first use; both steps happen with
    # _rq_queues_lock held.

    with _rq_queues_lock:

        if cage in _rq_queues:
            return _rq_queues[cage]

        fresh_queue = InterlockedQueue()
        _rq_queues[cage] = fresh_queue
        return fresh_queue
def test_process_one():

    # A single message with two extra headers is sent through jms_1 and must
    # come back to the handler with the same id, text and headers.

    delivered = InterlockedQueue()

    def process_request(request, response):
        delivered.push(request)

    with active_interface("jms_1", **interface_config(process_request = process_request)):

        fake_request(10.0)

        xa = pmnc.transaction.create()
        xa.jms_1.send(russian, JMSCorrelationID = russian, FOOBAR = "123")
        results = xa.execute()
        message_id = results[0]

        request = delivered.pop(10.0)

    assert request["message_id"] == message_id
    assert request["message_text"] == russian

    headers = request["headers"]
    correlation_ok = headers["JMSCorrelationID"] == russian
    assert correlation_ok and headers["FOOBAR"] == "123"
def test_deletion_failure():

    # While file removal is made to fail, a processed file must stay on disk,
    # be remembered in _processed_files and not be delivered again; once the
    # real removal is restored the file must disappear and be forgotten.

    delivered = InterlockedQueue()

    def process_request(request, response):
        delivered.push(request)

    def failing_remove(file_name):
        return 1 / 0

    pattern = write_prefix + "[0-9a-f]{8}\\.msg"

    with active_interface("file_1", **interface_config(process_request = process_request,
                          filename_regex = pattern)) as ifc:

        # keep the original method aside and break the removal

        ifc._remove_file_ = ifc._remove_file
        ifc._remove_file = failing_remove

        file_name = write_file(random_filename() + ".msg", b"data")

        assert delivered.pop(3.0) == { "file_name": file_name }
        assert delivered.pop(3.0) is None
        assert os_path.isfile(file_name)
        assert ifc._processed_files == set([file_name])

        # put the original method back

        ifc._remove_file = ifc._remove_file_

        assert delivered.pop(3.0) is None
        assert not os_path.exists(file_name)
        assert ifc._processed_files == set()
main_module_dir = os.path.dirname(sys.modules["__main__"].__file__) or os.getcwd() sys.path.insert(0, os.path.normpath(os.path.join(main_module_dir, "..", "..", "lib"))) import exc_string; from exc_string import exc_string import typecheck; from typecheck import typecheck, by_regex, optional import interlocked_queue; from interlocked_queue import InterlockedQueue import pmnc.perf_info; from pmnc.perf_info import get_working_set_size, get_cpu_times import pmnc.samplers; from pmnc.samplers import RawSampler, RateSampler import pmnc.threads; from pmnc.threads import HeavyThread ############################################################################### # module-level state => not reloadable _perf_thread = None _perf_queue = InterlockedQueue() _perf_lock = Lock() _perf_dump_60s = [] _perf_dump_10s = [None] * 6 _perf_stats = {} ############################################################################### def _normalize_time(ms: float) -> int: # takes time in milliseconds, returns 0-39 if ms <= 10.0: return 0 elif ms >= 100000.0: return 39 else:
before = time() assert not r.acquire(sl) # acquiring times out after = time() assert after - before >= 0.1 assert r.expired r = Request(timeout = 0.1, interface = "test", protocol = "test") before = time() assert not r.acquire_shared(sl) # shared acquiring times out after = time() assert after - before >= 0.1 assert r.expired ################################### ilq = InterlockedQueue() ilq.push(1) r = Request(timeout = 0.1, interface = "test", protocol = "test") before = time() assert r.pop(ilq) == 1 # popping succeeds after = time() assert after - before < 0.01 assert not r.expired r = Request(timeout = 0.1, interface = "test", protocol = "test") before = time() assert r.pop(ilq) is None # popping times out after = time() assert after - before >= 0.1
main_module_dir = os.path.dirname(sys.modules["__main__"].__file__) or os.getcwd() sys.path.insert(0, os.path.normpath(os.path.join(main_module_dir, "..", "..", "lib"))) import interlocked_queue; from interlocked_queue import InterlockedQueue def connect(resource): resource._q = resource._config.pop("trace_queue") # _config contains config dict resource._count = 0 resource._q.push(("connect", resource._count, resource._config)) def disconnect(resource): resource._count += 1 resource._q.push(("disconnect", resource._count)) self_test_config = dict \ ( param1 = "value1", param2 = "value2", connect = connect, disconnect = disconnect, trace_queue = InterlockedQueue() # self-test reads this queue to see what's going on ) # DO NOT TOUCH BELOW THIS LINE __all__ = [ "get", "copy" ] get = lambda key, default = None: pmnc.config.get_(config, self_test_config, key, default) copy = lambda: pmnc.config.copy_(config, self_test_config) # EOF
assert not t.wait(e) t = Timeout(1.0) assert t.wait(e) and not t.expired t = Timeout(0.5) assert not t.wait() and t.expired before = time() Timeout(0.11).wait() after = time() assert after - before >= 0.1 ################################### ilq = InterlockedQueue() t = Timeout(0.11) before = time() assert t.pop(ilq) is None after = time() assert after - before >= 0.1 assert t.expired t.reset(0.1) ilq.push(1) before = time() assert t.pop(ilq) == 1 after = time()
def pop(self, queue: InterlockedQueue
        ):  # respects wall-time timeout, see issue9892

    # Pops an item from the queue, waiting for at most the time
    # this object currently has remaining.

    time_left = self.remain
    return queue.pop(time_left)  # inherits InterlockedQueue's behaviour
def __init__(self, cages_list):

    # cages_list is kept as is, the other attributes start out as
    # an empty queue, pass number zero and an unset event

    self._cages_list = cages_list
    self._pass = 0
    self._queue = InterlockedQueue()
    self._stopped = Event()
# Periodically probes the cages reported by the RPC interface together with
# the cages previously found to be up, keeps track of the latter in _up_cages
# and schedules an application notification whenever a cage goes up or down.

class HealthMonitor:

    # requires the "rpc" interface to be present; under self-test the event
    # processing and probing methods are replaced with the ones passed in kwargs

    def __init__(self, **kwargs):
        self._rpc_interface = pmnc.interfaces.get_interface("rpc")
        if self._rpc_interface is None:
            raise Exception("health monitor requires enabled rpc interface")
        self._probe_thread_pool = pmnc.shared_pools.get_private_thread_pool()
        self._up_cages = {} # { cage: { node: { location: ..., probe_result: ... } } }
        self._up_down_queue = InterlockedQueue()
        self._request_timeout = pmnc.config_interfaces.get("request_timeout") # this is now static
        if pmnc.request.self_test == __name__: # self-test
            self._process_event = kwargs["process_event"]
            self._probe_cage = kwargs["probe_cage"]

    ###################################

    # creates and starts the thread running _probe_thread_proc

    def start(self):
        self._probe_thread = HeavyThread(target = self._probe_thread_proc,
                                         name = "health_monitor:probe") # always called "health_monitor"
        self._probe_thread.start()

    # stops the thread created in start

    def stop(self):
        self._probe_thread.stop()

    ###################################

    # this method is executed in a private thread and is scheduling probe calls
    # to cages known to the RPC interface or previously probed and found to be up

    def _probe_thread_proc(self):

        per_cage_interval = 0.0

        # calls to _poll_up_down_queue are interleaved and allow this thread
        # to maintain structures such as _up_cages in response to events
        # posted by the probe threads to the _up_down_queue

        while self._poll_up_down_queue(per_cage_interval):
            try:

                # extract all cages currently known to the rpc interface and
                # merge them with cages previously probed and found to be up,
                # except for the health_monitor cage itself should be skipped

                probe_cages = \
                    { known_cage: { known_node: dict(location = known_location, probe_result = None)
                                    for known_node, known_location in self._rpc_interface.get_nodes(known_cage).items() }
                      for known_cage in self._rpc_interface.get_cages() if known_cage != "health_monitor" }

                self._merge_cages(probe_cages, self._up_cages)

                # the probe period is spread evenly over the cages to probe,
                # the extra one in the divisor keeps it safe with no cages

                probe_period = pmnc.config.get("probe_period")
                per_cage_interval = probe_period / (len(probe_cages) + 1)

                # walk through all cages to be probed and schedule calls to probe
                # to a private thread pool using fake unregistered requests

                for cage, nodes in probe_cages.items():
                    for node, cage_info in nodes.items():

                        cage_location = cage_info["location"]

                        # note that the requests created here are not registered with
                        # interfaces and enqueued to a different pool too, they are
                        # therefore entitled to termination without warning at shutdown,
                        # this is ok, because they do no useful work for the clients

                        request = Request(timeout = self._request_timeout,
                                          interface = "__health_monitor__", protocol = "n/a",
                                          parameters = dict(auth_tokens = dict()),
                                          description = "probing cage {0:s} at {1:s}".format(cage, cage_location))

                        self._probe_thread_pool.enqueue(request, self.wu_probe_cage,
                                (node, cage, cage_location, cage_info["probe_result"]), {})

                    # then again yield to polling the queue for a while

                    if not self._poll_up_down_queue(per_cage_interval):
                        break

            except:
                pmnc.log.error(exc_string()) # log and ignore

    ###################################

    # this method merges cages known to the RPC interface with cages
    # previously probed and known to be up, such merging is necessary
    # because if a cage dies just before its next advertisement broadcast,
    # it would disappear from known, but will not be probed again and
    # hence thought to be up forever

    # known_cages is modified in place and nothing is returned; a node present
    # in both inherits its previous probe result if the location is the same,
    # otherwise its probe result is set to "restarted"; a node present only
    # in up_cages is added using the very same info dict

    @staticmethod
    def _merge_cages(known_cages: dict, up_cages: dict):

        probe_cages = known_cages # merging in place

        for up_cage, up_nodes in up_cages.items():
            for up_node, up_cage_info in up_nodes.items():
                probe_nodes = probe_cages.setdefault(up_cage, {})
                if up_node in probe_nodes:
                    cage_info = probe_nodes[up_node]
                    if cage_info["location"] == up_cage_info["location"]:
                        cage_info.update(probe_result = up_cage_info["probe_result"])
                    else:
                        cage_info.update(probe_result = "restarted") # note this case
                else:
                    probe_nodes[up_node] = up_cage_info

    ###################################

    # a call to this method is enqueued to a private thread pool
    # for each cage to probe on every pass of _probe_thread

    # the outcome is reported by pushing events to the _up_down_queue:
    # (node, cage, "down") if the probe has failed or
    # (node, cage, "up", location, probe_result) if it has succeeded

    def wu_probe_cage(self, node, cage, location, prev_probe_result):

        if pmnc.request.expired: # no need to report anything for a probing request
            return

        if pmnc.log.debug: pmnc.log.debug("sending probe")
        try:
            probe_result = self._probe_cage(node, cage, location)
        except:
            pmnc.log.warning("probe failed: {0:s}".format(exc_string()))
            self._up_down_queue.push((node, cage, "down"))
        else:
            if pmnc.log.debug: pmnc.log.debug("probe returned successfully")
            if prev_probe_result == "restarted":                  # if the cage has restarted
                self._up_down_queue.push((node, cage, "down"))    # we push "down" event first
            self._up_down_queue.push((node, cage, "up", location, probe_result))

    ###################################

    # this method is invoked by one of the private pool threads
    # to send the actual probe call to the cage being probed

    @typecheck
    def _probe_cage(self, node, cage, location) -> dict:

        # health monitor has to create rpc resources manually, not using
        # pmnc(cage) syntax, because we need to access exact cage at exact
        # node and location (i.e. host and port) and to avoid discovery

        connect_timeout = pmnc.config_resource_rpc.get("discovery_timeout")

        rpc = pmnc.protocol_rpc.Resource("{0:s}.{1:s}".format(node, cage),
                                         broadcast_address = ("n/a", 0),
                                         discovery_timeout = connect_timeout,
                                         multiple_timeout_allowance = 0.0,
                                         flock_id = "unused",
                                         exact_locations = { cage: location }, # this prevents discovery
                                         pool__resource_name = cage)

        # the resource is disconnected no matter what, the transaction
        # is rolled back if the probe call fails and committed otherwise

        rpc.connect()
        try:
            rpc.begin_transaction("", source_module_name = __name__, transaction_options = {},
                                  resource_args = (), resource_kwargs = {})
            try:
                probe_result = rpc.health_monitor_event.probe() # there, an RPC call
            except:
                rpc.rollback()
                raise
            else:
                rpc.commit()
        finally:
            rpc.disconnect()

        return probe_result # if the cage returns anything but a dict, it is considered a failure

    ###################################

    # this method is called by the _probe_thread during its idle times
    # to fetch up/down events posted to the _up_down_queue by the probe
    # threads and in response to maintain structures such as _up_cages

    # the polling lasts for the given timeout in slices of at most one second,
    # after each slice the current thread is checked for having been stopped

    def _poll_up_down_queue(self, timeout: float) -> bool: # returns "should keep running"

        poll_timeout = Timeout(timeout)
        while not poll_timeout.expired:

            pop_timeout = Timeout(min(poll_timeout.remain, 1.0))
            while not pop_timeout.expired:

                event = pop_timeout.pop(self._up_down_queue)
                if event is not None:
                    try:

                        node, cage, up_down, *args = event
                        if up_down == "up":

                            location, probe_result = args

                            # add the cage to cages known to be up and schedule
                            # application notification call if it was down or
                            # returned a different probe result

                            cage_info = self._up_cages.setdefault(cage, {}).setdefault(node, {})
                            if not cage_info or cage_info["probe_result"] != probe_result:
                                self._schedule_up_down_event(node, cage, "up", probe_result)
                            cage_info.update(location = location, probe_result = probe_result)

                        elif up_down == "down":

                            # remove the cage from cages known to be up and schedule
                            # application notification call it was up

                            if self._up_cages.setdefault(cage, {}).pop(node, None):
                                self._schedule_up_down_event(node, cage, "down")

                    except:
                        pmnc.log.error(exc_string()) # log and ignore

            if current_thread().stopped():
                return False

        return True

    ###################################

    # this method is called by the _probe_thread in response to change
    # of some cage's state detected in _poll_up_down_queue

    def _schedule_up_down_event(self, node, cage, up_down, probe_result = None):

        # application notification invokes methods from health_monitor_event module
        # and must be executed just like a regular request from some interface

        request = pmnc.interfaces.begin_request(
                    timeout = self._request_timeout,
                    interface = "__health_monitor__", protocol = "n/a",
                    parameters = dict(auth_tokens = dict()),
                    description = "cage {0:s}.{1:s} is {2:s}".format(node, cage, up_down))

        # note that this request is not waited upon

        pmnc.interfaces.enqueue(request, self.wu_process_event, (node, cage, up_down, probe_result))

    ###################################

    # this method is invoked by one of the interfaces pool threads to register
    # the event of some cage going up or down by calling an appropriate method
    # from the health_monitor_event module

    @typecheck
    def wu_process_event(self, node: str, cage: str, up_down: one_of("up", "down"),
                         probe_result: optional(dict)):

        try:

            # see for how long the request was on the execution queue up to this moment
            # and whether it has expired in the meantime, if it did there is no reason
            # to proceed and we simply bail out

            if pmnc.request.expired:
                pmnc.log.error("request has expired and will not be processed")
                success = False
                return # goes through finally section below

            with pmnc.performance.request_processing():
                self._process_event(node, cage, up_down, probe_result)

        except:
            pmnc.log.error(exc_string()) # log and ignore
            success = False
        else:
            success = True
        finally:                                 # the request ends itself
            pmnc.interfaces.end_request(success) # possibly way after deadline

    ###################################

    # passes the event to cage_up or cage_down of the health_monitor_event
    # module, any other up_down value is silently ignored

    def _process_event(self, node, cage, up_down, probe_result):

        if up_down == "up":
            pmnc.health_monitor_event.cage_up(node, cage, probe_result)
        elif up_down == "down":
            pmnc.health_monitor_event.cage_down(node, cage)
# Executes a call to module.method on the target cage by posting it to that
# cage's request queue and waiting on a one-time response queue until the
# current request's deadline.
#
# Returns the "result" entry of the response.
#
# Raises RPCError (non-terminal) carrying the "exception" entry if the
# response contains no "result"; any other failure, including the deadline
# passing with no response, is logged and passed to
# ResourceError.rethrow(recoverable=False).

def execute_reverse(target_cage: valid_cage_name, module: valid_module_name,
                    method: valid_method_name, args: tuple, kwargs: dict):

    # wrap up an RPC call identical to how it's done in protocol_rpc.py

    request_dict = pmnc.request.to_dict()

    # remove request parameters that must not cross the RPC border

    request_dict["parameters"].pop("retry", None)

    # wrap all the call parameters in a plain dict

    request = dict(source_cage=__cage__,
                   target_cage=target_cage,
                   module=module,
                   method=method,
                   args=args,
                   kwargs=kwargs,
                   request=request_dict)

    request_description = "reverse RPC request {0:s}.{1:s} to {2:s}".\
        format(module, method, target_cage)

    # create a one-time response queue just for this request

    rs_queue = InterlockedQueue()
    request_id = pmnc.request.unique_id

    with _rs_queues_lock:
        _rs_queues[request_id] = rs_queue  # register the call as being active

    try:

        pmnc.log.info("sending {0:s}".format(request_description))
        try:

            # enqueue the call and wait for response

            rq_queue = _get_rq_queue(target_cage)
            rq_queue.push((request_id, request))

            # None here means the request's pop has given up waiting

            response = pmnc.request.pop(rs_queue)
            if response is None:
                raise Exception("request deadline waiting for response")

            # a response without "result" is expected to carry "exception"

            try:
                result = response["result"]
            except KeyError:
                raise RPCError(description=response["exception"], terminal=False)

        except RPCError as e:
            pmnc.log.warning("{0:s} returned error: {1:s}".\
                format(request_description, e.description))
            raise
        except:
            pmnc.log.warning("{0:s} failed: {1:s}".\
                format(request_description, exc_string()))
            ResourceError.rethrow(recoverable=False)
        else:
            pmnc.log.info("reverse RPC request returned successfully")
            return result

    finally:
        with _rs_queues_lock:
            del _rs_queues[request_id]  # unregister the call
class HealthMonitor:
    """
    Periodically probes cages known to the RPC interface (and cages
    previously found to be up) and schedules application notifications
    through health_monitor_event whenever a cage goes up or down.

    Probes are executed on a private thread pool and report their outcome
    through _up_down_queue; the single probe thread consumes that queue
    and is the one maintaining _up_cages.
    """

    def __init__(self, **kwargs):

        self._rpc_interface = pmnc.interfaces.get_interface("rpc")
        if self._rpc_interface is None:
            raise Exception("health monitor requires enabled rpc interface")

        self._probe_thread_pool = pmnc.shared_pools.get_private_thread_pool()

        # { cage: { node: { location: ..., probe_result: ... } } }
        self._up_cages = {}
        self._up_down_queue = InterlockedQueue()

        # this is now static
        self._request_timeout = pmnc.config_interfaces.get("request_timeout")

        # under self-test the event processing and the probing
        # are replaced with callables supplied by the test

        if pmnc.request.self_test == __name__:
            self._process_event = kwargs["process_event"]
            self._probe_cage = kwargs["probe_cage"]

    ###################################

    def start(self):
        # the thread is always called "health_monitor"
        probe_thread = HeavyThread(target=self._probe_thread_proc,
                                   name="health_monitor:probe")
        self._probe_thread = probe_thread
        probe_thread.start()

    def stop(self):
        self._probe_thread.stop()

    ###################################

    # this method is executed in a private thread and is scheduling probe calls
    # to cages known to the RPC interface or previously probed and found to be up

    def _probe_thread_proc(self):

        per_cage_interval = 0.0

        # calls to _poll_up_down_queue are interleaved and allow this thread
        # to maintain structures such as _up_cages in response to events
        # posted by the probe threads to the _up_down_queue

        while self._poll_up_down_queue(per_cage_interval):
            try:

                # collect all cages currently known to the rpc interface,
                # the health_monitor cage itself is skipped

                probe_cages = {}
                for known_cage in self._rpc_interface.get_cages():
                    if known_cage == "health_monitor":
                        continue
                    known_nodes = self._rpc_interface.get_nodes(known_cage)
                    probe_cages[known_cage] = {
                        known_node: dict(location=known_location, probe_result=None)
                        for known_node, known_location in known_nodes.items()
                    }

                # and merge them with cages previously probed and found to be up

                self._merge_cages(probe_cages, self._up_cages)

                probe_period = pmnc.config.get("probe_period")
                per_cage_interval = probe_period / (len(probe_cages) + 1)

                # walk through all cages to be probed and schedule calls to probe
                # to a private thread pool using fake unregistered requests

                for cage, nodes in probe_cages.items():

                    for node, cage_info in nodes.items():

                        cage_location = cage_info["location"]
                        description = "probing cage {0:s} at {1:s}".format(cage, cage_location)

                        # note that the requests created here are not registered with
                        # interfaces and enqueued to a different pool too, they are
                        # therefore entitled to termination without warning at shutdown,
                        # this is ok, because they do no useful work for the clients

                        request = Request(
                            timeout=self._request_timeout,
                            interface="__health_monitor__",
                            protocol="n/a",
                            parameters=dict(auth_tokens=dict()),
                            description=description,
                        )

                        probe_args = (node, cage, cage_location, cage_info["probe_result"])
                        self._probe_thread_pool.enqueue(request, self.wu_probe_cage,
                                                        probe_args, {})

                    # then again yield to polling the queue for a while

                    if not self._poll_up_down_queue(per_cage_interval):
                        break

            except:
                pmnc.log.error(exc_string())  # log and ignore

    ###################################

    # this method merges cages known to the RPC interface with cages
    # previously probed and known to be up, such merging is necessary
    # because if a cage dies just before its next advertisement broadcast,
    # it would disappear from known, but will not be probed again and
    # hence thought to be up forever

    @staticmethod
    def _merge_cages(known_cages: dict, up_cages: dict):

        # known_cages is modified in place and becomes the set of cages to probe

        for up_cage, up_nodes in up_cages.items():
            for up_node, up_cage_info in up_nodes.items():

                merged_nodes = known_cages.setdefault(up_cage, {})

                if up_node not in merged_nodes:
                    merged_nodes[up_node] = up_cage_info  # up but no longer known
                    continue

                known_info = merged_nodes[up_node]
                if known_info["location"] == up_cage_info["location"]:
                    known_info["probe_result"] = up_cage_info["probe_result"]
                else:
                    known_info["probe_result"] = "restarted"  # note this case

    ###################################

    # a call to this method is enqueued to a private thread pool
    # for each cage to probe on every pass of _probe_thread

    def wu_probe_cage(self, node, cage, location, prev_probe_result):

        # no need to report anything for a probing request
        if pmnc.request.expired:
            return

        report = self._up_down_queue.push

        pmnc.log.debug("sending probe")
        try:
            probe_result = self._probe_cage(node, cage, location)
        except:
            pmnc.log.warning("probe failed: {0:s}".format(exc_string()))
            report((node, cage, "down"))
        else:
            pmnc.log.debug("probe returned successfully")
            if prev_probe_result == "restarted":
                # the cage has restarted, "down" event is pushed first
                report((node, cage, "down"))
            report((node, cage, "up", location, probe_result))

    ###################################

    # this method is invoked by one of the private pool threads
    # to send the actual probe call to the cage being probed

    @typecheck
    def _probe_cage(self, node, cage, location) -> dict:

        # health monitor has to create rpc resources manually, not using
        # pmnc(cage) syntax, because we need to access exact cage at exact
        # node and location (i.e. host and port) and to avoid discovery

        resource_name = "{0:s}.{1:s}".format(node, cage)
        connect_timeout = pmnc.config_resource_rpc.get("discovery_timeout")

        rpc = pmnc.protocol_rpc.Resource(
            resource_name,
            broadcast_address=("n/a", 0),
            discovery_timeout=connect_timeout,
            multiple_timeout_allowance=0.0,
            flock_id="unused",
            exact_locations={cage: location},  # this prevents discovery
            pool__resource_name=cage,
        )

        rpc.connect()
        try:
            rpc.begin_transaction("", source_module_name=__name__,
                                  transaction_options={},
                                  resource_args=(), resource_kwargs={})
            try:
                probe_result = rpc.health_monitor_event.probe()  # there, an RPC call
            except:
                rpc.rollback()
                raise
            else:
                rpc.commit()
        finally:
            rpc.disconnect()

        # if the cage returns anything but a dict, it is considered a failure
        return probe_result

    ###################################

    # this method is called by the _probe_thread during its idle times
    # to fetch up/down events posted to the _up_down_queue by the probe
    # threads and in response to maintain structures such as _up_cages

    def _poll_up_down_queue(self, timeout: float) -> bool:  # returns "should keep running"

        poll_timeout = Timeout(timeout)
        while not poll_timeout.expired:

            # the wait is sliced into pieces of at most one second,
            # the thread's stopped state is checked after each slice

            slice_timeout = Timeout(min(poll_timeout.remain, 1.0))
            while not slice_timeout.expired:

                event = slice_timeout.pop(self._up_down_queue)
                if event is None:
                    continue

                try:

                    node, cage, up_down, *details = event

                    if up_down == "up":

                        location, probe_result = details

                        # add the cage to cages known to be up and schedule
                        # application notification call if it was down or
                        # returned a different probe result

                        up_nodes = self._up_cages.setdefault(cage, {})
                        cage_info = up_nodes.setdefault(node, {})
                        if not cage_info or cage_info["probe_result"] != probe_result:
                            self._schedule_up_down_event(node, cage, "up", probe_result)
                        cage_info.update(location=location, probe_result=probe_result)

                    elif up_down == "down":

                        # remove the cage from cages known to be up and schedule
                        # application notification call if it was up

                        was_up = self._up_cages.setdefault(cage, {}).pop(node, None)
                        if was_up:
                            self._schedule_up_down_event(node, cage, "down")

                except:
                    pmnc.log.error(exc_string())  # log and ignore

            if current_thread().stopped():
                return False

        return True

    ###################################

    # this method is called by the _probe_thread in response to change
    # of some cage's state detected in _poll_up_down_queue

    def _schedule_up_down_event(self, node, cage, up_down, probe_result=None):

        # application notification invokes methods from health_monitor_event module
        # and must be executed just like a regular request from some interface

        description = "cage {0:s}.{1:s} is {2:s}".format(node, cage, up_down)

        request = pmnc.interfaces.begin_request(
            timeout=self._request_timeout,
            interface="__health_monitor__",
            protocol="n/a",
            parameters=dict(auth_tokens=dict()),
            description=description,
        )

        # note that this request is not waited upon

        event_args = (node, cage, up_down, probe_result)
        pmnc.interfaces.enqueue(request, self.wu_process_event, event_args)

    ###################################

    # this method is invoked by one of the interfaces pool threads to register
    # the event of some cage going up or down by calling an appropriate method
    # from the health_monitor_event module

    @typecheck
    def wu_process_event(self, node: str, cage: str, up_down: one_of("up", "down"),
                         probe_result: optional(dict)):
        try:

            # see for how long the request was on the execution queue up to this moment
            # and whether it has expired in the meantime, if it did there is no reason
            # to proceed and we simply bail out

            if not pmnc.request.expired:
                with pmnc.performance.request_processing():
                    self._process_event(node, cage, up_down, probe_result)
            else:
                pmnc.log.error("request has expired and will not be processed")
                success = False
                return  # goes through finally section below

        except:
            pmnc.log.error(exc_string())  # log and ignore
            success = False
        else:
            success = True
        finally:  # the request ends itself
            pmnc.interfaces.end_request(success)  # possibly way after deadline

    ###################################

    def _process_event(self, node, cage, up_down, probe_result):
        # dispatch to the application-provided health_monitor_event module,
        # any other value of up_down is silently ignored
        if up_down == "up":
            pmnc.health_monitor_event.cage_up(node, cage, probe_result)
            return
        if up_down == "down":
            pmnc.health_monitor_event.cage_down(node, cage)
assert not t.wait(e) t = Timeout(1.0) assert t.wait(e) and not t.expired t = Timeout(0.5) assert not t.wait() and t.expired before = time() Timeout(0.1).wait() after = time() assert after - before >= 0.1 ################################### ilq = InterlockedQueue() t = Timeout(0.1) before = time() assert t.pop(ilq) is None after = time() assert after - before >= 0.1 assert t.expired t.reset(0.1) ilq.push(1) before = time() assert t.pop(ilq) == 1 after = time()
def pop(self, queue: InterlockedQueue):  # respects wall-time timeout, see issue9892
    """
    Pop an item from the queue, waiting for no longer than this timeout
    has remaining; the outcome is whatever queue.pop returns for that wait.
    """
    time_left = self.remain
    return queue.pop(time_left)  # inherits InterlockedQueue's behaviour