def __init__(self, stackParams: dict, nodeReg: Dict[str, HA]):
    """
    Set up the bookkeeping needed before a node stack is started.

    :param stackParams: parameters for the stack; must contain a "name" key
    :param nodeReg: registry mapping node names to their host addresses
    """
    super().__init__()
    self.name = stackParams["name"]

    # Stack configuration; the stack object itself is not created here.
    self.nodeStackParams = stackParams
    self.nodeReg = nodeReg
    self.nodestack = None  # type: Stack
    self.bootstrapped = False

    # Connection tracking.
    self._conns = set()  # type: Set[str]
    self.lastcheck = {}  # type: Dict[int, Tuple[int, float]]

    # Retry scheduling: a ratchet object plus the time of the next check
    # of the remotes.
    self.ratchet = Ratchet(a=8, b=0.198, c=-4, base=8, peak=3600)
    self.nextCheck = 0
async def eventually(coroFunc: FlexFunc,
                     *args,
                     retryWait: float=0.1,
                     timeout: float=5,
                     ratchetSteps: Optional[int]=None,
                     acceptableExceptions=None,
                     verbose=True) -> T:
    """
    Keep calling `coroFunc(*args)` until it completes without raising, or
    until `timeout` seconds have passed.

    If the call returns an awaitable, it is awaited. Once the timeout has
    elapsed, one last attempt is made; if that attempt fails too, the
    exception is logged and re-raised.

    :param coroFunc: function or coroutine function to call
    :param args: positional arguments passed to `coroFunc`
    :param retryWait: seconds to sleep between attempts when no ratchet is
        used
    :param timeout: seconds after which no further retries are made
    :param ratchetSteps: if truthy, sleep durations are taken from a
        generator built with `Ratchet.fromGoalDuration(retryWait,
        ratchetSteps, timeout)` instead of the fixed `retryWait`
    :param acceptableExceptions: an exception class, or an iterable of
        exception classes, that may be retried; instances of subclasses are
        accepted too. Any other exception is re-raised immediately. When
        empty or None, every `Exception` is retried.
    :param verbose: whether to log progress of each attempt
    :return: whatever `coroFunc` returns (awaited if awaitable)
    :raises: the exception of the final failed attempt, or the first
        exception that is not acceptable
    """
    # Remember whether a restriction was requested at all, then materialise
    # the classes once so isinstance() can be used and a one-shot iterable
    # is not consumed by the first check.
    restricted = bool(acceptableExceptions)
    if restricted and not isinstance(acceptableExceptions, Iterable):
        acceptableExceptions = [acceptableExceptions]
    acceptable = tuple(acceptableExceptions) if restricted else ()

    # TODO make smarter. Have an optional initial_wait. ratchet up to timeout
    start = time.perf_counter()

    ratchet = Ratchet.fromGoalDuration(retryWait,
                                       ratchetSteps,
                                       timeout).gen() \
        if ratchetSteps else None

    def remaining():
        return start + timeout - time.perf_counter()

    fname = getFuncName(coroFunc)
    while True:
        remain = remaining()
        try:
            if remain < 0:
                # this provides a convenient breakpoint for a debugger
                logger.warning("{} last try...".format(fname))
            # noinspection PyCallingNonCallable
            res = coroFunc(*args)
            if isawaitable(res):
                res = await res
            if verbose:
                logger.debug("{} succeeded with {:.2f} seconds to spare".
                             format(fname, remain))
            return res
        except Exception as ex:
            # isinstance (rather than an exact type match) so subclasses of
            # an acceptable exception are retried as well.
            if restricted and not isinstance(ex, acceptable):
                raise
            if remain >= 0:
                if verbose:
                    logger.trace("{} not succeeded yet, {:.2f} seconds "
                                 "remaining...".format(fname, remain))
                await asyncio.sleep(next(ratchet) if ratchet else retryWait)
            else:
                logger.error("{} failed; not trying any more because {} "
                             "seconds have passed; args were {}".
                             format(fname, timeout, args))
                # bare raise re-raises the active exception unchanged
                raise
class NodeStacked(Batched):
    """
    Behaviors that provides Node Stack functionality to Nodes and Clients
    """

    def __init__(self, stackParams: dict, nodeReg: Dict[str, HA]):
        """
        :param stackParams: parameters for the node stack; must contain a
            "name" key
        :param nodeReg: registry mapping node names to host addresses
        """
        super().__init__()
        self.name = stackParams["name"]
        self.bootstrapped = False

        self.nodeStackParams = stackParams
        self.nodestack = None  # type: Stack

        # per remote uid: (number of reconnect attempts, time of last one)
        self.lastcheck = {}  # type: Dict[int, Tuple[int, float]]
        self.ratchet = Ratchet(a=8, b=0.198, c=-4, base=8, peak=3600)
        self.nodeReg = nodeReg

        # holds the last time we checked remotes
        self.nextCheck = 0

        self._conns = set()  # type: Set[str]

    def __repr__(self):
        return self.name

    @property
    def conns(self) -> Set[str]:
        """
        Get the connections of this node.

        :return: set of names of the connected nodes
        """
        return self._conns

    @conns.setter
    def conns(self, value: Set[str]) -> None:
        """
        Replace the set of connected node names. If the set actually
        changed, `_connsChanged` is called with the names that were added
        and the names that were removed.
        """
        if self._conns != value:
            old = self._conns
            self._conns = value
            ins = value - old
            outs = old - value
            self._connsChanged(ins, outs)

    def checkConns(self):
        """
        Evaluate the connected nodes
        """
        self.conns = self.nodestack.connecteds()

    def _connsChanged(self, ins: Set[str], outs: Set[str]) -> None:
        """
        Log every disconnection and every new connection, then hand both
        sets to `onConnsChanged` so subclasses can react.

        :param ins: new nodes connected
        :param outs: nodes no longer connected
        """
        for o in outs:
            logger.info("{} disconnected from {}".format(self, o),
                        extra={"cli": "IMPORTANT"})
        for i in ins:
            logger.info("{} now connected to {}".format(self, i),
                        extra={"cli": "IMPORTANT"})
        self.onConnsChanged(ins, outs)

    def onConnsChanged(self, ins: Set[str], outs: Set[str]):
        """
        Subclasses can override
        """
        pass

    def notConnectedNodes(self) -> Set[str]:
        """
        Returns the names of nodes in the registry this node is NOT connected
        to.
        """
        return set(self.nodeReg) - self.conns

    def startNodestack(self):
        """
        Create the node stack from the stored stack parameters, install
        `handleOneNodeMsg` as its message handler and drop this node's own
        entry from the node registry.
        """
        self.nodestack = self.stackType().newStack(self.nodeStackParams)
        logger.info("{} listening for other nodes at {}:{}".
                    format(self, *self.nodestack.ha),
                    extra={"cli": "LOW_STATUS"})
        self.nodestack.msgHandler = self.handleOneNodeMsg
        if self.nodestack.name in self.nodeReg:
            # remove this node's registration from the Node Registry
            # (no need to connect to itself)
            del self.nodeReg[self.nodestack.name]

    def connect(self, name) -> int:
        """
        Connect to the node specified by name.

        :param name: name of the node to connect to
        :type name: str or (HA, tuple)
        :return: the uid of the remote estate
        :raises AttributeError: if name is neither a str nor an HA/tuple
        :raises KeyError: if name is a str that is not in the node registry
        """
        if isinstance(name, (HA, tuple)):
            other_node_ha = name
        elif isinstance(name, str):
            other_node_ha = self.nodeReg[name]
        else:
            raise AttributeError(
                "cannot connect to {!r}: expected a node name or a host "
                "address".format(name))
        remote = RemoteEstate(stack=self.nodestack, ha=other_node_ha)
        self.nodestack.addRemote(remote)
        # updates the store time so the join timer is accurate
        self.nodestack.updateStamp()
        self.nodestack.join(uid=remote.uid, cascade=True, timeout=60)
        logger.info("{} looking for {} at {}:{}".
                    format(self.name, name, *other_node_ha),
                    extra={"cli": "PLAIN"})
        return remote.uid

    def sign(self, msg: Mapping) -> Mapping:
        """
        No signing is implemented in NodeStacked. Returns the msg as it is.

        :param msg: the message to sign
        """
        return msg  # don't sign by default

    def prepForSending(self, msg: Mapping) -> Mapping:
        """
        Return a dictionary form of the message

        :param msg: the message to be sent
        :raises: ValueError if msg cannot be converted to an appropriate
            format for transmission
        """
        if isinstance(msg, TaggedTupleBase):
            tmsg = msg.melted()
        elif isinstance(msg, Request):
            tmsg = msg.__getstate__()
        else:
            raise ValueError("Message cannot be converted to an appropriate "
                             "format for transmission")
        return self.sign(tmsg)

    def handleOneNodeMsg(self, wrappedMsg):
        """
        Handle one message received on the node stack; must be provided by
        the concrete class.
        """
        raise NotImplementedError("{} must implement this method".
                                  format(self))

    async def serviceLifecycle(self) -> None:
        """
        Async function that does the following activities if the node is going:
        (See `Status.going`)

        - check connections (See `checkConns`)
        - maintain connections (See `maintainConnections`)
        """
        if self.isGoing():
            self.checkConns()
            self.maintainConnections()

    def maintainConnections(self):
        """
        Try to connect to all the nodes.
        """
        self._retryConnections()

    def _retryConnections(self):
        """
        Try connecting to disconnected nodes again.

        Nothing is done before `self.nextCheck`, and the check is postponed
        by 3 seconds while any remote has a join or allow in process.
        Otherwise each disconnected remote found in the registry is retried
        once the wait given by the ratchet for its attempt count has passed.

        :return: True if a round of reconnect checks was performed, False
            if it was skipped or postponed
        """
        cur = time.perf_counter()
        if cur > self.nextCheck:
            if any(r.joinInProcess() or r.allowInProcess()
                   for r in self.nodestack.remotes.values()):
                logger.trace("{} joins or allows already in process, so "
                             "waiting to check for reconnects".format(self))
                self.nextCheck = cur + 3
                return False
            self.nextCheck = cur + 15  # check again in 15 seconds,
            # unless sooner because of retries below

            conns, disconns = self.remotesByConnected()

            for disconn in disconns:
                if disconn.name not in self.nodeReg:
                    logger.debug("{} skipping reconnect on {} because "
                                 "it's not found in the registry".
                                 format(self, disconn.name))
                    continue
                count, last = self.lastcheck.get(disconn.uid, (0, 0))
                secsSinceLastCheck = cur - last
                secsToWait = self.ratchet.get(count)
                secsToWaitNext = self.ratchet.get(count + 1)
                if secsSinceLastCheck > secsToWait:
                    dname = self.getRemoteName(disconn)
                    logger.debug("{} retrying to connect with {}".
                                 format(self.name, dname) +
                                 ("" if not last else
                                  "; needed to wait at "
                                  "least {} and waited "
                                  "{} (next try will "
                                  "be {} seconds)".
                                  format(round(secsToWait, 2),
                                         round(secsSinceLastCheck, 2),
                                         round(secsToWaitNext, 2))))
                    self.lastcheck[disconn.uid] = count + 1, cur
                    # come back sooner if this remote is due before the
                    # default 15 second check
                    self.nextCheck = min(self.nextCheck,
                                         cur + secsToWaitNext)
                    if disconn.joinInProcess():
                        logger.debug("waiting, because join is already in "
                                     "progress")
                    else:
                        logger.info("{} reconnecting to {} at {}:{}".
                                    format(self, dname, *disconn.ha))
                        # update the store time so the allow timer works
                        self.nodestack.updateStamp()
                        self.nodestack.allow(uid=disconn.uid, cascade=True)

            # remove items that have been connected
            for connected in conns:
                self.lastcheck.pop(connected.uid, None)

            logger.debug("{} next check for retries in {:.2f} seconds".
                         format(self, self.nextCheck - cur))
            return True
        return False

    def bootstrap(self, forced: bool = None):
        """
        Connect to all nodes in the node registry.

        Unless `forced` is truthy, the nodes that
        `distributedConnectionMap` lists for this node are not connected
        to; this node waits for them to join.

        :param forced: connect to every missing node, without waiting for
            any of them to connect first
        """
        logger.info("{} is bootstrapping, forced is {}".format(self, forced),
                    extra={"cli": False})
        missing = self.reconcileNodeReg()
        if missing:
            logger.debug("{} found the following missing connections: {}".
                         format(self, ", ".join(missing)))
            if not forced:
                names = list(self.nodeReg)
                names.append(self.name)
                nices = set(distributedConnectionMap(names)[self.name])
                for name in nices:
                    logger.debug("{} being nice and waiting for {} to join".
                                 format(self, name))
                missing = missing.difference(nices)
            for name in missing:
                self.connect(name)
        self.bootstrapped = True

    def reconcileNodeReg(self):
        """
        Handle remotes missing from the node registry and clean up old remotes
        no longer in this node's registry.

        :return: the missing remotes
        """
        matches = set()  # good matches found in nodestack remotes
        legacy = set()  # old remotes that are no longer in registry
        conflicts = set()  # matches found, but the ha conflicts
        logger.debug("{}'s nodereg is {}".
                     format(self, self.nodeReg.items()))
        logger.debug("{}'s nodestack is {}".
                     format(self, self.nodestack.remotes.values()))
        for r in self.nodestack.remotes.values():
            if r.name in self.nodeReg:
                if r.ha == self.nodeReg[r.name]:
                    matches.add(r.name)
                    logger.debug("matched remote is {} {}".
                                 format(r.uid, r.ha))
                else:
                    conflicts.add((r.name, r.ha))
                    error("ha for {} doesn't match".format(r.name))
            else:
                # the remote's name is unknown; see whether its address
                # is registered under a different name
                regName = [nm for nm, ha in self.nodeReg.items()
                           if ha == r.ha]
                logger.debug("unmatched remote is {} {}".
                             format(r.uid, r.ha))
                # assert len(regName) == 1
                if regName:
                    logger.debug("forgiving name mismatch for {} with same "
                                 "ha {} using another name {}".
                                 format(regName, r.ha, r.name))
                    matches.add(regName[0])
                else:
                    logger.debug("{} found a legacy remote {} "
                                 "without a matching ha {}".
                                 format(self, r.name, r.ha))
                    legacy.add(r)

        # missing from remotes... need to connect
        missing = {nm for nm in self.nodeReg if nm not in matches}

        if len(missing) + len(matches) + len(conflicts) != len(self.nodeReg):
            logger.error("Error reconciling nodeReg with remotes")
            logger.error("missing: {}".format(missing))
            logger.error("matches: {}".format(matches))
            logger.error("conflicts: {}".format(conflicts))
            logger.error("nodeReg: {}".format(self.nodeReg.keys()))
            error("Error reconciling nodeReg with remotes; see logs")

        if conflicts:
            error("found conflicting address information {} in registry".
                  format(conflicts))
        if legacy:
            for l in legacy:
                logger.error("{} found legacy entry [{}, {}] in remotes, "
                             "that were not in registry".
                             format(self, l.name, l.ha))
            # TODO Remove this. Why are we raising error. Someone might
            # attempt to connect to a node and might end up in its nodestack
            # error("found legacy entries {} in remotes, that were not "
            #       "in registry".format(legacy))
            # this could happen if we are somehow re-using the same temp
            # directory
        return missing

    @staticmethod
    def stackType():
        """
        Return the type of the stack
        """
        return Stack

    def remotesByConnected(self):
        """
        Partition the node stack's remotes by connection state.

        :return: a tuple (connected remotes, disconnected remotes), each a
            list
        """
        conns, disconns = [], []
        for r in self.nodestack.remotes.values():
            array = conns if Stack.isRemoteConnected(r) else disconns
            array.append(r)
        return conns, disconns

    def getRemoteName(self, remote):
        """
        Return the registry name for a remote. When the remote's own name
        is not in the registry, the name registered for its host address
        is used; exactly one such entry is expected.

        :param remote: the remote to look up
        :return: the name under which the remote is known in the registry
        """
        if remote.name not in self.nodeReg:
            find = [name for name, ha in self.nodeReg.items()
                    if ha == remote.ha]
            assert len(find) == 1
            return find[0]
        return remote.name