def __init__(self, node):
    self.node = node
    self._view_no = 0  # type: int

    HasActionQueue.__init__(self)

    self.inBox = deque()
    self.outBox = deque()
    self.inBoxRouter = Router(
        (InstanceChange, self.process_instance_change_msg),
        (ViewChangeDone, self.process_vchd_msg),
        (FutureViewChangeDone, self.process_future_view_vchd_msg))

    self.instanceChanges = InstanceChanges()

    # The quorum of `ViewChangeDone` msgs is different depending on whether we're doing a real view change,
    # or just propagating view_no and Primary from `CurrentState` messages sent to a newly joined Node.
    # TODO: separate real view change and Propagation of Primary
    # TODO: separate catch-up, view-change and primary selection so that
    # they are really independent.
    self.propagate_primary = False

    # Tracks if other nodes are indicating that this node is in a lower view
    # than others. Keeps a map of view no to senders.
    # TODO: Consider if sufficient ViewChangeDone for 2 different (and
    # higher) views are received, should one view change be interrupted in
    # between.
    self._next_view_indications = {}

    self._view_change_in_progress = False

    self.previous_master_primary = None

    self.set_defaults()

    self.initInsChngThrottling()

    # Action for _schedule instanceChange messages
    self.instance_change_action = None
    # Count of instance change rounds
    self.instance_change_rounds = 0
    # Time for view_change_starting
    self.start_view_change_ts = 0

    # Last successful viewNo.
    # In some cases the view_change process may not complete in time.
    # In that case we want to know which viewNo was successful (last completed view_change).
    self.last_completed_view_no = 0

    # Force periodic view change if enabled in config
    force_view_change_freq = node.config.ForceViewChangeFreq
    if force_view_change_freq > 0:
        self.startRepeating(self.on_master_degradation, force_view_change_freq)
def __init__(self, node):
    self.node = node
    self.view_no = 0  # type: int

    HasActionQueue.__init__(self)

    self.inBox = deque()
    self.outBox = deque()
    self.inBoxRouter = Router(
        (InstanceChange, self.process_instance_change_msg),
        (ViewChangeDone, self.process_vchd_msg)
    )

    self.instanceChanges = InstanceChanges()

    # The quorum of `ViewChangeDone` msgs is different depending on whether we're doing a real view change,
    # or just propagating view_no and Primary from `CurrentState` messages sent to a newly joined Node.
    # TODO: separate real view change and Propagation of Primary
    # TODO: separate catch-up, view-change and primary selection so that
    # they are really independent.
    self.propagate_primary = False

    # Tracks if other nodes are indicating that this node is in lower view
    # than others. Keeps a map of view no to senders
    # TODO: Consider if sufficient ViewChangeDone for 2 different (and
    # higher views) are received, should one view change be interrupted in
    # between.
    self._next_view_indications = SortedDict()

    self._view_change_in_progress = False

    self.previous_master_primary = None

    self.set_defaults()

    self.initInsChngThrottling()

    # Action for _schedule instanceChange messages
    self.instance_change_action = None
    # Count of instance change rounds
    self.instance_change_rounds = 0
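# Both constructors above wire message handling through a Router that maps a message
# type to a handler and is drained via `handleAll(inBox, limit)`. The class below is
# only a minimal sketch of that dispatch pattern, inferred from the calls shown here;
# it is not the plenum Router implementation, and the real `handleAll` signature and
# queue contents may differ. `SimpleRouter` is a made-up name for illustration.
from collections import deque


class SimpleRouter:
    def __init__(self, *routes):
        # routes are (message_type, handler) pairs, as in Router(...) above
        self._handlers = dict(routes)

    async def handleAll(self, queue: deque, limit=None) -> int:
        # Pop (msg, sender) pairs off the queue and dispatch by message type,
        # stopping after `limit` messages if a limit is given.
        handled = 0
        while queue and (limit is None or handled < limit):
            msg, frm = queue.popleft()
            handler = self._handlers.get(type(msg))
            if handler is not None:
                handler(msg, frm)
                handled += 1
        return handled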
def test_too_old_messages_dont_count_towards_quorum(instance_changes, tconf):
    frm1 = "Node1"
    frm2 = "Node2"
    view_no = 1
    quorum = 2
    time_provider = MockTimestamp(0)
    instance_changes = InstanceChanges(tconf, time_provider)
    msg = InstanceChange(view_no, Suspicions.PRIMARY_DEGRADED.code)

    instance_changes.add_vote(msg, frm1)
    time_provider.value += (tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL / 2)

    instance_changes.add_vote(msg, frm2)
    time_provider.value += (tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL / 2) + 1

    assert not instance_changes.has_quorum(view_no, quorum)
    assert instance_changes.has_view(view_no)
    assert instance_changes[view_no].msg == msg
    assert not instance_changes.has_inst_chng_from(view_no, frm1)
    assert instance_changes.has_inst_chng_from(view_no, frm2)
def test_equal_votes_dont_accumulate_when_added(instance_changes, tconf):
    frm = "Node1"
    view_no = 1
    time_provider = MockTimestamp(0)
    second_vote_time = 1
    instance_changes = InstanceChanges(tconf, time_provider)
    msg = InstanceChange(view_no, Suspicions.PRIMARY_DEGRADED.code)

    instance_changes.add_vote(msg, frm)
    time_provider.value = second_vote_time
    instance_changes.add_vote(msg, frm)

    assert instance_changes[view_no].voters[frm] == second_vote_time
    assert len(instance_changes[view_no].voters) == 1
    assert len(instance_changes) == 1
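# A rough mental model of the vote store these tests exercise: votes are grouped by
# view number, a repeated vote from the same sender overwrites its previous one (so
# equal votes don't accumulate), and votes older than
# OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL are pruned before any quorum check. The
# sketch below is inferred from the assertions above; it is not the actual plenum
# InstanceChanges implementation, and the class/attribute names are made up.
class VoteStoreSketch:
    def __init__(self, outdated_interval, time_provider):
        self._outdated_interval = outdated_interval
        self._now = time_provider   # object exposing the current time via .value
        self._views = {}            # view_no -> {"msg": msg, "voters": {frm: ts}}

    def add_vote(self, msg, frm):
        entry = self._views.setdefault(msg.viewNo, {"msg": msg, "voters": {}})
        # A repeated vote from the same sender just refreshes its timestamp.
        entry["voters"][frm] = self._now.value

    def _prune(self, view_no):
        # Drop voters whose vote is older than the outdated-check interval;
        # drop the whole view entry once no fresh voters remain.
        entry = self._views.get(view_no)
        if entry is None:
            return
        fresh = {frm: ts for frm, ts in entry["voters"].items()
                 if self._now.value - ts <= self._outdated_interval}
        if fresh:
            entry["voters"] = fresh
        else:
            del self._views[view_no]

    def has_quorum(self, view_no, quorum):
        self._prune(view_no)
        entry = self._views.get(view_no)
        return entry is not None and len(entry["voters"]) >= quorum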
class ViewChanger(HasActionQueue, MessageProcessor):

    def __init__(self, node):
        self.node = node
        self.view_no = 0  # type: int

        HasActionQueue.__init__(self)

        self.inBox = deque()
        self.outBox = deque()
        self.inBoxRouter = Router(
            (InstanceChange, self.process_instance_change_msg),
            (ViewChangeDone, self.process_vchd_msg))

        self.instanceChanges = InstanceChanges()

        # The quorum of `ViewChangeDone` msgs is different depending on whether we're doing a real view change,
        # or just propagating view_no and Primary from `CurrentState` messages sent to a newly joined Node.
        # TODO: separate real view change and Propagation of Primary
        # TODO: separate catch-up, view-change and primary selection so that
        # they are really independent.
        self.propagate_primary = False

        # Tracks if other nodes are indicating that this node is in a lower view
        # than others. Keeps a map of view no to senders.
        # TODO: Consider if sufficient ViewChangeDone for 2 different (and
        # higher) views are received, should one view change be interrupted in
        # between.
        self._next_view_indications = SortedDict()

        self._view_change_in_progress = False

        self.previous_master_primary = None

        self.set_defaults()

        self.initInsChngThrottling()

    def __repr__(self):
        return "{}".format(self.name)

    # PROPERTIES

    @property
    def name(self):
        return self.node.name

    @property
    def config(self):
        return self.node.config

    @property
    def quorums(self):
        return self.node.quorums

    @property
    def view_change_in_progress(self):
        return self._view_change_in_progress

    @view_change_in_progress.setter
    def view_change_in_progress(self, value):
        self._view_change_in_progress = value

    @property
    def quorum(self) -> int:
        # TODO: re-factor this, separate these two states (selection of a new
        # primary and propagation of an existing one)
        if not self.view_change_in_progress:
            return self.node.quorums.propagate_primary.value
        if self.propagate_primary:
            return self.node.quorums.propagate_primary.value
        return self.node.quorums.view_change_done.value

    @property
    def _hasViewChangeQuorum(self):
        # This method should only be present for the master instance.
        """
        Checks whether n-f nodes completed view change and whether one
        of them is the next primary
        """
        num_of_ready_nodes = len(self._view_change_done)
        diff = self.quorum - num_of_ready_nodes
        if diff > 0:
            logger.debug('{} needs {} ViewChangeDone messages'.format(self, diff))
            return False

        logger.debug("{} got view change quorum ({} >= {})".format(
            self.name, num_of_ready_nodes, self.quorum))
        return True

    @property
    def _is_propagated_view_change_completed(self):
        if not self._propagated_view_change_completed and \
                self.node.poolLedger is not None and \
                self.propagate_primary:
            accepted = self.get_sufficient_same_view_change_done_messages()
            if accepted is not None:
                accepted_pool_ledger_i = \
                    next(filter(lambda x: x[0] == POOL_LEDGER_ID, accepted[1]))
                self_pool_ledger_i = \
                    next(filter(lambda x: x[0] == POOL_LEDGER_ID,
                                self.node.ledger_summary))
                logger.debug("{} Primary selection has already been completed "
                             "on pool ledger info = {}, primary {}, self pool "
                             "ledger info {}".format(self, accepted_pool_ledger_i,
                                                     accepted[0],
                                                     self_pool_ledger_i))
                self._propagated_view_change_completed = True

        return self._propagated_view_change_completed

    @property
    def has_view_change_from_primary(self) -> bool:
        if not self._has_view_change_from_primary:
            next_primary_name = self.node.elector.next_primary_node_name(0)

            if next_primary_name not in self._view_change_done:
                logger.debug(
                    "{} has not received ViewChangeDone from the next "
                    "primary {} (view_no: {}, totalNodes: {})".format(
                        self.name, next_primary_name, self.view_no,
                        self.node.totalNodes))
            else:
                logger.debug('{} received ViewChangeDone from primary {}'.format(
                    self, next_primary_name))
                self._has_view_change_from_primary = True

        return self._has_view_change_from_primary

    @property
    def has_acceptable_view_change_quorum(self):
        if not self._has_acceptable_view_change_quorum:
            self._has_acceptable_view_change_quorum = (
                self._hasViewChangeQuorum and
                (self._is_propagated_view_change_completed or
                 self.has_view_change_from_primary))
        return self._has_acceptable_view_change_quorum

    @property
    def is_behind_for_view(self) -> bool:
        # Checks if the node is currently behind the accepted state for this
        # view; only makes sense to call when the node has an acceptable
        # view change quorum
        _, accepted_ledger_summary = self.get_sufficient_same_view_change_done_messages()
        for (ledgerId, own_ledger_size, _), (_, accepted_ledger_size, _) in \
                zip(self.node.ledger_summary, accepted_ledger_summary):
            if own_ledger_size < accepted_ledger_size:
                logger.debug("{} ledger {} sizes differ: own {} accepted {}"
                             "".format(self, ledgerId, own_ledger_size,
                                       accepted_ledger_size))
                return True
        return False

    # __ PROPERTIES __

    # EXTERNAL EVENTS

    def on_master_degradation(self):
        """ """
        view_no = self.view_no + 1
        logger.info(
            "{} sending instance change with view_no = {} and trying to start "
            "view change since performance of master instance degraded"
            "".format(self, view_no))
        self.sendInstanceChange(view_no)
        self.do_view_change_if_possible(view_no)

    def on_primary_loss(self):
        view_no = self.view_no + 1
        logger.info("{} sending instance change with view_no = {} and trying "
                    "to start view change since primary was lost"
                    "".format(self, view_no))
        self.sendInstanceChange(view_no, Suspicions.PRIMARY_DISCONNECTED)
        self.do_view_change_if_possible(view_no)

    # TODO we have `on_primary_loss`, do we need that one?
    def on_primary_about_to_be_disconnected(self):
        view_no = self.view_no + 1
        logger.info("{} sending instance change with view_no = {} "
                    "since primary is about to be disconnected"
                    "".format(self, view_no))
        self.sendInstanceChange(view_no,
                                Suspicions.PRIMARY_ABOUT_TO_BE_DISCONNECTED)

    def on_suspicious_primary(self, suspicion: Suspicions):
        view_no = self.view_no + 1
        logger.info("{} sending instance change with view_no = {} since primary "
                    "seems suspicious, reason {}"
                    "".format(self, view_no, suspicion.reason))
        self.sendInstanceChange(view_no, suspicion)
        # TODO why don't we try to start view change here

    def on_view_change_not_completed_in_time(self):
        view_no = self.view_no + 1
        logger.info("{} sending instance change with view_no = {} since "
                    "view change to view {} is not completed in time"
                    "".format(self, view_no, self.view_no))
        self.sendInstanceChange(view_no, Suspicions.INSTANCE_CHANGE_TIMEOUT)

    def on_catchup_complete(self):
        if self.node.is_synced and self.node.master_replica.isPrimary is None and \
                not self._is_propagated_view_change_completed:
            self._send_view_change_done_message()
        self._start_selection()

    def on_future_view_vchd_msg(self, view_no, frm):
        assert view_no > self.view_no
        if view_no not in self._next_view_indications:
            self._next_view_indications[view_no] = set()
        self._next_view_indications[view_no].add(frm)
        self._start_view_change_if_possible(view_no)

    # __ EXTERNAL EVENTS __

    def process_instance_change_msg(self, instChg: InstanceChange, frm: str) -> None:
        """
        Validate and process an instance change request.

        :param instChg: the instance change request
        :param frm: the name of the node that sent this `msg`
        """
        logger.debug("{} received instance change request: {} from {}".format(
            self, instChg, frm))

        # TODO: add sender to blacklist?
        if not isinstance(instChg.viewNo, int):
            self.node.discard(
                instChg, "{}field view_no has incorrect type: {}".format(
                    VIEW_CHANGE_PREFIX, type(instChg.viewNo)))
        elif instChg.viewNo <= self.view_no:
            self.node.discard(
                instChg,
                "Received instance change request with view no {} "
                "which is not more than its view no {}".format(
                    instChg.viewNo, self.view_no), logger.info)
        else:
            # Record instance changes for views, but send an instance change
            # only when the master is found to be degraded. If a quorum of view
            # changes is found, change the view even if the master is not degraded.
            self._on_verified_instance_change_msg(instChg, frm)

            if self.instanceChanges.hasInstChngFrom(instChg.viewNo, self.name):
                logger.debug(
                    "{} received instance change message {} but has already "
                    "sent an instance change message".format(self, instChg))
            elif not self.node.monitor.isMasterDegraded():
                logger.debug(
                    "{} received instance change message {} but did not "
                    "find the master to be slow".format(self, instChg))
            else:
                logger.info(
                    "{}{} found master degraded after receiving instance change"
                    " message from {}".format(VIEW_CHANGE_PREFIX, self, frm))
                self.sendInstanceChange(instChg.viewNo)

    def process_vchd_msg(self, msg: ViewChangeDone, sender: str) -> bool:
        """
        Processes ViewChangeDone messages. Once n-f messages have been
        received, decides on a primary for a specific replica.

        :param msg: ViewChangeDone message
        :param sender: the name of the node from which this message was sent
        """
        logger.debug("{}'s primary selector started processing of "
                     "ViewChangeDone msg from {} : {}".format(
                         self.name, sender, msg))

        view_no = msg.viewNo

        if self.view_no != view_no:
            self.discard(msg,
                         '{} got Primary from {} for view no {} '
                         'whereas current view no is {}'.format(
                             self, sender, view_no, self.view_no),
                         logMethod=logger.debug)
            return False

        new_primary_name = msg.name
        if new_primary_name == self.previous_master_primary:
            self.discard(msg,
                         '{} got Primary from {} for {} who was primary of '
                         'master in previous view too'.format(
                             self, sender, new_primary_name),
                         logMethod=logger.debug)
            return False

        # Since a node can send ViewChangeDone more than once
        self._on_verified_view_change_done_msg(msg, sender)

        # TODO why do we check that after the message tracking
        if self.node.master_replica.hasPrimary:
            self.discard(
                msg, "it already decided primary which is {}".format(
                    self.node.master_replica.primaryName), logger.debug)
            return False

        self._start_selection()

    def send(self, msg):
        """
        Send a message to the node.

        :param msg: the message to send
        """
        logger.debug("{}'s view_changer sending {}".format(self.name, msg))
        self.outBox.append(msg)

    async def serviceQueues(self, limit=None) -> int:
        """
        Service at most `limit` messages from the inBox.

        :param limit: the maximum number of messages to service
        :return: the number of messages successfully processed
        """
        return await self.inBoxRouter.handleAll(self.inBox, limit)

    def sendInstanceChange(self, view_no: int,
                           suspicion=Suspicions.PRIMARY_DEGRADED):
        """
        Broadcast an instance change request to all the remaining nodes

        :param view_no: the view number when the instance change is requested
        """
        # An instance change message can be sent only if no instance change
        # message was sent in the last `ViewChangeWindowSize` seconds, or if
        # the last one was sent long enough ago.
        canSendInsChange, cooldown = self.insChngThrottler.acquire()
        if canSendInsChange:
            logger.info("{}{} sending an instance change with view_no {}"
                        " since {}".format(VIEW_CHANGE_PREFIX, self, view_no,
                                           suspicion.reason))
            logger.info("{}{} metrics for monitor: {}".format(
                MONITORING_PREFIX, self, self.node.monitor.prettymetrics))
            msg = self._create_instance_change_msg(view_no, suspicion.code)
            self.send(msg)
            # Record the instance change vote for self and try to change the
            # view if quorum is reached
            self._on_verified_instance_change_msg(msg, self.name)
        else:
            logger.debug(
                "{} cannot send instance change sooner than {} seconds".format(
                    self, cooldown))

    # noinspection PyAttributeOutsideInit
    def initInsChngThrottling(self):
        windowSize = self.node.config.ViewChangeWindowSize
        ratchet = Ratchet(a=2, b=0.05, c=1, base=2, peak=windowSize)
        self.insChngThrottler = Throttler(windowSize, ratchet.get)

    def _create_instance_change_msg(self, view_no, suspicion_code):
        return InstanceChange(view_no, suspicion_code)

    def _on_verified_instance_change_msg(self, msg, frm):
        view_no = msg.viewNo
        if not self.instanceChanges.hasInstChngFrom(view_no, frm):
            self.instanceChanges.addVote(msg, frm)
            if view_no > self.view_no:
                self.do_view_change_if_possible(view_no)

    def do_view_change_if_possible(self, view_no):
        # TODO: Need to handle skewed distributions which can arise due to
        # malicious nodes sending messages early on
        can, whyNot = self._canViewChange(view_no)
        if can:
            logger.info("{}{} initiating a view change to {} from {}".format(
                VIEW_CHANGE_PREFIX, self, view_no, self.view_no))
            self.propagate_primary = False
            self.startViewChange(view_no)
        else:
            logger.debug(whyNot)
        return can

    def _start_view_change_if_possible(self, view_no) -> bool:
        ind_count = len(self._next_view_indications[view_no])
        if self.quorums.propagate_primary.is_reached(ind_count):
            logger.info(
                '{}{} starting view change for {} after {} view change '
                'indications from other nodes'.format(VIEW_CHANGE_PREFIX, self,
                                                      view_no, ind_count))
            self.propagate_primary = True
            self.startViewChange(view_no)
            return True
        return False

    def _canViewChange(self, proposedViewNo: int) -> (bool, str):
        """
        Return whether there's a quorum for a view change to the proposed view
        number and the current view is lower than the proposed view
        """
        msg = None
        quorum = self.quorums.view_change.value
        if not self.instanceChanges.hasQuorum(proposedViewNo, quorum):
            msg = '{} has no quorum for view {}'.format(self, proposedViewNo)
        elif not proposedViewNo > self.view_no:
            msg = '{} is already in a view not lower than {}'.format(
                self, proposedViewNo)

        return not bool(msg), msg

    def startViewChange(self, proposed_view_no: int):
        """
        Trigger the view change process.

        :param proposed_view_no: the new view number after view change.
        """
        # TODO: consider moving this to pool manager
        # TODO: view change is a special case, which can have different
        # implementations - we need to make this logic pluggable

        for view_no in tuple(self._next_view_indications.keys()):
            if view_no > proposed_view_no:
                break
            self._next_view_indications.pop(view_no)

        self.view_no = proposed_view_no
        self.view_change_in_progress = True
        self.previous_master_primary = self.node.master_primary_name
        self.set_defaults()
        self.initInsChngThrottling()

        self.node.on_view_change_start()
        self.node.start_catchup()

    def _on_verified_view_change_done_msg(self, msg, frm):
        new_primary_name = msg.name
        ledger_summary = msg.ledgerInfo

        # TODO what is the case when a node sends several different
        # view change done messages
        data = (new_primary_name, ledger_summary)
        self._view_change_done[frm] = data

    def _start_selection(self):
        error = None

        if not self.node.is_synced:
            error = "mode is {}".format(self.node.mode)
        elif not self.has_acceptable_view_change_quorum:
            error = "has no view change quorum or no message from next primary"
        else:
            rv = self.get_sufficient_same_view_change_done_messages()
            if rv is None:
                error = "there are not sufficient same ViewChangeDone messages"
            elif not (self._is_propagated_view_change_completed or
                      self._verify_primary(*rv)):
                error = "failed to verify primary"

        if error is not None:
            logger.debug('{} cannot start primary selection because {}'.format(
                self, error))
            return

        if self.is_behind_for_view:
            logger.debug(
                '{} is synced and has an acceptable view change quorum '
                'but is behind the accepted state'.format(self))
            self.node.start_catchup()
            return

        logger.debug("{} starting selection".format(self))

        nodeReg = None
        # In case of an already completed view change,
        # use the node registry actual for the moment when it happened
        if self._is_propagated_view_change_completed:
            assert self._accepted_view_change_done_message is not None
            ledger_summary = self._accepted_view_change_done_message[1]
            pool_ledger_size = ledger_summary[POOL_LEDGER_ID][1]
            nodeReg = self.node.poolManager.getNodeRegistry(pool_ledger_size)

        self.node.select_primaries(nodeReg)

        if self.view_change_in_progress:
            self.view_change_in_progress = False
            self.node.on_view_change_complete()
            self.instanceChanges.pop(self.view_no - 1, None)
            self.previous_master_primary = None
            self.propagate_primary = False

    def set_defaults(self):
        # Tracks view change done messages
        self._view_change_done = {}  # replica name -> data

        # Set when an appropriate view change quorum is found which has
        # sufficient same ViewChangeDone messages
        self._primary_verified = False

        self._has_view_change_from_primary = False

        self._has_acceptable_view_change_quorum = False

        self._accepted_view_change_done_message = None

        # Accept any primary if propagating view change done msgs
        # for an already completed view change
        self._propagated_view_change_completed = False

    def get_sufficient_same_view_change_done_messages(self) -> Optional[Tuple]:
        # Returns whether we have a quorum of ViewChangeDone messages that are the same
        # TODO: Does not look like an optimal implementation.
        if self._accepted_view_change_done_message is None and \
                self._view_change_done:
            votes = self._view_change_done.values()
            votes = [(nm, tuple(tuple(i) for i in info)) for nm, info in votes]
            (new_primary, ledger_info), vote_count = mostCommonElement(votes)
            if vote_count >= self.quorum:
                logger.debug(
                    '{} found acceptable primary {} and ledger info {}'.format(
                        self, new_primary, ledger_info))
                self._accepted_view_change_done_message = (new_primary,
                                                           ledger_info)
            else:
                logger.debug('{} does not have acceptable primary, only {} '
                             'votes for {}'.format(self, vote_count,
                                                   (new_primary, ledger_info)))

        return self._accepted_view_change_done_message

    def _verify_primary(self, new_primary, ledger_info):
        """
        This method is called when a sufficient number of ViewChangeDone
        messages have been received and makes steps to switch to the new primary
        """
        expected_primary = self.node.elector.next_primary_node_name(0)
        if new_primary != expected_primary:
            logger.error("{}{} expected next primary to be {}, but majority "
                         "declared {} instead for view {}".format(
                             PRIMARY_SELECTION_PREFIX, self.name,
                             expected_primary, new_primary, self.view_no))
            return False

        self._primary_verified = True
        return True
        # TODO: check if ledger status is expected

    def _send_view_change_done_message(self):
        """
        Sends ViewChangeDone message to other protocol participants
        """
        new_primary_name = self.node.elector.next_primary_node_name(0)
        ledger_summary = self.node.ledger_summary
        message = ViewChangeDone(self.view_no,
                                 new_primary_name,
                                 ledger_summary)

        logger.debug("{} is sending ViewChangeDone msg to all : {}"
                     "".format(self, message))

        self.send(message)
        self._on_verified_view_change_done_msg(message, self.name)

    # overridden method of PrimaryDecider
    def get_msgs_for_lagged_nodes(self) -> List[ViewChangeDone]:
        # Should not return a list; only done for compatibility with the interface
        """
        Returns the last accepted `ViewChangeDone` message.
        If no view change has happened, returns a ViewChangeDone with view no 0
        for a newly joined node
        """
        # TODO: Consider a case where more than one node joins immediately;
        # then one of the nodes might not have an accepted
        # ViewChangeDone message
        messages = []
        accepted = self._accepted_view_change_done_message
        if accepted:
            messages.append(ViewChangeDone(self.view_no, *accepted))
        elif self.name in self._view_change_done:
            messages.append(
                ViewChangeDone(self.view_no,
                               *self._view_change_done[self.name]))
        else:
            logger.debug(
                '{} has no ViewChangeDone message to send for view {}'.format(
                    self, self.view_no))
        return messages
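# get_sufficient_same_view_change_done_messages above canonicalises each vote into a
# hashable (primary_name, ledger_info) tuple and accepts the most common one only if it
# reaches the quorum. The snippet below reproduces just that counting step with
# collections.Counter; it is an illustrative sketch, not the plenum mostCommonElement
# helper, and `pick_accepted_vote` plus the sample data are made up.
from collections import Counter


def pick_accepted_vote(view_change_done: dict, quorum: int):
    """Return (primary_name, ledger_info) if some identical vote reaches quorum, else None."""
    votes = [(nm, tuple(tuple(i) for i in info))
             for nm, info in view_change_done.values()]
    if not votes:
        return None
    (candidate, count), = Counter(votes).most_common(1)
    return candidate if count >= quorum else None


# Example: three of four nodes report the same primary and ledger summary
# (each ledger entry is a (ledger_id, size, root_hash) triple, as in is_behind_for_view).
sample = {
    'Alpha': ('Beta', ((0, 10, 'root0'), (1, 25, 'root1'))),
    'Gamma': ('Beta', ((0, 10, 'root0'), (1, 25, 'root1'))),
    'Delta': ('Beta', ((0, 10, 'root0'), (1, 25, 'root1'))),
    'Epsilon': ('Zeta', ((0, 9, 'rootX'), (1, 25, 'root1'))),
}
assert pick_accepted_vote(sample, quorum=3)[0] == 'Beta'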
def instance_changes(tconf): return InstanceChanges(tconf)
def test_old_ic_discard(instance_changes, tconf):
    frm = "Node1"
    view_no = 1
    quorum = 2
    time_provider = MockTimestamp(0)
    instance_changes = InstanceChanges(tconf, time_provider)
    msg = InstanceChange(view_no, Suspicions.PRIMARY_DEGRADED.code)

    time_provider.value = 0
    instance_changes.add_vote(msg, frm)
    time_provider.value += tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL + 1
    assert not instance_changes.has_view(view_no)

    instance_changes.add_vote(msg, frm)
    time_provider.value += tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL + 1
    assert not instance_changes.has_inst_chng_from(view_no, frm)

    instance_changes.add_vote(msg, frm)
    time_provider.value += tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL + 1
    assert not instance_changes.has_quorum(view_no, quorum)
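# The tests above drive expiry purely through a controllable clock: MockTimestamp is
# passed to InstanceChanges as its time provider and the test advances `.value` by hand.
# A minimal stand-in with that behaviour might look like the sketch below (an assumed
# shape, not the actual test helper; `FakeTimestamp` is a made-up name): a callable
# object whose return value the caller mutates directly.
class FakeTimestamp:
    def __init__(self, value=0):
        self.value = value

    def __call__(self):
        # The vote store is assumed to call its time provider to get "now".
        return self.value


# Usage mirrors the tests: advance time explicitly instead of sleeping.
now = FakeTimestamp(0)
now.value += 300
assert now() == 300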
class ViewChanger(HasActionQueue, MessageProcessor): def __init__(self, node): self.node = node self._view_no = 0 # type: int HasActionQueue.__init__(self) self.pre_vc_strategy = None if hasattr(self.config, 'PRE_VC_STRATEGY'): self.pre_vc_strategy = preVCStrategies.get(self.config.PRE_VC_STRATEGY)(self) self.inBox = deque() self.outBox = deque() self.inBoxRouter = Router( (InstanceChange, self.process_instance_change_msg), (ViewChangeDone, self.process_vchd_msg), (FutureViewChangeDone, self.process_future_view_vchd_msg) ) self.instanceChanges = InstanceChanges(node.config) # The quorum of `ViewChangeDone` msgs is different depending on whether we're doing a real view change, # or just propagating view_no and Primary from `CurrentState` messages sent to a newly joined Node. # TODO: separate real view change and Propagation of Primary # TODO: separate catch-up, view-change and primary selection so that # they are really independent. self.propagate_primary = False # Tracks if other nodes are indicating that this node is in lower view # than others. Keeps a map of view no to senders # TODO: Consider if sufficient ViewChangeDone for 2 different (and # higher views) are received, should one view change be interrupted in # between. self._next_view_indications = {} self._view_change_in_progress = False self.previous_view_no = None self.previous_master_primary = None self.set_defaults() self.initInsChngThrottling() # Action for _schedule instanceChange messages self.instance_change_action = None # Count of instance change rounds self.instance_change_rounds = 0 # Time for view_change_starting self.start_view_change_ts = 0 # Last successful viewNo. # In some cases view_change process can be uncompleted in time. # In that case we want to know, which viewNo was successful (last completed view_change) self.last_completed_view_no = 0 # Force periodic view change if enabled in config force_view_change_freq = node.config.ForceViewChangeFreq if force_view_change_freq > 0: self.startRepeating(self.on_master_degradation, force_view_change_freq) # Start periodic freshness check state_freshness_update_interval = node.config.STATE_FRESHNESS_UPDATE_INTERVAL if state_freshness_update_interval > 0: self.startRepeating(self.check_freshness, state_freshness_update_interval) def __repr__(self): return "{}".format(self.name) # PROPERTIES @property def view_no(self): return self._view_no @view_no.setter def view_no(self, value): logger.info("{} setting view no to {}".format(self.name, value)) self._view_no = value @property def name(self): return self.node.name @property def config(self): return self.node.config @property def quorums(self): return self.node.quorums @property def view_change_in_progress(self): return self._view_change_in_progress @view_change_in_progress.setter def view_change_in_progress(self, value): self._view_change_in_progress = value @property def quorum(self) -> int: # TODO: re-factor this, separate this two states (selection of a new # primary and propagation of existing one) if not self.view_change_in_progress: return self.node.quorums.propagate_primary.value if self.propagate_primary: return self.node.quorums.propagate_primary.value return self.node.quorums.view_change_done.value @property def _hasViewChangeQuorum(self): # This method should just be present for master instance. 
""" Checks whether n-f nodes completed view change and whether one of them is the next primary """ num_of_ready_nodes = len(self._view_change_done) diff = self.quorum - num_of_ready_nodes if diff > 0: logger.info('{} needs {} ViewChangeDone messages'.format(self, diff)) return False logger.info("{} got view change quorum ({} >= {})". format(self.name, num_of_ready_nodes, self.quorum)) return True @property def _is_propagated_view_change_completed(self): if not self._propagated_view_change_completed and \ self.node.poolLedger is not None and \ self.propagate_primary: accepted = self.get_sufficient_same_view_change_done_messages() if accepted is not None: accepted_pool_ledger_i = \ next(filter(lambda x: x[0] == POOL_LEDGER_ID, accepted[1])) self_pool_ledger_i = \ next(filter(lambda x: x[0] == POOL_LEDGER_ID, self.node.ledger_summary)) logger.info("{} Primary selection has been already completed " "on pool ledger info = {}, primary {}, self pool ledger info {}". format(self, accepted_pool_ledger_i, accepted[0], self_pool_ledger_i)) self._propagated_view_change_completed = True return self._propagated_view_change_completed @property def has_view_change_from_primary(self) -> bool: if not self._has_view_change_from_primary: next_primary_name = self.node.elector._next_primary_node_name_for_master() if next_primary_name not in self._view_change_done: logger.info("{} has not received ViewChangeDone from the next " "primary {} (view_no: {}, totalNodes: {})". format(self.name, next_primary_name, self.view_no, self.node.totalNodes)) else: logger.info('{} received ViewChangeDone from primary {}'.format(self, next_primary_name)) self._has_view_change_from_primary = True return self._has_view_change_from_primary @property def has_acceptable_view_change_quorum(self): if not self._has_acceptable_view_change_quorum: self._has_acceptable_view_change_quorum = ( self._hasViewChangeQuorum and (self._is_propagated_view_change_completed or self.has_view_change_from_primary) ) return self._has_acceptable_view_change_quorum @property def is_behind_for_view(self) -> bool: # Checks if the node is currently behind the accepted state for this # view, only makes sense to call when the node has an acceptable # view change quorum _, accepted_ledger_summary = self.get_sufficient_same_view_change_done_messages() for (ledgerId, own_ledger_size, _), (_, accepted_ledger_size, _) in \ zip(self.node.ledger_summary, accepted_ledger_summary): if own_ledger_size < accepted_ledger_size: logger.info("{} ledger {} sizes are differ: own {} accepted {}". format(self, ledgerId, own_ledger_size, accepted_ledger_size)) return True return False # __ PROPERTIES __ # EXTERNAL EVENTS def on_master_degradation(self): self.propose_view_change() def check_freshness(self): if self.is_state_fresh_enough(): logger.debug("{} not sending instance change because found state to be fresh enough".format(self)) return self.propose_view_change(Suspicions.STATE_SIGS_ARE_NOT_UPDATED) def send_instance_change_if_needed(self, proposed_view_no, reason): can, whyNot = self._canViewChange(proposed_view_no) # if scheduled action will be awakened after view change completed, # then this action must be stopped also. 
if not can and self.view_no < proposed_view_no and self.is_primary_disconnected(): # Resend the same instance change message if we are not archive # InstanceChange quorum logger.info("Resend instance change message to all recipients") self.sendInstanceChange(proposed_view_no, reason) self._schedule(action=self.instance_change_action, seconds=self.config.INSTANCE_CHANGE_TIMEOUT) logger.info("Count of rounds without quorum of " "instance change messages: {}".format(self.instance_change_rounds)) self.instance_change_rounds += 1 else: # ViewChange procedure was started, therefore stop scheduling # resending instanceChange messages logger.info("Stop scheduling instance change resending") self._cancel(action=self.instance_change_action) self.instance_change_action = None self.instance_change_rounds = 0 def on_primary_loss(self): view_no = self.propose_view_change(Suspicions.PRIMARY_DISCONNECTED) if self.instance_change_action: # It's an action, scheduled for previous instanceChange round logger.info("Stop previous instance change resending schedule") self._cancel(action=self.instance_change_action) self.instance_change_rounds = 0 self.instance_change_action = partial(self.send_instance_change_if_needed, view_no, Suspicions.PRIMARY_DISCONNECTED) self._schedule(self.instance_change_action, seconds=self.config.INSTANCE_CHANGE_TIMEOUT) # TODO we have `on_primary_loss`, do we need that one? def on_primary_about_to_be_disconnected(self): self.propose_view_change(Suspicions.PRIMARY_ABOUT_TO_BE_DISCONNECTED) def on_suspicious_primary(self, suspicion: Suspicions): self.propose_view_change(suspicion) def on_view_change_not_completed_in_time(self): self.propose_view_change(Suspicions.INSTANCE_CHANGE_TIMEOUT) def on_catchup_complete(self): if self.node.is_synced and self.node.master_replica.isPrimary is None and \ not self._is_propagated_view_change_completed: self._send_view_change_done_message() self._start_selection() def process_future_view_vchd_msg(self, future_vcd_msg: FutureViewChangeDone, frm): # if we already started a view change then do not decide on a new one if self.view_change_in_progress: return from_current_state = future_vcd_msg.from_current_state view_no = future_vcd_msg.vcd_msg.viewNo # ToDo maybe we should compare with last_completed_view_no instead of viewNo. if not ((view_no > self.view_no) or (self.view_no == 0 and from_current_state)): # it means we already processed this future View Change Done return if self.view_no == 0 and from_current_state: # This is the first Propagate Primary, # so we need to make sure that we connected to the real primary for the proposed view # see test_view_change_after_back_to_quorum_with_disconnected_primary self.node.schedule_initial_propose_view_change() if view_no not in self._next_view_indications: self._next_view_indications[view_no] = {} self._next_view_indications[view_no][frm] = future_vcd_msg.vcd_msg self._start_view_change_if_possible(view_no, propagate_primary=from_current_state) # __ EXTERNAL EVENTS __ def process_instance_change_msg(self, instChg: InstanceChange, frm: str) -> None: """ Validate and process an instance change request. :param instChg: the instance change request :param frm: the name of the node that sent this `msg` """ if frm not in self.node.nodestack.connecteds: self.node.discard( instChg, "received instance change request: {} from {} " "which is not in connected list: {}". 
format(instChg, frm, self.node.nodestack.connecteds), logger.info) return logger.info("{} received instance change request: {} from {}".format(self, instChg, frm)) # TODO: add sender to blacklist? if not isinstance(instChg.viewNo, int): self.node.discard( instChg, "{}field view_no has incorrect type: {}". format(VIEW_CHANGE_PREFIX, type(instChg.viewNo))) elif instChg.viewNo <= self.view_no: self.node.discard( instChg, "Received instance change request with view no {} " "which is not more than its view no {}". format(instChg.viewNo, self.view_no), logger.info) else: # Record instance changes for views but send instance change # only when found master to be degraded. if quorum of view changes # found then change view even if master not degraded self._on_verified_instance_change_msg(instChg, frm) if self.instanceChanges.has_inst_chng_from(instChg.viewNo, self.name): logger.info("{} received instance change message {} but has already " "sent an instance change message".format(self, instChg)) elif not self.node.monitor.isMasterDegraded(): logger.info("{} received instance change message {} but did not " "find the master to be slow".format(self, instChg)) else: logger.display("{}{} found master degraded after receiving instance change" " message from {}".format(VIEW_CHANGE_PREFIX, self, frm)) self.sendInstanceChange(instChg.viewNo) def process_vchd_msg(self, msg: ViewChangeDone, sender: str) -> bool: """ Processes ViewChangeDone messages. Once n-f messages have been received, decides on a primary for specific replica. :param msg: ViewChangeDone message :param sender: the name of the node from which this message was sent """ logger.info("{}'s primary selector started processing of ViewChangeDone msg from {} : {}". format(self.name, sender, msg)) view_no = msg.viewNo if self.view_no != view_no: self.discard(msg, '{} got Primary from {} for view no {} ' 'whereas current view no is {}'. format(self, sender, view_no, self.view_no), logMethod=logger.info) return False new_primary_name = msg.name if new_primary_name == self.previous_master_primary: self.discard(msg, '{} got Primary from {} for {} who was primary of ' 'master in previous view too'. format(self, sender, new_primary_name), logMethod=logger.info) return False # Since a node can send ViewChangeDone more than one time self._on_verified_view_change_done_msg(msg, sender) # TODO why do we check that after the message tracking if self.node.master_replica.hasPrimary: self.discard(msg, "it already decided primary which is {}". format(self.node.master_replica.primaryName), logger.info) return False self._start_selection() def send(self, msg): """ Send a message to the node. :param msg: the message to send """ logger.debug("{}'s view_changer sending {}".format(self.name, msg)) self.outBox.append(msg) async def serviceQueues(self, limit=None) -> int: """ Service at most `limit` messages from the inBox. :param limit: the maximum number of messages to service :return: the number of messages successfully processed """ # do not start any view changes until catch-up is finished! 
if not Mode.is_done_syncing(self.node.mode): return 0 return await self.inBoxRouter.handleAll(self.inBox, limit) def sendInstanceChange(self, view_no: int, suspicion=Suspicions.PRIMARY_DEGRADED): """ Broadcast an instance change request to all the remaining nodes :param view_no: the view number when the instance change is requested """ # If not found any sent instance change messages in last # `ViewChangeWindowSize` seconds or the last sent instance change # message was sent long enough ago then instance change message can be # sent otherwise no. canSendInsChange, cooldown = self.insChngThrottler.acquire() if canSendInsChange: logger.info( "{}{} sending an instance change with view_no {}" " since {}".format( VIEW_CHANGE_PREFIX, self, view_no, suspicion.reason)) logger.info("{}{} metrics for monitor: {}" .format(MONITORING_PREFIX, self, self.node.monitor.prettymetrics)) msg = self._create_instance_change_msg(view_no, suspicion.code) self.send(msg) # record instance change vote for self and try to change the view # if quorum is reached self._on_verified_instance_change_msg(msg, self.name) else: logger.info("{} cannot send instance change sooner then {} seconds".format(self, cooldown)) # noinspection PyAttributeOutsideInit def initInsChngThrottling(self): windowSize = self.node.config.ViewChangeWindowSize ratchet = Ratchet(a=2, b=0.05, c=1, base=2, peak=windowSize) self.insChngThrottler = Throttler(windowSize, ratchet.get) def _create_instance_change_msg(self, view_no, suspicion_code): return InstanceChange(view_no, suspicion_code) def _on_verified_instance_change_msg(self, msg, frm): view_no = msg.viewNo if not self.instanceChanges.has_inst_chng_from(view_no, frm): self.instanceChanges.add_vote(msg, frm) if view_no > self.view_no: self.do_view_change_if_possible(view_no) def do_view_change_if_possible(self, view_no): # TODO: Need to handle skewed distributions which can arise due to # malicious nodes sending messages early on can, whyNot = self._canViewChange(view_no) if can: logger.display("{}{} initiating a view change to {} from {}". format(VIEW_CHANGE_PREFIX, self, view_no, self.view_no)) self.propagate_primary = False self.startViewChange(view_no) else: logger.info(whyNot) return can def _qourum_is_reached(self, count, propagate_primary): if propagate_primary: return self.quorums.propagate_primary.is_reached(count) else: return self.quorums.view_change_done.is_reached(count) def _start_view_change_if_possible(self, view_no, propagate_primary=False) -> bool: ind_count = len(self._next_view_indications[view_no]) if self._qourum_is_reached(ind_count, propagate_primary): logger.display('{}{} starting view change for {} after {} view change ' 'indications from other nodes'.format(VIEW_CHANGE_PREFIX, self, view_no, ind_count)) self.propagate_primary = propagate_primary self.startViewChange(view_no) return True return False def _canViewChange(self, proposedViewNo: int) -> (bool, str): """ Return whether there's quorum for view change for the proposed view number and its view is less than or equal to the proposed view """ msg = None quorum = self.quorums.view_change.value if not self.instanceChanges.has_quorum(proposedViewNo, quorum): msg = '{} has no quorum for view {}'.format(self, proposedViewNo) elif not proposedViewNo > self.view_no: msg = '{} is in higher view more than {}'.format( self, proposedViewNo) return not bool(msg), msg def startViewChange(self, proposed_view_no: int, continue_vc=False): """ Trigger the view change process. 
:param proposed_view_no: the new view number after view change. """ # TODO: consider moving this to pool manager # TODO: view change is a special case, which can have different # implementations - we need to make this logic pluggable if self.pre_vc_strategy and (not self.propagate_primary) and (not continue_vc): self.pre_vc_strategy.prepare_view_change(proposed_view_no) return elif self.pre_vc_strategy: self.pre_vc_strategy.on_strategy_complete() self.previous_view_no = self.view_no self.view_no = proposed_view_no self.view_change_in_progress = True self.previous_master_primary = self.node.master_primary_name self.set_defaults() self._process_vcd_for_future_view() self.initInsChngThrottling() self.node.on_view_change_start() self.node.start_catchup() def _process_vcd_for_future_view(self): # make sure that all received VCD messages for future view # (including the current view) are stored, as they will be needed for a quorum # to finish the View Change and start selection. # This is especially critical for Propagate Primary mode (on receiving CURRENT_STATE by a new node). if self.view_no in self._next_view_indications: for frm, vcd in self._next_view_indications[self.view_no].items(): # we call _on_verified_view_change_done_msg, not process_vchd_msg, # since we may be in propagate primary mode where some of validation inside process_vchd_msg # is not correct (for example, checking that the new primary differs from the current one) self._on_verified_view_change_done_msg(vcd, frm) # remove all for previous views for view_no in tuple(self._next_view_indications.keys()): if view_no <= self.view_no: del self._next_view_indications[view_no] def _on_verified_view_change_done_msg(self, msg, frm): new_primary_name = msg.name ledger_summary = msg.ledgerInfo # TODO what is the case when node sends several different # view change done messages data = (new_primary_name, ledger_summary) self._view_change_done[frm] = data def _start_selection(self): error = None if not self.node.is_synced: error = "mode is {}".format(self.node.mode) elif not self.has_acceptable_view_change_quorum: error = "has no view change quorum or no message from next primary" else: rv = self.get_sufficient_same_view_change_done_messages() if rv is None: error = "there are not sufficient same ViewChangeDone messages" elif not (self._is_propagated_view_change_completed or self._verify_primary(*rv)): error = "failed to verify primary" if error is not None: logger.info('{} cannot start primary selection because {}'.format(self, error)) return if self.is_behind_for_view: logger.info('{} is synced and has an acceptable view change quorum ' 'but is behind the accepted state'.format(self)) self.node.start_catchup() return logger.info("{} starting selection".format(self)) nodeReg = None # in case of already completed view change # use node registry actual for the moment when it happened if self._is_propagated_view_change_completed: assert self._accepted_view_change_done_message is not None ledger_summary = self._accepted_view_change_done_message[1] pool_ledger_size = ledger_summary[POOL_LEDGER_ID][1] nodeReg = self.node.poolManager.getNodeRegistry(pool_ledger_size) if self.view_change_in_progress: self.node.backup_instance_faulty_processor.restore_replicas() self.node.select_primaries(nodeReg) if self.view_change_in_progress: self.view_change_in_progress = False self.node.on_view_change_complete() # when we had INSTANCE_CHANGE message, they added into instanceChanges # by msg.view_no. 
When view change was occured and view_no is changed, # then we should delete all INSTANCE_CHANGE messages with current (already changed) # view_no (which used in corresponded INSTANCE_CHANGE messages) # Therefore we delete all INSTANCE_CHANGE messages from previous and current view number for view_number in list(self.instanceChanges.keys()): if view_number <= self.view_no: self.instanceChanges.pop(view_number, None) self.previous_view_no = None self.previous_master_primary = None self.propagate_primary = False def set_defaults(self): # Tracks view change done message self._view_change_done = {} # replica name -> data # Set when an appropriate view change quorum is found which has # sufficient same ViewChangeDone messages self._primary_verified = False self._has_view_change_from_primary = False self._has_acceptable_view_change_quorum = False self._accepted_view_change_done_message = None # accept any primary if propagating view change done msgs # for already completed view change self._propagated_view_change_completed = False def get_sufficient_same_view_change_done_messages(self) -> Optional[Tuple]: # Returns whether has a quorum of ViewChangeDone messages that are same # TODO: Does not look like optimal implementation. if self._accepted_view_change_done_message is None and \ self._view_change_done: votes = self._view_change_done.values() votes = [(nm, tuple(tuple(i) for i in info)) for nm, info in votes] (new_primary, ledger_info), vote_count = mostCommonElement(votes) if vote_count >= self.quorum: logger.info('{} found acceptable primary {} and ledger info {}'. format(self, new_primary, ledger_info)) self._accepted_view_change_done_message = (new_primary, ledger_info) else: logger.info('{} does not have acceptable primary, only {} votes for {}'. format(self, vote_count, (new_primary, ledger_info))) return self._accepted_view_change_done_message def _verify_primary(self, new_primary, ledger_info): """ This method is called when sufficient number of ViewChangeDone received and makes steps to switch to the new primary """ expected_primary = self.node.elector._next_primary_node_name_for_master() if new_primary != expected_primary: logger.error("{}{} expected next primary to be {}, but majority " "declared {} instead for view {}" .format(PRIMARY_SELECTION_PREFIX, self.name, expected_primary, new_primary, self.view_no)) return False self._primary_verified = True return True # TODO: check if ledger status is expected def _send_view_change_done_message(self): """ Sends ViewChangeDone message to other protocol participants """ new_primary_name = self.node.elector._next_primary_node_name_for_master() ledger_summary = self.node.ledger_summary message = ViewChangeDone(self.view_no, new_primary_name, ledger_summary) logger.info("{} is sending ViewChangeDone msg to all : {}".format(self, message)) self.send(message) self._on_verified_view_change_done_msg(message, self.name) # overridden method of PrimaryDecider def get_msgs_for_lagged_nodes(self) -> List[ViewChangeDone]: # Should not return a list, only done for compatibility with interface """ Returns the last accepted `ViewChangeDone` message. 
If no view change has happened returns ViewChangeDone with view no 0 to a newly joined node """ # TODO: Consider a case where more than one node joins immediately, # then one of the node might not have an accepted # ViewChangeDone message messages = [] accepted = self._accepted_view_change_done_message if accepted: messages.append(ViewChangeDone(self.last_completed_view_no, *accepted)) elif self.name in self._view_change_done: messages.append(ViewChangeDone(self.last_completed_view_no, *self._view_change_done[self.name])) else: logger.info('{} has no ViewChangeDone message to send for view {}'. format(self, self.view_no)) return messages def propose_view_change(self, suspicion=Suspicions.PRIMARY_DEGRADED): proposed_view_no = self.view_no # TODO: For some reason not incrementing view_no in most cases leads to lots of failing/flaky tests # if suspicion == Suspicions.INSTANCE_CHANGE_TIMEOUT or not self.view_change_in_progress: if suspicion != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self.view_change_in_progress: proposed_view_no += 1 self.sendInstanceChange(proposed_view_no, suspicion) return proposed_view_no def is_primary_disconnected(self): return \ self.node.primaries_disconnection_times[self.node.master_replica.instId] and self.node.master_primary_name and \ self.node.master_primary_name not in self.node.nodestack.conns def is_state_fresh_enough(self): replica = self.node.master_replica timestamps = replica.get_ledgers_last_update_time().values() oldest_timestamp = min(timestamps) time_elapsed = replica.get_time_for_3pc_batch() - oldest_timestamp threshold = self.config.ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * self.config.STATE_FRESHNESS_UPDATE_INTERVAL return time_elapsed < threshold
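# is_state_fresh_enough above boils down to simple arithmetic: take the oldest
# last-update timestamp across ledgers and require that the time elapsed since then is
# below ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * STATE_FRESHNESS_UPDATE_INTERVAL. The
# standalone version below is only an illustration; the function name and the default
# values (3 intervals of 300 seconds) are assumptions, not the real config defaults.
def state_fresh_enough(last_update_times, now,
                       intervals_count=3, update_interval=300):
    """With the assumed values, state is considered stale after 900s without updates."""
    oldest = min(last_update_times)
    return (now - oldest) < intervals_count * update_interval


# One ledger last updated 1000s ago => not fresh; all updated within 200s => fresh.
assert not state_fresh_enough([2000, 2800], now=3000)
assert state_fresh_enough([2800, 2900], now=3000)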
class Node(HasActionQueue, NodeStacked, ClientStacked, Motor, Propagator, MessageProcessor): suspicions = {s.code: s.reason for s in Suspicions.getList()} def __init__(self, name: str, nodeRegistry: Dict[str, HA], clientAuthNr: ClientAuthNr=None, ha: HA=None, cliname: str=None, cliha: HA=None, basedirpath: str=None, primaryDecider: PrimaryDecider = None, opVerifiers: Iterable[Any]=None): """ Create a new node. :param nodeRegistry: names and host addresses of all nodes in the pool :param clientAuthNr: client authenticator implementation to be used :param basedirpath: path to the base directory used by `nstack` and `cstack` :param primaryDecider: the mechanism to be used to decide the primary of a protocol instance """ self.opVerifiers = opVerifiers or [] self.primaryDecider = primaryDecider me = nodeRegistry[name] self.allNodeNames = list(nodeRegistry.keys()) if isinstance(me, NodeDetail): sha = me.ha scliname = me.cliname scliha = me.cliha nodeReg = {k: v.ha for k, v in nodeRegistry.items()} else: sha = me if isinstance(me, HA) else HA(*me) scliname = None scliha = None nodeReg = {k: HA(*v) for k, v in nodeRegistry.items()} if not ha: # pull it from the registry ha = sha if not cliname: # default to the name plus the suffix cliname = scliname if scliname else name + CLIENT_STACK_SUFFIX if not cliha: # default to same ip, port + 1 cliha = scliha if scliha else HA(ha[0], ha[1]+1) nstack = dict(name=name, ha=ha, main=True, auto=AutoMode.never) cstack = dict(name=cliname, ha=cliha, main=True, auto=AutoMode.always) if basedirpath: nstack['basedirpath'] = basedirpath cstack['basedirpath'] = basedirpath self.clientAuthNr = clientAuthNr or SimpleAuthNr() self.nodeInBox = deque() self.clientInBox = deque() self.created = time.perf_counter() HasActionQueue.__init__(self) NodeStacked.__init__(self, nstack, nodeReg) ClientStacked.__init__(self, cstack) Motor.__init__(self) Propagator.__init__(self) self.totalNodes = len(nodeRegistry) self.f = getMaxFailures(self.totalNodes) self.requiredNumberOfInstances = self.f + 1 # per RBFT self.minimumNodes = (2 * self.f) + 1 # minimum for a functional pool self.txnStore = TransactionStore() self.replicas = [] # type: List[replica.Replica] self.instanceChanges = InstanceChanges() self.viewNo = 0 # type: int self.rank = self.getRank(self.name, nodeRegistry) self.elector = None # type: PrimaryDecider self.forwardedRequests = set() # type: Set[Tuple[(str, int)]] self.instances = Instances() self.monitor = Monitor(self.name, Delta=.8, Lambda=60, Omega=5, instances=self.instances) # Requests that are to be given to the replicas by the node. 
Each # element of the list is a deque for the replica with number equal to # its index in the list and each element of the deque is a named tuple self.msgsToReplicas = [] # type: List[deque] # Requests that are to be given to the elector by the node self.msgsToElector = deque() nodeRoutes = [(Propagate, self.processPropagate), (InstanceChange, self.processInstanceChange)] nodeRoutes.extend((msgTyp, self.sendToElector) for msgTyp in [Nomination, Primary, Reelection]) nodeRoutes.extend((msgTyp, self.sendToReplica) for msgTyp in [PrePrepare, Prepare, Commit]) self.nodeMsgRouter = Router(*nodeRoutes) self.clientMsgRouter = Router((Request, self.processRequest)) self.perfCheckFreq = 10 self._schedule(self.checkPerformance, self.perfCheckFreq) self.clientBlacklister = SimpleBlacklister( self.name + CLIENT_BLACKLISTER_SUFFIX) # type: Blacklister self.nodeBlacklister = SimpleBlacklister( self.name + NODE_BLACKLISTER_SUFFIX) # type: Blacklister # BE CAREFUL HERE # This controls which message types are excluded from signature # verification. These are still subject to RAET's signature verification # but client signatures will not be checked on these. Expressly # prohibited from being in this is ClientRequest and Propagation, # which both require client signature verification self.authnWhitelist = (Nomination, Primary, Reelection, Batch, PrePrepare, Prepare, Commit, InstanceChange) self.addReplicas() def start(self): oldstatus = self.status super().start() if oldstatus in Status.going(): logger.info("{} is already {}, so start has no effect". format(self, self.status.name)) else: self.startNodestack() self.startClientstack() self.elector = self.newPrimaryDecider() # if first time running this node if not self.nodestack.remotes: logger.info("{} first time running; waiting for key sharing..." "".format(self)) else: self.maintainConnections() @staticmethod def getRank(name: str, allNames: Sequence[str]): return sorted(allNames).index(name) def newPrimaryDecider(self): if self.primaryDecider: return self.primaryDecider else: return primary_elector.PrimaryElector(self) @property def nodeCount(self) -> int: """ The plus one is for this node, for example, if this node has three connections, then there would be four total nodes :return: number of connected nodes this one """ return len(self._conns) + 1 def onStopping(self): """ Actions to be performed on stopping the node. - Close the UDP socket of the nodestack """ if self.nodestack: self.nodestack.close() self.nodestack = None if self.clientstack: self.clientstack.close() self.clientstack = None self.reset() self.logstats() self.conns.clear() def reset(self): logger.info("{} reseting...".format(self), extra={"cli": False}) self.nextCheck = 0 self.aqStash.clear() self.actionQueue.clear() self.elector = None async def prod(self, limit: int=None) -> int: """ This function is executed by the node each time it gets its share of CPU time from the event loop. :param limit: the number of items to be serviced in this attempt :return: total number of messages serviced by this node """ await self.serviceLifecycle() c = 0 if self.status is not Status.stopped: c += await self.serviceNodeMsgs(limit) c += self.serviceReplicas(limit) c += await self.serviceClientMsgs(limit) c += self._serviceActions() c += await self.serviceElector() self.flushOutBoxes() return c def serviceReplicas(self, limit) -> int: """ Execute `serviceReplicaMsgs`, `serviceReplicaOutBox` and `serviceReplicaInBox` with `limit` number of messages. See the respective functions for more information. 
    def serviceReplicas(self, limit) -> int:
        """
        Execute `serviceReplicaMsgs`, `serviceReplicaOutBox` and
        `serviceReplicaInBox` with `limit` number of messages. See the
        respective functions for more information.

        :param limit: the maximum number of messages to process
        :return: the sum of messages successfully processed by
            serviceReplicaMsgs, serviceReplicaInBox and serviceReplicaOutBox
        """
        a = self.serviceReplicaMsgs(limit)
        b = self.serviceReplicaOutBox(limit)
        c = self.serviceReplicaInBox(limit)
        return a + b + c

    async def serviceNodeMsgs(self, limit: int) -> int:
        """
        Process `limit` number of messages from the nodeInBox.

        :param limit: the maximum number of messages to process
        :return: the number of messages successfully processed
        """
        n = await self.nodestack.service(limit)
        await self.processNodeInBox()
        return n

    async def serviceClientMsgs(self, limit: int) -> int:
        """
        Process `limit` number of messages from the clientInBox.

        :param limit: the maximum number of messages to process
        :return: the number of messages successfully processed
        """
        c = await self.clientstack.service(limit)
        await self.processClientInBox()
        return c

    async def serviceElector(self) -> int:
        """
        Service the elector's inBox, outBox and action queues.

        :return: the number of messages successfully serviced
        """
        if not self.isReady():
            return 0
        o = self.serviceElectorOutBox()
        i = await self.serviceElectorInbox()
        a = self.elector._serviceActions()
        return o + i + a

    def onConnsChanged(self, newConns: Set[str], staleConns: Set[str]):
        """
        A series of operations to perform once the connection count has
        changed.

        - Set f to the max number of failures this system can handle.
        - Set status to one of started, started_hungry or starting depending
          on the number of connected nodes.
        - Check protocol instances. See `checkInstances()`.
        """
        if self.isGoing():
            if self.nodeCount >= self.totalNodes:
                self.status = Status.started
                self.stopKeySharing()
            elif self.nodeCount >= self.minimumNodes:
                self.status = Status.started_hungry
            else:
                self.status = Status.starting
        self.elector.nodeCount = self.nodeCount

        if self.isReady():
            self.checkInstances()
            if isinstance(self.elector, PrimaryElector):
                msgs = self.elector.getElectionMsgsForLaggedNodes()
                logger.debug("{} has msgs {} for new nodes {}".
                             format(self, msgs, newConns))
                for n in newConns:
                    self.sendElectionMsgsToLaggedNode(n, msgs)

    def sendElectionMsgsToLaggedNode(self, nodeName: str, msgs: List[Any]):
        rid = self.nodestack.getRemote(nodeName).uid
        for msg in msgs:
            logger.debug("{} sending election message {} to lagged node {}"
                         .format(self, msg, nodeName))
            self.send(msg, rid)

    def _statusChanged(self, old: Status, new: Status) -> None:
        """
        Perform some actions based on whether this node is ready or not.

        :param old: the previous status
        :param new: the current status
        """
        pass

    def checkInstances(self) -> None:
        """
        Check if this node has the minimum required number of protocol
        instances, i.e. f+1. If not, add a replica. If no election is in
        progress, this node will try to nominate one of its replicas as
        primary. This method is called whenever a connection with a new node
        is established.
        """
        logger.debug("{} choosing to start election on the basis of count {} "
                     "and nodes {}".format(self, self.nodeCount, self.conns))
        self._schedule(self.decidePrimaries)

    def addReplicas(self):
        while len(self.replicas) < self.requiredNumberOfInstances:
            self.addReplica()

    def decidePrimaries(self):
        """
        Choose the primary replica for each protocol instance in the system
        using a PrimaryDecider.
        """
        self.elector.decidePrimaries()
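    # --- Illustrative sketch (not part of the original class) -------------
    # Worked example of the connection-count thresholds used by
    # onConnsChanged above, assuming the usual BFT bound f = (N - 1) // 3,
    # which getMaxFailures is expected to provide. For a 4-node pool:
    # f = 1 and minimumNodes = 2f + 1 = 3, so the node is `started` with all
    # 4 nodes connected, `started_hungry` with 3, and `starting` below that.
    @staticmethod
    def _example_status_for(connected: int, total: int = 4) -> Status:
        f = (total - 1) // 3           # assumed behaviour of getMaxFailures
        minimum = (2 * f) + 1          # minimum for a functional pool
        if connected >= total:
            return Status.started
        if connected >= minimum:
            return Status.started_hungry
        return Status.starting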
    def createReplica(self, instId: int, isMaster: bool) -> 'replica.Replica':
        """
        Create a new replica with the specified parameters.

        This is a convenience method used to create replicas from a node
        instead of passing in replicas in the Node's constructor.

        :param instId: protocol instance number
        :param isMaster: does this replica belong to the master protocol
            instance?
        :return: a new instance of Replica
        """
        return replica.Replica(self, instId, isMaster)

    def addReplica(self):
        """
        Create and add a new replica to this node. If this is the first
        replica on this node, it will belong to the Master protocol instance.
        """
        instId = len(self.replicas)
        if len(self.replicas) == 0:
            isMaster = True
            instDesc = "master"
        else:
            isMaster = False
            instDesc = "backup"
        replica = self.createReplica(instId, isMaster)
        self.replicas.append(replica)
        self.msgsToReplicas.append(deque())
        self.monitor.addInstance()
        logger.info("{} added replica {} to instance {} ({})".
                    format(self, replica, instId, instDesc),
                    extra={"cli": True})
        return replica

    def serviceReplicaMsgs(self, limit: int=None) -> int:
        """
        Process `limit` number of replica messages. Here processing means
        appending to the replica inbox.

        :param limit: the maximum number of replica messages to process
        :return: the number of replica messages processed
        """
        msgCount = 0
        for idx, replicaMsgs in enumerate(self.msgsToReplicas):
            while replicaMsgs and (not limit or msgCount < limit):
                msgCount += 1
                msg = replicaMsgs.popleft()
                self.replicas[idx].inBox.append(msg)
        return msgCount

    def serviceReplicaOutBox(self, limit: int=None) -> int:
        """
        Process `limit` number of messages from the replica outboxes. Here
        processing means sending consensus messages to other nodes, and
        handling Ordered messages and escalated exceptions on this node.

        :param limit: the maximum number of replica messages to process
        :return: the number of replica messages processed
        """
        msgCount = 0
        for replica in self.replicas:
            while replica.outBox and (not limit or msgCount < limit):
                msgCount += 1
                msg = replica.outBox.popleft()
                if isinstance(msg, (PrePrepare, Prepare, Commit)):
                    self.send(msg)
                elif isinstance(msg, Ordered):
                    self.processOrdered(msg)
                elif isinstance(msg, Exception):
                    self.processEscalatedException(msg)
                else:
                    logger.error("Received msg {} and don't know how to "
                                 "handle it".format(msg))
        return msgCount

    def serviceReplicaInBox(self, limit: int=None):
        """
        Process `limit` number of messages in the replica inbox for each
        replica on this node.

        :param limit: the maximum number of replica messages to process
        :return: the number of replica messages processed successfully
        """
        msgCount = 0
        for replica in self.replicas:
            msgCount += replica.serviceQueues(limit)
        return msgCount

    def serviceElectorOutBox(self, limit: int=None) -> int:
        """
        Service at most `limit` number of messages from the elector's outBox.

        :return: the number of messages successfully serviced.
        """
        msgCount = 0
        while self.elector.outBox and (not limit or msgCount < limit):
            msgCount += 1
            msg = self.elector.outBox.popleft()
            if isinstance(msg, (Nomination, Primary, Reelection)):
                self.send(msg)
            elif isinstance(msg, BlacklistMsg):
                nodeName = getattr(msg, f.NODE_NAME.nm)
                code = getattr(msg, f.SUSP_CODE.nm)
                self.reportSuspiciousNode(nodeName, code=code)
            else:
                logger.error("Received msg {} and don't know how to handle it".
                             format(msg))
        return msgCount

    async def serviceElectorInbox(self, limit: int=None) -> int:
        """
        Service at most `limit` number of messages from the elector's inBox.

        :return: the number of messages successfully serviced.
""" msgCount = 0 while self.msgsToElector and (not limit or msgCount < limit): msgCount += 1 msg = self.msgsToElector.popleft() self.elector.inBox.append(msg) await self.elector.serviceQueues(limit) return msgCount @property def hasPrimary(self) -> bool: """ Does this node have a primary replica? :return: whether this node has a primary """ return any(replica.isPrimary for replica in self.replicas) @property def primaryReplicaNo(self) -> Optional[int]: """ Return the index of the primary or None if there's no primary among the replicas on this node. :return: index of the primary """ for idx, replica in enumerate(self.replicas): if replica.isPrimary: return idx return None def isValidNodeMsg(self, msg) -> bool: """ Return whether the node message is valid. :param msg: the node message to validate """ if msg.instId >= len(self.msgsToReplicas): # TODO should we raise suspicion here? self.discard(msg, "non-existent protocol instance {}" .format(msg.instId)) return False return True def sendToReplica(self, msg, frm): """ Send the message to the intended replica. :param msg: the message to send :param frm: the name of the node which sent this `msg` """ if self.isValidNodeMsg(msg): self.msgsToReplicas[msg.instId].append((msg, frm)) def sendToElector(self, msg, frm): """ Send the message to the intended elector. :param msg: the message to send :param frm: the name of the node which sent this `msg` """ if self.isValidNodeMsg(msg): logger.debug("{} sending message to elector: {}". format(self, (msg, frm))) self.msgsToElector.append((msg, frm)) def handleOneNodeMsg(self, wrappedMsg): """ Validate and process one message from a node. :param wrappedMsg: Tuple of message and the name of the node that sent the message """ try: vmsg = self.validateNodeMsg(wrappedMsg) if vmsg: self.unpackNodeMsg(*vmsg) except SuspiciousNode as ex: self.reportSuspiciousNodeEx(ex) except Exception as ex: msg, frm = wrappedMsg self.discard(msg, ex) def validateNodeMsg(self, wrappedMsg): """ Validate another node's message sent to this node. :param wrappedMsg: Tuple of message and the name of the node that sent the message :return: Tuple of message from node and name of the node """ msg, frm = wrappedMsg if self.isNodeBlacklisted(frm): self.discard(msg, "received from blacklisted node {}" .format(frm), logger.info) return None op = msg.pop(OP_FIELD_NAME, None) if not op: raise MissingNodeOp cls = TaggedTuples.get(op, None) if not cls: raise InvalidNodeOp(op) try: cMsg = cls(**msg) except Exception as ex: raise InvalidNodeMsg from ex try: self.verifySignature(cMsg) # TODO why must we catch and raise? Is there a way to know earlier that # the signature exception is suspicious? If so, how suspicious? except BaseExc as ex: raise SuspiciousNode(frm, ex, cMsg) from ex # TODO are both needed? logger.debug("{} received node message from {}: {}". format(self, frm, cMsg), extra={"cli": False}) return cMsg, frm def unpackNodeMsg(self, msg, frm) -> None: """ If the message is a batch message validate each message in the batch, otherwise add the message to the node's inbox. 
:param msg: a node message :param frm: the name of the node that sent this `msg` """ if isinstance(msg, Batch): for m in msg.messages: self.handleOneNodeMsg((m, frm)) else: self.postToNodeInBox(msg, frm) def postToNodeInBox(self, msg, frm): """ Append the message to the node inbox :param msg: a node message :param frm: the name of the node that sent this `msg` """ self.nodeInBox.append((msg, frm)) async def processNodeInBox(self): """ Process the messages in the node inbox asynchronously. """ while self.nodeInBox: m = self.nodeInBox.popleft() try: await self.nodeMsgRouter.handle(m) except SuspiciousNode as ex: self.reportSuspiciousNodeEx(ex) self.discard(m, ex) def handleOneClientMsg(self, wrappedMsg): """ Validate and process a client message :param wrappedMsg: a message from a client """ try: vmsg = self.validateClientMsg(wrappedMsg) if vmsg: self.unpackClientMsg(*vmsg) except SuspiciousClient as ex: msg, frm = wrappedMsg exc = ex.__cause__ if ex.__cause__ else ex self.reportSuspiciousClient(frm, exc) self.discard(wrappedMsg, exc) except InvalidClientRequest as ex: exc = ex.__cause__ if ex.__cause__ else ex reason = "client request invalid: {} {}".\ format(exc.__class__.__name__, exc) self.discard(wrappedMsg, reason, cliOutput=True) def validateClientMsg(self, wrappedMsg): """ Validate a message sent by a client. :param wrappedMsg: a message from a client :return: Tuple of clientMessage and client address """ msg, frm = wrappedMsg if self.isClientBlacklisted(frm): self.discard(msg, "received from blacklisted client {}" .format(frm), logger.info) return None if all(attr in msg.keys() for attr in [OPERATION, 'clientId', 'reqId']): self.checkValidOperation(msg[OPERATION]) cls = Request elif OP_FIELD_NAME in msg: op = msg.pop(OP_FIELD_NAME) cls = TaggedTuples.get(op, None) if not cls: raise InvalidClientOp(op) if cls is not Batch: raise InvalidClientMsgType(cls) else: raise InvalidClientRequest try: cMsg = cls(**msg) except Exception as ex: raise InvalidClientRequest from ex try: self.verifySignature(cMsg) except Exception as ex: raise SuspiciousClient from ex logger.trace("{} received CLIENT message: {}". format(self.clientstack.name, cMsg)) return cMsg, frm def unpackClientMsg(self, msg, frm): """ If the message is a batch message validate each message in the batch, otherwise add the message to the node's clientInBox. :param msg: a client message :param frm: the clientId of the client that sent this `msg` """ if isinstance(msg, Batch): for m in msg.messages: self.handleOneClientMsg((m, frm)) else: self.postToClientInBox(msg, frm) def postToClientInBox(self, msg, frm): """ Append the message to the node's clientInBox :param msg: a client message :param frm: the name of the node that sent this `msg` """ self.clientInBox.append((msg, frm)) async def processClientInBox(self): """ Process the messages in the node's clientInBox asynchronously. All messages in the inBox have already been validated, including signature check. """ while self.clientInBox: m = self.clientInBox.popleft() req, frm = m logger.debug("{} processing {} request {}". format(self.clientstack.name, frm, req.reqId), extra={"cli": True}) await self.clientMsgRouter.handle(m) async def processRequest(self, request: Request, frm: str): """ Handle a REQUEST from the client. If the request has already been executed, the node re-sends the reply to the client. Otherwise, the node acknowledges the client request, adds it to its list of client requests, and sends a PROPAGATE to the remaining nodes. 
        :param request: the REQUEST from the client
        :param frm: the clientId of the client that sent this REQUEST
        """
        logger.debug("Node {} received client request: {}".
                     format(self.name, request))

        # If the request has already been processed (there is a reply for it
        # in the node's transaction store), return that reply from the
        # transaction store
        txnId = self.txnStore.isRequestAlreadyProcessed(request)
        if txnId:
            logger.debug("{} returning REPLY from already processed "
                         "REQUEST: {}".format(self, request))
            reply = self.txnStore.transactions[txnId]
            self.transmitToClient(reply, request.clientId)
        else:
            self.transmitToClient(RequestAck(request.reqId), frm)
            # If the PROPAGATE for the corresponding client REQUEST has not
            # been received yet, record the request and propagate it
            self.recordAndPropagate(request)

    # noinspection PyUnusedLocal
    async def processPropagate(self, msg: Propagate, frm):
        """
        Process one propagateRequest sent to this node asynchronously.

        - If this propagateRequest hasn't been seen by this node, then
          broadcast it to all nodes after verifying the signature.
        - Add the client to the blacklist if its signature is invalid.

        :param msg: the propagateRequest
        :param frm: the name of the node which sent this `msg`
        """
        logger.debug("Node {} received propagated request: {}".
                     format(self.name, msg))
        reqDict = msg.request
        request = Request(**reqDict)
        self.requests.addPropagate(request, frm)
        self.propagate(request)
        self.tryForwarding(request)

    def processOrdered(self, ordered: Ordered, retryNo: int = 0):
        """
        Process an Ordered message. Execute the client request, retrying if
        the client request hasn't yet reached this node but the corresponding
        PROPAGATE, PRE-PREPARE, PREPARE and COMMIT messages did.

        :param ordered: an Ordered message
        :param retryNo: the retry number used in recursion
        :return: True if successful, None otherwise
        """
        instId, viewNo, clientId, reqId, digest = tuple(ordered)
        self.monitor.requestOrdered(clientId, reqId, instId,
                                    byMaster=(instId == self.instances.masterId))

        # Only requests ordered by the master protocol instance are executed
        if instId == self.instances.masterId:
            key = (clientId, reqId)
            if key in self.requests:
                req = self.requests[key].request
                self.executeRequest(viewNo, req)
                logger.debug("Node {} executing client request {} {}".
                             format(self.name, clientId, reqId))
            # If the client request hasn't reached the node but the
            # corresponding PROPAGATE, PRE-PREPARE, PREPARE and COMMIT
            # messages did, then retry 3 times
            elif retryNo < 3:
                retryNo += 1
                p = partial(self.processOrdered, ordered, retryNo)
                self._schedule(p, random.randint(2, 4))
                logger.debug("Node {} retrying executing client request {} {}".
                             format(self.name, clientId, reqId))
            return True
        else:
            logger.trace("{} got ordered request from backup replica".
                         format(self))

    def processEscalatedException(self, ex):
        """
        Process an exception escalated from a Replica
        """
        if isinstance(ex, SuspiciousNode):
            self.reportSuspiciousNodeEx(ex)
        else:
            raise RuntimeError("unhandled replica-escalated exception") from ex
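    # --- Illustrative sketch (not part of the original class) -------------
    # Worked example of the decision processOrdered above makes: an Ordered
    # message is unpacked as (instId, viewNo, clientId, reqId, digest), and
    # only orderings from the master protocol instance lead to execution;
    # backup-instance orderings are used for monitoring only. The tuple
    # passed in is a stand-in with the same field order, not a real Ordered
    # message.
    @staticmethod
    def _example_should_execute(ordered, masterId: int) -> bool:
        instId, viewNo, clientId, reqId, digest = tuple(ordered)
        return instId == masterId
    # e.g. _example_should_execute((0, 0, 'client1', 1, 'digest'), 0) is
    # True, while the same ordering coming from a backup instance
    # (instId=1) is not executed.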
    def processInstanceChange(self, instChg: InstanceChange, frm: str) -> None:
        """
        Validate and process an instance change request.

        :param instChg: the instance change request
        :param frm: the name of the node that sent this `msg`
        """
        logger.debug("Node {} received instance change request: {} from {}".
                     format(self, instChg, frm))
        if instChg.viewNo < self.viewNo:
            self.discard(instChg,
                         "Received instance change request with view no {} "
                         "which is less than its view no {}".
                         format(instChg.viewNo, self.viewNo), logger.debug)
        else:
            if not self.instanceChanges.hasView(instChg.viewNo):
                if self.monitor.isMasterDegraded():
                    self.instanceChanges.addVote(instChg.viewNo, frm)
                    self.sendInstanceChange(instChg.viewNo)
                else:
                    self.discard(instChg,
                                 "received instance change message from {} but "
                                 "did not find the master to be slow".
                                 format(frm), logger.debug)
                    return
            else:
                if self.instanceChanges.hasInstChngFrom(instChg.viewNo, frm):
                    self.reportSuspiciousNode(
                        frm, code=Suspicions.DUPLICATE_INST_CHNG)
                else:
                    self.instanceChanges.addVote(instChg.viewNo, frm)

            if self.canViewChange(instChg.viewNo):
                logger.debug("{} initiating a view change with view "
                             "no {}".format(self, self.viewNo))
                self.startViewChange(instChg.viewNo)
            else:
                logger.trace("{} cannot initiate a view change".
                             format(self))

    def checkPerformance(self):
        """
        Check if the master instance is slow and send an instance change
        request if it is.

        :return: True if master performance is OK, otherwise False
        """
        logger.debug("{} checking its performance".format(self))
        self._schedule(self.checkPerformance, self.perfCheckFreq)

        if self.instances.masterId is not None:
            if self.monitor.isMasterDegraded():
                logger.info("{} master has lower performance than backups. "
                            "Sending an instance change with viewNo {}".
                            format(self, self.viewNo))
                logger.info("{} metrics for monitor: {}".
                            format(self, self.monitor.prettymetrics))
                self.sendInstanceChange(self.viewNo)
                return False
            else:
                logger.debug("{}'s master has higher performance than backups".
                             format(self))
                return True

    def executeRequest(self, viewNo: int, req: Request) -> None:
        """
        Execute the REQUEST sent to this Node

        :param viewNo: the view number (See glossary)
        :param req: the client REQUEST
        """
        reply = self.generateReply(viewNo, req)
        self.transmitToClient(reply, req.clientId)
        txnId = reply.result['txnId']
        self.txnStore.addToProcessedRequests(req.clientId, req.reqId,
                                             txnId, reply)
        asyncio.ensure_future(self.txnStore.addReply(
            clientId=req.clientId, reply=reply, txnId=txnId))

    def sendInstanceChange(self, viewNo: int):
        """
        Broadcast an instance change request to all the remaining nodes

        :param viewNo: the view number when the instance change is requested
        """
        self.send(InstanceChange(viewNo))
        self.instanceChanges.addVote(viewNo, self.name)

    @property
    def quorum(self) -> int:
        r"""
        Return the quorum of this RBFT system. Equal to :math:`2f + 1`.
        """
        return (2 * self.f) + 1

    def canViewChange(self, proposedViewNo: int) -> bool:
        """
        Return whether there is a quorum of instance change votes for the
        proposed view number and this node's view is less than or equal to
        the proposed view.
        """
        return self.instanceChanges.hasQuorum(proposedViewNo, self.f) and \
            self.viewNo <= proposedViewNo

    def startViewChange(self, proposedViewNo: int):
        """
        Trigger the view change process.

        :param proposedViewNo: the new view number after view change.
        """
        self.viewNo = proposedViewNo + 1
        logger.debug("{} resetting monitor stats after view change".
format(self)) self.monitor.reset() # Now communicate the view change to the elector which will # contest primary elections across protocol all instances self.elector.viewChanged(self.viewNo) def verifySignature(self, msg) -> bool: """ Validate the signature of the request Note: Batch is whitelisted because the inner messages are checked :param msg: a message requiring signature verification :return: None; raises an exception if the signature is not valid """ if isinstance(msg, self.authnWhitelist): return # whitelisted message types rely on RAET for authn if isinstance(msg, Propagate): typ = 'propagate ' req = msg.request else: typ = '' req = msg if not isinstance(req, Mapping): req = msg.__getstate__() key = (req['clientId'], req['reqId']) if key in self.requests and self.requests[key].forwarded: return identifier = self.clientAuthNr.authenticate(req) logger.debug("{} authenticated {} signature on {}request {}". format(self, identifier, typ, req['reqId']), extra={"cli": True}) def generateReply(self, viewNo: int, req: Request) -> Reply: """ Return a new clientReply created using the viewNo, request and the computed txnId of the request :param viewNo: the view number (See glossary) :param req: the REQUEST :return: a clientReply generated from the request """ logger.debug("{} replying request {}".format(self, req)) txnId = sha256("{}{}{}".format(viewNo, req.clientId, req.reqId). encode('utf-8')).hexdigest() return Reply(viewNo, req.reqId, {"txnId": txnId}) def startKeySharing(self, timeout=60): """ Start key sharing till the timeout is reached. Other nodes will be able to join this node till the timeout is reached. :param timeout: the time till which key sharing is active """ if self.isKeySharing: logger.info("{} already key sharing".format(self), extra={"cli": "LOW_STATUS"}) else: logger.info("{} starting key sharing".format(self), extra={"cli": "STATUS"}) self.nodestack.keep.auto = AutoMode.always self._schedule(partial(self.stopKeySharing, timedOut=True), timeout) # remove any unjoined remotes for r in list(self.nodestack.remotes.values()): if not r.joined: logger.debug("{} removing unjoined remote {}" .format(self, r)) self.nodestack.removeRemote(r) # if just starting, then bootstrap force = time.perf_counter() - self.created > 5 self.maintainConnections(force=force) def stopKeySharing(self, timedOut=False): """ Stop key sharing, i.e don't allow any more nodes to join this node. """ if self.isKeySharing: if timedOut: logger.info("{} key sharing timed out; was not able to " "connect to {}". format(self, ", ".join(self.notConnectedNodes())), extra={"cli": "WARNING"}) else: logger.info("{} completed key sharing".format(self), extra={"cli": "STATUS"}) self.nodestack.keep.auto = AutoMode.never def reportSuspiciousNodeEx(self, ex: SuspiciousNode): self.reportSuspiciousNode(ex.node, ex.reason, ex.code, ex.offendingMsg) def reportSuspiciousNode(self, nodeName: str, reason=None, code: int=None, offendingMsg=None): logger.warning("{} suspicion raised on node {} for {}; suspicion code " "is {}".format(self, nodeName, reason, code)) # TODO need a more general solution here if code == InvalidSignature.code: self.blacklistNode(nodeName, reason=InvalidSignature.reason, code=InvalidSignature.code) if code in self.suspicions: self.blacklistNode(nodeName, reason=self.suspicions[code], code=code) if offendingMsg: self.discard(offendingMsg, reason, logger.warning) def reportSuspiciousClient(self, clientName: str, reason): logger.warning("{} suspicion raised on client {} for {}; " "doing nothing for now". 
format(self, clientName, reason)) self.blacklistClient(clientName) def isClientBlacklisted(self, clientName: str): return self.clientBlacklister.isBlacklisted(clientName) def blacklistClient(self, clientName: str, reason: str=None): msg = "{} blacklisting client {}".format(self, clientName) if reason: msg += " for reason {}".format(reason) logger.debug(msg) self.clientBlacklister.blacklist(clientName) def isNodeBlacklisted(self, nodeName: str): return self.nodeBlacklister.isBlacklisted(nodeName) def blacklistNode(self, nodeName: str, reason: str=None, code: int=None): msg = "{} blacklisting node {}".format(self, nodeName) if reason: msg += " for reason {}".format(reason) if code: msg += " for code {}".format(code) logger.debug(msg) self.nodeBlacklister.blacklist(nodeName) def __enter__(self): return self # noinspection PyUnusedLocal def __exit__(self, exc_type, exc_val, exc_tb): self.stop() def logstats(self): lines = [] l = lines.append l("node {} current stats".format(self)) l("--------------------------------------------------------") l("node inbox size : {}".format(len(self.nodeInBox))) l("client inbox size : {}". format(len(self.clientInBox))) l("age (seconds) : {}". format(time.perf_counter() - self.created)) l("next check for reconnect: {}". format(time.perf_counter() - self.nextCheck)) l("node connections : {}".format(self._conns)) l("f : {}".format(self.f)) l("master instance : {}".format(self.instances.masterId)) l("replicas : {}".format(len(self.replicas))) l("view no : {}".format(self.viewNo)) l("rank : {}".format(self.rank)) l("msgs to replicas : {}". format(len(self.msgsToReplicas))) l("msgs to elector : {}". format(len(self.msgsToElector))) l("action queue : {} {}". format(len(self.actionQueue), id(self.actionQueue))) l("action queue stash : {} {}". format(len(self.aqStash), id(self.aqStash))) logger.info("\n".join(lines), extra={"cli": False}) def checkValidOperation(self, msg): if self.opVerifiers: try: for v in self.opVerifiers: v.verify(msg) except Exception as ex: raise InvalidClientRequest from ex
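# --- Illustrative sketch (not part of the original module) ----------------
# Worked example of the deterministic txnId scheme used by generateReply in
# the Node class above: every node hashes the same (viewNo, clientId, reqId)
# triple, so all replies for one request agree on the txnId across the pool.
# This relies on the module's existing sha256 import (hashlib.sha256); the
# values in the comments below are made up for illustration.
def _example_txn_id(viewNo: int, clientId: str, reqId: int) -> str:
    return sha256("{}{}{}".format(viewNo, clientId, reqId).
                  encode('utf-8')).hexdigest()

# _example_txn_id(0, "client1", 1) == _example_txn_id(0, "client1", 1)
# _example_txn_id(0, "client1", 1) != _example_txn_id(0, "client1", 2)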