Example #1
class Replica(HasActionQueue, MessageProcessor):
    STASHED_CHECKPOINTS_BEFORE_CATCHUP = 1
    HAS_NO_PRIMARY_WARN_THRESHOLD = 10

    def __init__(self,
                 node: 'plenum.server.node.Node',
                 instId: int,
                 config=None,
                 isMaster: bool = False,
                 bls_bft_replica: BlsBftReplica = None,
                 metrics: MetricsCollector = NullMetricsCollector(),
                 get_current_time=None,
                 get_time_for_3pc_batch=None):
        """
        Create a new replica.

        :param node: Node on which this replica is located
        :param instId: the id of the protocol instance the replica belongs to
        :param isMaster: is this a replica of the master protocol instance
        """
        HasActionQueue.__init__(self)
        self.get_current_time = get_current_time or time.perf_counter
        self.get_time_for_3pc_batch = get_time_for_3pc_batch or node.utc_epoch
        # self.stats = Stats(TPCStat)
        self.config = config or getConfig()
        self.metrics = metrics
        self.node = node
        self.instId = instId
        self.name = self.generateName(node.name, self.instId)
        self.logger = getlogger(self.name)
        self.validator = ReplicaValidator(self)

        self.outBox = deque()
        """
        This queue is used by the replica to send messages to its node.
        The replica puts messages here that are consumed by its node.
        """

        self.inBox = deque()
        """
        This queue is used by the replica to receive messages from its node.
        The node puts messages here that are consumed by the replica.
        """

        self.inBoxStash = deque()
        """
        If messages need to go back on the queue, they go here temporarily and
        are put back on the queue on a state change
        """

        self._is_master = isMaster

        # Dictionary to keep track of which replica was primary during each
        # view. Key is the view no and value is the name of the primary
        # replica during that view
        self.primaryNames = OrderedDict()  # type: OrderedDict[int, str]

        # Flag used for premature exit from the loop in the method
        # `processStashedMsgsForNewWaterMarks`. See that method for details.
        self.consumedAllStashedMsgs = True

        self._freshness_checker = FreshnessChecker(
            freshness_timeout=self.config.STATE_FRESHNESS_UPDATE_INTERVAL)

        self._bls_bft_replica = bls_bft_replica
        self._state_root_serializer = state_roots_serializer

        # Whether we have already logged a message about receiving requests
        # while there is no primary
        self.warned_no_primary = False

        self._consensus_data = ConsensusSharedData(
            self.name, self.node.poolManager.node_names_ordered_by_rank(),
            self.instId, self.isMaster)
        self._internal_bus = InternalBus()
        self._external_bus = ExternalBus(send_handler=self.send)
        self.stasher = self._init_replica_stasher()
        self._subscription = Subscription()
        self._bootstrap_consensus_data()
        self._subscribe_to_external_msgs()
        self._subscribe_to_internal_msgs()
        self._checkpointer = self._init_checkpoint_service()
        self._ordering_service = self._init_ordering_service()
        self._message_req_service = self._init_message_req_service()
        self._view_change_service = self._init_view_change_service()
        for ledger_id in self.ledger_ids:
            self.register_ledger(ledger_id)

    @property
    def internal_bus(self):
        return self._internal_bus

    def cleanup(self):
        # Aggregate all the currently forwarded requests
        req_keys = set()
        for msg in self.inBox:
            if isinstance(msg, ReqKey):
                req_keys.add(msg.digest)
        for req_queue in self._ordering_service.requestQueues.values():
            for req_key in req_queue:
                req_keys.add(req_key)
        for pp in self._ordering_service.sent_preprepares.values():
            for req_key in pp.reqIdr:
                req_keys.add(req_key)
        for pp in self._ordering_service.prePrepares.values():
            for req_key in pp.reqIdr:
                req_keys.add(req_key)

        for req_key in req_keys:
            if req_key in self.requests:
                self.requests.ordered_by_replica(req_key)
                self.requests.free(req_key)

        self._ordering_service.cleanup()
        self._checkpointer.cleanup()
        self._subscription.unsubscribe_all()
        self.stasher.unsubscribe_from_all()

    @property
    def external_bus(self):
        return self._external_bus

    @property
    def isMaster(self):
        return self._is_master

    @isMaster.setter
    def isMaster(self, value):
        self._is_master = value
        self._consensus_data.is_master = value

    def _bootstrap_consensus_data(self):
        self._consensus_data.requests = self.requests
        self._consensus_data.node_mode = self.node.mode
        self._consensus_data.quorums = self.quorums

    def set_primaries(self, primaries):
        self._consensus_data.primaries = primaries

    def _subscribe_to_external_msgs(self):
        # self._subscription.subscribe(self._external_bus, ReqKey, self.readyFor3PC)
        pass

    def _process_new_view_accepted(self, msg: NewViewAccepted):
        self.clear_requests_and_fix_last_ordered()

    def _subscribe_to_internal_msgs(self):
        self._subscription.subscribe(self.internal_bus, Ordered,
                                     self._send_ordered)
        self._subscription.subscribe(self.internal_bus, NeedBackupCatchup,
                                     self._caught_up_backup)
        self._subscription.subscribe(self.internal_bus, ReqKey,
                                     self.readyFor3PC)
        self._subscription.subscribe(self.internal_bus, RaisedSuspicion,
                                     self._process_suspicious_node)
        self._subscription.subscribe(self.internal_bus, NewViewAccepted,
                                     self._process_new_view_accepted)

    def register_ledger(self, ledger_id):
        # Using an ordered set: a request key is removed once its PRE-PREPARE
        # is ordered, so fast lookup and removal are needed, and the
        # collection has to preserve insertion order.
        if ledger_id not in self._ordering_service.requestQueues:
            self._ordering_service.requestQueues[ledger_id] = OrderedSet()
        if ledger_id != AUDIT_LEDGER_ID:
            self._freshness_checker.register_ledger(
                ledger_id=ledger_id,
                initial_time=self.get_time_for_3pc_batch())

    @property
    def last_prepared_before_view_change(self):
        return self._consensus_data.legacy_last_prepared_before_view_change

    @last_prepared_before_view_change.setter
    def last_prepared_before_view_change(self, lst):
        self._consensus_data.legacy_last_prepared_before_view_change = lst

    @property
    def h(self) -> int:
        return self._consensus_data.low_watermark

    @property
    def H(self) -> int:
        return self._consensus_data.high_watermark
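
    # The two properties above expose the consensus low and high watermarks:
    # they bound the pp_seq_no window this replica will currently accept, and
    # 3PC messages outside the window are stashed until the watermarks advance
    # (see CheckpointService.set_watermarks).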

    @property
    def last_ordered_3pc(self) -> tuple:
        return self._ordering_service.last_ordered_3pc

    @last_ordered_3pc.setter
    def last_ordered_3pc(self, key3PC):
        self._ordering_service.last_ordered_3pc = key3PC

    @property
    def lastPrePrepareSeqNo(self):
        return self._ordering_service.lastPrePrepareSeqNo

    @property
    def requests(self):
        return self.node.requests

    @property
    def ledger_ids(self):
        return self.node.ledger_ids

    @property
    def quorums(self):
        return self.node.quorums

    @property
    def utc_epoch(self):
        return self.node.utc_epoch()

    @staticmethod
    def generateName(nodeName: str, instId: int):
        """
        Create and return the name for a replica using its nodeName and
        instanceId, e.g. Alpha:1
        """

        if isinstance(nodeName, str):
            # Because sometimes it is bytes (why?)
            if ":" in nodeName:
                # Because in some cases (for requested messages) it
                # already has ':'. This should be fixed.
                return nodeName
        return "{}:{}".format(nodeName, instId)

    @staticmethod
    def getNodeName(replicaName: str):
        return replicaName.split(":")[0]

    @property
    def isPrimary(self):
        """
        Is this replica primary?

        :return: True if this replica is primary, False if not, None if the
            primary status is not yet known
        """
        return self._consensus_data.is_primary

    @property
    def hasPrimary(self):
        return self.primaryName is not None

    @property
    def primaryName(self):
        """
        Name of the primary replica of this replica's instance

        :return: Returns name if primary is known, None otherwise
        """
        return self._consensus_data.primary_name

    @primaryName.setter
    def primaryName(self, value: Optional[str]) -> None:
        """
        Set the name of the primary replica for the current view.

        :param value: the name of the primary replica, or None if not known
        """
        if value is not None:
            self.warned_no_primary = False
        self.primaryNames[self.viewNo] = value
        self.compact_primary_names()
        if value != self._consensus_data.primary_name:
            self._consensus_data.primary_name = value
            self.logger.info(
                "{} setting primaryName for view no {} to: {}".format(
                    self, self.viewNo, value))
            if value is None:
                # GC must happen only after a primary has been decided,
                # so skip it when the primary is being cleared.
                return
            self._gc_before_new_view()

    def compact_primary_names(self):
        min_allowed_view_no = self.viewNo - 1
        views_to_remove = []
        for view_no in self.primaryNames:
            if view_no >= min_allowed_view_no:
                break
            views_to_remove.append(view_no)
        for view_no in views_to_remove:
            self.primaryNames.pop(view_no)

    def primaryChanged(self, primaryName):
        self._ordering_service.batches.clear()
        if self.isMaster:
            # There is no temporary state data structure, so the uncommitted
            # state is explicitly reset to the correct root value.
            for lid in self.ledger_ids:
                try:
                    ledger = self.node.getLedger(lid)
                except KeyError:
                    continue
                ledger.reset_uncommitted()

        self.primaryName = primaryName
        self._setup_for_non_master_after_view_change(self.viewNo)

    def on_view_change_start(self):
        if self.isMaster:
            lst = self._ordering_service.l_last_prepared_certificate_in_view()
            self._consensus_data.legacy_last_prepared_before_view_change = lst
            self.logger.info(
                '{} setting last prepared for master to {}'.format(self, lst))

    def on_view_change_done(self):
        if self.isMaster:
            self.last_prepared_before_view_change = None
        self.stasher.process_all_stashed(STASH_VIEW_3PC)

    def _clear_all_3pc_msgs(self):
        self._ordering_service._clear_all_3pc_msgs()

    def clear_requests_and_fix_last_ordered(self):
        if self.isMaster:
            return

        self._internal_bus.send(CheckpointStabilized(self.last_ordered_3pc))
        self._ordering_service._lastPrePrepareSeqNo = 0
        self._ordering_service.last_ordered_3pc = (self.viewNo, 0)
        self._clear_all_3pc_msgs()

    def on_propagate_primary_done(self):
        if self.isMaster:
            # If this is a primary that has re-connected (i.e. the view has not
            # actually changed, we are just propagating it), make sure that we
            # didn't break the ppSeqNo sequence.
            self._checkpointer.update_watermark_from_3pc()
        elif not self.isPrimary:
            self._checkpointer.set_watermarks(low_watermark=0,
                                              high_watermark=sys.maxsize)

    def get_lowest_probable_prepared_certificate_in_view(
            self, view_no) -> Optional[int]:
        """
        Return the lowest pp_seq_no in the given view that could be prepared,
        considering only unprocessed PRE-PREPAREs and PREPAREs.
        """
        # TODO: Naive implementation; there is no need to iterate over the
        # complete data structures, fix this later
        seq_no_pp = SortedList()  # pp_seq_no of pending PRE-PREPAREs
        # pp_seq_no of PREPAREs that have reached a prepare quorum
        seq_no_p = set()

        for (v, p) in self._ordering_service.prePreparesPendingPrevPP:
            if v == view_no:
                seq_no_pp.add(p)
            if v > view_no:
                break

        for (
                v, p
        ), pr in self._ordering_service.preparesWaitingForPrePrepare.items():
            if v == view_no and len(pr) >= self.quorums.prepare.value:
                seq_no_p.add(p)

        for n in seq_no_pp:
            if n in seq_no_p:
                return n
        return None

    def _setup_for_non_master_after_view_change(self, current_view):
        if not self.isMaster:
            for v in list(self.stashed_out_of_order_commits.keys()):
                if v < current_view:
                    self.stashed_out_of_order_commits.pop(v)

    def is_primary_in_view(self, viewNo: int) -> Optional[bool]:
        """
        Return whether this replica was primary in the given view
        """
        if viewNo not in self.primaryNames:
            return False
        return self.primaryNames[viewNo] == self.name

    def isMsgForCurrentView(self, msg):
        """
        Return whether this request's view number is equal to the current view
        number of this replica.
        """
        viewNo = getattr(msg, "viewNo", None)
        return viewNo == self.viewNo

    def isPrimaryForMsg(self, msg) -> Optional[bool]:
        """
        If the message's view number equals this replica's view number, return
        whether this replica is primary (which may be None if the primary is
        not yet known); otherwise return whether this replica was primary in
        the message's view.
        :param msg: message
        """
        return self.isPrimary if self.isMsgForCurrentView(msg) \
            else self.is_primary_in_view(msg.viewNo)

    def isMsgFromPrimary(self, msg, sender: str) -> bool:
        """
        Return whether this message was sent by the primary replica of the
        message's view.
        :param msg: the received message
        :param sender: name of the replica that sent the message
        :return: True if the sender was the primary for that view
        """
        if self.isMsgForCurrentView(msg):
            return self.primaryName == sender
        try:
            return self.primaryNames[msg.viewNo] == sender
        except KeyError:
            return False

    def __repr__(self):
        return self.name

    @property
    def f(self) -> int:
        """
        Return the number of Byzantine Failures that can be tolerated by this
        system. Equal to (N - 1)/3, where N is the number of nodes in the
        system.
        """
        return self.node.f

    @property
    def viewNo(self):
        """
        Return the current view number of this replica.
        """
        return self._consensus_data.view_no

    @property
    def stashed_out_of_order_commits(self):
        # Commits which are not being ordered since commits with lower
        # sequence numbers have not been ordered yet. Key is the
        # viewNo and value a map of pre-prepare sequence number to commit
        # type: Dict[int,Dict[int,Commit]]
        return self._ordering_service.stashed_out_of_order_commits

    def send_3pc_batch(self):
        return self._ordering_service.send_3pc_batch()

    @staticmethod
    def batchDigest(reqs):
        return replica_batch_digest(reqs)

    def readyFor3PC(self, key: ReqKey):
        try:
            fin_req = self.requests[key.digest].finalised
        except KeyError:
            # The request is outdated and has been dropped from the main requests queue
            self.logger.debug(
                '{} reports request {} is ready for 3PC but it has been dropped '
                'from requests queue, ignore this request'.format(self, key))
            return
        queue = self._ordering_service.requestQueues[
            self.node.ledger_id_for_request(fin_req)]
        queue.add(key.digest)
        if not self.hasPrimary and len(
                queue
        ) >= self.HAS_NO_PRIMARY_WARN_THRESHOLD and not self.warned_no_primary:
            self.logger.warning(
                '{} is getting requests but still does not have '
                'a primary so the replica will not process the request '
                'until a primary is chosen'.format(self))
            self.warned_no_primary = True

    @measure_replica_time(MetricsName.SERVICE_REPLICA_QUEUES_TIME,
                          MetricsName.SERVICE_BACKUP_REPLICAS_QUEUES_TIME)
    def serviceQueues(self, limit=None):
        """
        Process `limit` number of messages in the inBox.

        :param limit: the maximum number of messages to process
        :return: the number of messages successfully processed
        """
        # TODO should handle SuspiciousNode here
        r = self.dequeue_pre_prepares()
        # r += self.inBoxRouter.handleAllSync(self.inBox, limit)
        r += self._handle_external_messages(self.inBox, limit)
        r += self.send_3pc_batch()
        r += self._serviceActions()
        return r
        # Messages that cannot be processed right now need to be added back to
        # the queue; they might be processable later.

    def _handle_external_messages(self, deq: deque, limit=None) -> int:
        """
        Synchronously handle all items in a deque.

        :param deq: a deque of items to be handled by this router
        :param limit: the maximum number of items in the deque to be handled
        :return: the number of items handled successfully
        """
        count = 0
        while deq and (not limit or count < limit):
            count += 1
            msg = deq.popleft()
            external_msg, sender = msg if len(msg) == 2 else (msg, None)
            # TODO: get rid of appending instId to sender
            sender = self.generateName(sender, self.instId)
            self._external_bus.process_incoming(external_msg, sender)
        return count

    def _gc_before_new_view(self):
        # Trigger GC for all batches of old view
        # Clear any checkpoints, since they are valid only in a view
        # ToDo: Need to send a cmd like ViewChangeStart into internal bus
        # self._gc(self.last_ordered_3pc)
        self._ordering_service.gc(self.last_ordered_3pc)
        # ToDo: get rid of directly calling
        self._ordering_service._clear_prev_view_pre_prepares()
        # self._clear_prev_view_pre_prepares()

    def has_already_ordered(self, view_no, pp_seq_no):
        return compare_3PC_keys(
            (view_no, pp_seq_no), self.last_ordered_3pc) >= 0

    def dequeue_pre_prepares(self):
        return self._ordering_service.dequeue_pre_prepares()

    def send(self, msg, to_nodes=None) -> None:
        """
        Send a message to the node on which this replica resides.

        :param msg: the message to send
        :param to_nodes: names of the nodes to send to directly; if None, the
            message is put on the outBox to be consumed by the node
        """
        self.logger.trace("{} sending {}".format(self, msg.__class__.__name__),
                          extra={
                              "cli": True,
                              "tags": ['sending']
                          })
        self.logger.trace("{} sending {}".format(self, msg))
        if to_nodes:
            self.node.sendToNodes(msg, names=to_nodes)
            return
        self.outBox.append(msg)

    def revert_unordered_batches(self):
        """
        Revert changes to ledger (uncommitted) and state made by any requests
        that have not been ordered.
        """
        return self._ordering_service.revert_unordered_batches()

    def on_catch_up_finished(self,
                             last_caught_up_3PC=None,
                             master_last_ordered_3PC=None):
        if master_last_ordered_3PC and last_caught_up_3PC and \
                compare_3PC_keys(master_last_ordered_3PC,
                                 last_caught_up_3PC) > 0:
            if self.isMaster:
                self._caught_up_till_3pc(last_caught_up_3PC)
            else:
                self._ordering_service.first_batch_after_catchup = True
                self._catchup_clear_for_backup()
        self.stasher.process_all_stashed(STASH_CATCH_UP)

    def discard_req_key(self, ledger_id, req_key):
        return self._ordering_service.discard_req_key(ledger_id, req_key)

    def _caught_up_backup(self, msg: NeedBackupCatchup):
        self._caught_up_till_3pc(msg.caught_up_till_3pc)

    def _caught_up_till_3pc(self, last_caught_up_3PC):
        self._ordering_service._caught_up_till_3pc(last_caught_up_3PC)
        self._checkpointer.caught_up_till_3pc(last_caught_up_3PC)

    def _catchup_clear_for_backup(self):
        if not self.isPrimary:
            self.outBox.clear()
            self._checkpointer.catchup_clear_for_backup()
            self._ordering_service.catchup_clear_for_backup()

    def _remove_ordered_from_queue(self, last_caught_up_3PC=None):
        """
        Remove from the outBox any Ordered message whose 3PC key is less than
        or equal to `last_caught_up_3PC`, or all Ordered messages if
        `last_caught_up_3PC` is not passed. Needed during catchup.
        """
        to_remove = []
        for i, msg in enumerate(self.outBox):
            if isinstance(msg, Ordered) and \
                    (not last_caught_up_3PC or
                     compare_3PC_keys((msg.viewNo, msg.ppSeqNo), last_caught_up_3PC) >= 0):
                to_remove.append(i)

        self.logger.trace(
            '{} going to remove {} Ordered messages from outbox'.format(
                self, len(to_remove)))

        # Ordered messages are removed from the queue but returned in the
        # order in which they should be processed.
        removed = []
        for i in reversed(to_remove):
            removed.insert(0, self.outBox[i])
            del self.outBox[i]
        return removed

    def _get_last_timestamp_from_state(self, ledger_id):
        if ledger_id == DOMAIN_LEDGER_ID:
            ts_store = self.node.db_manager.get_store(TS_LABEL)
            if ts_store:
                last_timestamp = ts_store.get_last_key()
                if last_timestamp:
                    last_timestamp = int(last_timestamp.decode())
                    self.logger.debug(
                        "Last ordered timestamp from store is : {}"
                        "".format(last_timestamp))
                    return last_timestamp
        return None

    def get_ledgers_last_update_time(self) -> dict:
        if self._freshness_checker:
            return self._freshness_checker.get_last_update_time()

    def get_valid_req_ids_from_all_requests(self, reqs, invalid_indices):
        return [
            req.key for idx, req in enumerate(reqs)
            if idx not in invalid_indices
        ]

    def report_suspicious_node(self, ex):
        if self.isMaster:
            self.node.reportSuspiciousNodeEx(ex)
        else:
            self.warn_suspicious_backup(ex.node, ex.reason, ex.code)

    def warn_suspicious_backup(self, nodeName, reason, code):
        self.logger.warning(
            "backup replica {} raised suspicion on node {} for {}; suspicion code "
            "is {}".format(self, nodeName, reason, code))

    def set_validators(self, validators):
        self._consensus_data.set_validators(validators)

    def set_view_no(self, view_no):
        self._consensus_data.view_no = view_no

    def set_view_change_status(self, legacy_vc_in_progress):
        self._consensus_data.legacy_vc_in_progress = legacy_vc_in_progress

    def set_mode(self, mode):
        self._consensus_data.node_mode = mode

    def set_primaries_batch_needed(self, value):
        self._ordering_service.primaries_batch_needed = value

    def update_connecteds(self, connecteds: dict):
        self._external_bus.update_connecteds(connecteds)

    def _init_replica_stasher(self):
        return StashingRouter(self.config.REPLICA_STASH_LIMIT,
                              buses=[self.internal_bus, self._external_bus],
                              unstash_handler=self._add_to_inbox)

    def _process_suspicious_node(self, msg: RaisedSuspicion):
        if msg.inst_id != self.instId:
            return
        self.report_suspicious_node(msg.ex)

    def _send_ordered(self, msg: Ordered):
        self.send(msg)

    def _init_checkpoint_service(self) -> CheckpointService:
        return CheckpointService(data=self._consensus_data,
                                 bus=self.internal_bus,
                                 network=self._external_bus,
                                 stasher=self.stasher,
                                 db_manager=self.node.db_manager,
                                 metrics=self.metrics)

    def _init_ordering_service(self) -> OrderingService:
        return OrderingService(
            data=self._consensus_data,
            timer=self.node.timer,
            bus=self.internal_bus,
            network=self._external_bus,
            write_manager=self.node.write_manager,
            bls_bft_replica=self._bls_bft_replica,
            freshness_checker=self._freshness_checker,
            get_current_time=self.get_current_time,
            get_time_for_3pc_batch=self.get_time_for_3pc_batch,
            stasher=self.stasher,
            metrics=self.metrics)

    def _init_message_req_service(self) -> MessageReqService:
        return MessageReqService(data=self._consensus_data,
                                 bus=self.internal_bus,
                                 network=self._external_bus,
                                 metrics=self.metrics)

    def _init_view_change_service(self) -> ViewChangeService:
        return ViewChangeService(data=self._consensus_data,
                                 timer=self.node.timer,
                                 bus=self.internal_bus,
                                 network=self._external_bus,
                                 stasher=self.stasher)

    def _add_to_inbox(self, message):
        self.inBox.append(message)
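

# A minimal sketch (an assumption, not plenum's actual implementation) of the
# compare_3PC_keys() convention that has_already_ordered() and
# _remove_ordered_from_queue() above rely on: the result is positive when the
# second key is greater, so `compare_3PC_keys(a, b) >= 0` reads as "a <= b".
def compare_3pc_keys_sketch(key1, key2):
    # Keys are (view_no, pp_seq_no) tuples; views are compared first.
    if key1[0] == key2[0]:
        return key2[1] - key1[1]
    return key2[0] - key1[0]


assert compare_3pc_keys_sketch((0, 5), (0, 7)) > 0   # (0, 5) was ordered before (0, 7)
assert compare_3pc_keys_sketch((1, 1), (0, 9)) < 0   # view 1 comes after view 0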
Example #2
class ViewChangeService:
    def __init__(self, data: ConsensusSharedData, timer: TimerService,
                 bus: InternalBus, network: ExternalBus,
                 stasher: StashingRouter):
        self._config = getConfig()
        self._logger = getlogger()

        self._data = data
        self._new_view_builder = NewViewBuilder(self._data)
        self._timer = timer
        self._bus = bus
        self._network = network
        self._router = stasher

        # Last completed view no. A view change may not complete in time; in
        # that case we want to know which view no was the last one to complete
        # successfully.
        self.last_completed_view_no = self._data.view_no

        self._resend_inst_change_timer = RepeatingTimer(
            self._timer,
            self._config.NEW_VIEW_TIMEOUT,
            partial(self._propose_view_change_not_complete_in_time),
            active=False)

        self._router.subscribe(ViewChange, self.process_view_change_message)
        self._router.subscribe(ViewChangeAck,
                               self.process_view_change_ack_message)
        self._router.subscribe(NewView, self.process_new_view_message)

        self._old_prepared = {}  # type: Dict[int, BatchID]
        self._old_preprepared = {}  # type: Dict[int, List[BatchID]]
        self._primaries_selector = RoundRobinPrimariesSelector()

        self._subscription = Subscription()
        self._subscription.subscribe(self._bus, NeedViewChange,
                                     self.process_need_view_change)

    def __repr__(self):
        return self._data.name

    @property
    def view_change_votes(self):
        return self._data.view_change_votes

    def process_need_view_change(self, msg: NeedViewChange):
        self._logger.info("{} processing {}".format(self, msg))

        # 1. calculate new viewno
        view_no = msg.view_no
        if view_no is None:
            view_no = self._data.view_no + 1

        # 2. Do cleanup before new view change starts
        self._clean_on_view_change_start()

        # 3. Update shared data
        self._data.view_no = view_no
        self._data.waiting_for_new_view = True
        self._data.primaries = self._primaries_selector.select_primaries(
            view_no=self._data.view_no,
            instance_count=self._data.quorums.f + 1,
            validators=self._data.validators)
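        # The number of protocol instances (and hence primaries) is f + 1:
        # one master instance plus f backup instances, which is why
        # instance_count is quorums.f + 1 above.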
        for i, primary_name in enumerate(self._data.primaries):
            self._logger.display(
                "{} selected primary {} for instance {} (view {})".format(
                    PRIMARY_SELECTION_PREFIX, primary_name, i,
                    self._data.view_no),
                extra={
                    "cli": "ANNOUNCE",
                    "tags": ["node-election"]
                })

        old_primary = self._data.primary_name
        self._data.primary_name = generateName(
            self._data.primaries[self._data.inst_id], self._data.inst_id)

        if not self._data.is_master:
            return

        if old_primary and self._data.primary_name == old_primary:
            self._logger.info("Selected master primary is the same with the "
                              "current master primary (new_view {}). "
                              "Propose a new view {}".format(
                                  self._data.view_no, self._data.view_no + 1))
            self._propose_view_change(Suspicions.INCORRECT_NEW_PRIMARY.code)

        # 4. Build ViewChange message
        vc = self._build_view_change_msg()

        # 5. Send ViewChangeStarted via internal bus to update other services
        self._logger.info("{} sending {}".format(self, vc))
        self._bus.send(ViewChangeStarted(view_no=self._data.view_no))

        # 6. Send ViewChange msg to other nodes (via external bus)
        self._network.send(vc)
        self.view_change_votes.add_view_change(vc, self._data.name)

        # 7. Unstash messages for view change
        self._router.process_all_stashed(STASH_WAITING_VIEW_CHANGE)

        # 8. Restart instance change timer
        self._resend_inst_change_timer.stop()
        self._resend_inst_change_timer.start()

    def _clean_on_view_change_start(self):
        self._clear_old_batches(self._old_prepared)
        self._clear_old_batches(self._old_preprepared)
        self.view_change_votes.clear()
        self._data.new_view = None

    def _clear_old_batches(self, batches: Dict[int, Any]):
        for pp_seq_no in list(batches.keys()):
            if pp_seq_no <= self._data.stable_checkpoint:
                del batches[pp_seq_no]

    def _build_view_change_msg(self):
        for batch_id in self._data.prepared:
            self._old_prepared[batch_id.pp_seq_no] = batch_id
        prepared = sorted(list(self._old_prepared.values()))

        for new_bid in self._data.preprepared:
            pretenders = self._old_preprepared.get(new_bid.pp_seq_no, [])
            pretenders = [
                bid for bid in pretenders if bid.pp_digest != new_bid.pp_digest
            ]
            pretenders.append(new_bid)
            self._old_preprepared[new_bid.pp_seq_no] = pretenders
        preprepared = sorted(
            [bid for bids in self._old_preprepared.values() for bid in bids])

        return ViewChange(viewNo=self._data.view_no,
                          stableCheckpoint=self._data.stable_checkpoint,
                          prepared=prepared,
                          preprepared=preprepared,
                          checkpoints=list(self._data.checkpoints))

    def process_view_change_message(self, msg: ViewChange, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        self._logger.info("{} processing {} from {}".format(self, msg, frm))

        self.view_change_votes.add_view_change(msg, frm)

        vca = ViewChangeAck(viewNo=msg.viewNo,
                            name=getNodeName(frm),
                            digest=view_change_digest(msg))
        self.view_change_votes.add_view_change_ack(
            vca, getNodeName(self._data.name))

        if self._data.is_primary:
            self._send_new_view_if_needed()
            return PROCESS, None

        primary_node_name = getNodeName(self._data.primary_name)
        self._logger.info("{} sending {}".format(self, vca))
        self._network.send(vca, [primary_node_name])

        self._finish_view_change_if_needed()
        return PROCESS, None

    def process_view_change_ack_message(self, msg: ViewChangeAck, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        self._logger.info("{} processing {} from {}".format(self, msg, frm))

        if not self._data.is_primary:
            return PROCESS, None

        self.view_change_votes.add_view_change_ack(msg, frm)
        self._send_new_view_if_needed()
        return PROCESS, None

    def process_new_view_message(self, msg: NewView, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        self._logger.info("{} processing {} from {}".format(self, msg, frm))

        if frm != self._data.primary_name:
            self._logger.info(
                "{} Received NewView {} for view {} from non-primary {}; expected primary {}"
                .format(self._data.name, msg, self._data.view_no, frm,
                        self._data.primary_name))
            return DISCARD, "New View from non-Primary"

        self._data.new_view = msg
        self._finish_view_change_if_needed()
        return PROCESS, None

    def _validate(self, msg: Union[ViewChange, ViewChangeAck, NewView],
                  frm: str) -> int:
        # TODO: Proper validation
        if not self._data.is_master:
            return DISCARD

        if msg.viewNo < self._data.view_no:
            return DISCARD

        if msg.viewNo == self._data.view_no and not self._data.waiting_for_new_view:
            return DISCARD

        if msg.viewNo > self._data.view_no:
            return STASH_WAITING_VIEW_CHANGE

        return PROCESS

    def _send_new_view_if_needed(self):
        confirmed_votes = self.view_change_votes.confirmed_votes
        if not self._data.quorums.view_change.is_reached(len(confirmed_votes)):
            return

        view_changes = [
            self.view_change_votes.get_view_change(*v) for v in confirmed_votes
        ]
        cp = self._new_view_builder.calc_checkpoint(view_changes)
        if cp is None:
            return

        batches = self._new_view_builder.calc_batches(cp, view_changes)
        if batches is None:
            return

        if cp not in self._data.checkpoints:
            return

        nv = NewView(viewNo=self._data.view_no,
                     viewChanges=sorted(confirmed_votes, key=itemgetter(0)),
                     checkpoint=cp,
                     batches=batches)
        self._logger.info("{} sending {}".format(self, nv))
        self._network.send(nv)
        self._data.new_view = nv
        self._finish_view_change()

    def _finish_view_change_if_needed(self):
        if self._data.new_view is None:
            return

        view_changes = []
        for name, vc_digest in self._data.new_view.viewChanges:
            vc = self.view_change_votes.get_view_change(name, vc_digest)
            # We don't have the needed ViewChange, so we cannot validate the NewView
            if vc is None:
                self._request_view_change_message((name, vc_digest))
                return
            view_changes.append(vc)

        cp = self._new_view_builder.calc_checkpoint(view_changes)
        if cp is None or cp != self._data.new_view.checkpoint:
            # New primary is malicious
            self._logger.info(
                "{} Received invalid NewView {} for view {}: expected checkpoint {}"
                .format(self._data.name, self._data.new_view,
                        self._data.view_no, cp))
            self._propose_view_change(
                Suspicions.NEW_VIEW_INVALID_CHECKPOINTS.code)
            return

        batches = self._new_view_builder.calc_batches(cp, view_changes)
        if batches != self._data.new_view.batches:
            # New primary is malicious
            self._logger.info(
                "{} Received invalid NewView {} for view {}: expected batches {}"
                .format(self._data.name, self._data.new_view,
                        self._data.view_no, batches))
            self._propose_view_change(Suspicions.NEW_VIEW_INVALID_BATCHES.code)
            return

        self._finish_view_change()

    def _finish_view_change(self):
        # Update shared data
        self._data.waiting_for_new_view = False
        self._data.prev_view_prepare_cert = self._data.new_view.batches[-1].pp_seq_no \
            if self._data.new_view.batches else 0

        # Cancel View Change timeout task
        self._resend_inst_change_timer.stop()
        # send message to other services
        self._bus.send(
            NewViewAccepted(view_no=self._data.new_view.viewNo,
                            view_changes=self._data.new_view.viewChanges,
                            checkpoint=self._data.new_view.checkpoint,
                            batches=self._data.new_view.batches))
        self.last_completed_view_no = self._data.view_no

    def _propose_view_change_not_complete_in_time(self):
        self._propose_view_change(Suspicions.INSTANCE_CHANGE_TIMEOUT.code)
        if self._data.new_view is None:
            self._request_new_view_message(self._data.view_no)

    def _propose_view_change(self, suspicion_code):
        proposed_view_no = self._data.view_no
        # TODO: For some reason not incrementing view_no in most cases leads to lots of failing/flaky tests
        # if suspicion == Suspicions.INSTANCE_CHANGE_TIMEOUT or not self.view_change_in_progress:
        if suspicion_code != Suspicions.STATE_SIGS_ARE_NOT_UPDATED.code or not self._data.waiting_for_new_view:
            proposed_view_no += 1
        self._logger.info(
            "{} proposing a view change to {} with code {}".format(
                self, proposed_view_no, suspicion_code))
        msg = InstanceChange(proposed_view_no, suspicion_code)
        self._network.send(msg)

    def _request_new_view_message(self, view_no):
        self._bus.send(
            MissingMessage(msg_type=NEW_VIEW,
                           key=view_no,
                           inst_id=self._data.inst_id,
                           dst=[getNodeName(self._data.primary_name)],
                           stash_data=None))

    def _request_view_change_message(self, key):
        self._bus.send(
            MissingMessage(msg_type=VIEW_CHANGE,
                           key=key,
                           inst_id=self._data.inst_id,
                           dst=None,
                           stash_data=None))
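

# A minimal sketch (an assumption, not necessarily what plenum's
# RoundRobinPrimariesSelector does) of the round-robin selection used in
# process_need_view_change() above: the master primary rotates with the view
# number and the backup primaries follow it in validator order.
def select_primaries_sketch(view_no, instance_count, validators):
    start = view_no % len(validators)
    return [validators[(start + i) % len(validators)]
            for i in range(instance_count)]


# With 4 validators and f = 1 there are f + 1 = 2 instances:
# view 0 -> ['Alpha', 'Beta'], view 1 -> ['Beta', 'Gamma'], and so on.
print(select_primaries_sketch(1, 2, ['Alpha', 'Beta', 'Gamma', 'Delta']))
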
class StashingRouter(Router):
    Handler = Callable[..., Optional[Tuple[int, str]]]

    def __init__(self, limit: int, buses: List[Router], unstash_handler: Callable = None):
        super().__init__()
        self._limit = limit
        self._logger = getlogger()
        self._queues = {}  # type: Dict[int, StashingQueue]
        # TODO: This callback was added to preserve the original message order:
        # it is a replica method that moves the message to the inBox instead of
        # calling the handler immediately, as the default router does.
        self._unstash_handler = unstash_handler
        self._subscriptions = Subscription()
        self._buses = buses

    def set_sorted_stasher(self, code: int, key: Callable):
        self._queues[code] = SortedStash(self._limit, key)

    def subscribe(self, message_type: Type, handler: Handler) -> Router.SubscriptionID:
        # TODO: subscribe to one bus only
        for bus in self._buses:
            self._subscriptions.subscribe(bus,
                                          message_type,
                                          partial(self._process, handler))
        return super().subscribe(message_type, handler)

    def unsubscribe_from_all(self):
        self._subscriptions.unsubscribe_all()

    def process_all_stashed(self, code: Optional[int] = None):
        """
        Try to process all stashed messages, re-stashing some of them if needed

        :param code: stash code, None if we need to unstash all
        """
        if code is None:
            for code in sorted(self._queues.keys()):
                self.process_all_stashed(code)
            return

        queue = self._queues.get(code)
        if not queue:
            return

        data = queue.pop_all()
        self._logger.info("Unstash {} messages from stash with code {}".format(len(data), code))
        for msg_tuple in data:
            self._resolve_and_process(*msg_tuple)

    def process_stashed_until_first_restash(self, code: Optional[int] = None):
        """
        Try to process all stashed messages until handler indicates that some message
        needs to be stashed again (this can be especially useful with sorted stashes).

        :param code: stash code, None if we need to unstash all
        """
        if code is None:
            for code in sorted(self._queues.keys()):
                self.process_stashed_until_first_restash(code)
            return

        queue = self._queues.get(code)
        while queue:
            msg_tuple = queue.pop()
            if not self._resolve_and_process(*msg_tuple):
                break

    def stash_size(self, code: Optional[int] = None):
        if code is None:
            return sum(len(q) for q in self._queues.values())

        queue = self._queues.get(code)
        return len(queue) if queue else 0

    def _process(self, handler: Handler, message: Any, *args) -> bool:
        """
        Tries to process the message using the given handler. Returns True if
        the message was processed or discarded, False if it was stashed for
        reprocessing in the future.
        """
        result = handler(message, *args)
        code, reason = result if result else (None, None)

        # If the handler returned either None or PROCESS we assume it successfully
        # processed the message and no further action is needed
        if not code:
            return True

        if code == DISCARD:
            self.discard(message, args, reason)
            return True

        self._stash(code, reason, message, *args)
        return False

    def _resolve_and_process(self, message: Any, *args) -> bool:
        handlers = self.handlers(type(message))
        if len(handlers) == 0:
            raise LogicError("Handler for message {} not found".format(message))
        return self._unstash(handlers[0], message, *args)

    def _unstash(self, handler: Handler, message: Any, *args) -> bool:
        if self._unstash_handler is None:
            return self._process(handler, message, *args)
        else:
            self._unstash_handler((message, *args))

    def _stash(self, code: int, reason: str, message: Any, *args):
        self._logger.trace("Stashing message {} with metadata {} "
                           "with the reason {}".format(message, args, reason))

        queue = self._queues.setdefault(code, UnsortedStash(self._limit))
        if not queue.push(message, *args):
            # TODO: This would be better logged at info level with some throttling applied,
            #  however that calls for a generic, easy-to-use solution which we don't have yet.
            self._logger.debug("Cannot stash message {} with metadata {} - queue is full".format(message, args))

    def discard(self, msg, args, reason):
        self._logger.trace("Discarded message {} with metadata {} "
                           "with the reason {}".format(msg, args, reason))
Example #4
class MessageReqService:
    def __init__(self,
                 data: ConsensusSharedData,
                 bus: InternalBus,
                 network: ExternalBus,
                 metrics=NullMetricsCollector()):
        self._logger = getlogger()
        self._data = data
        self._bus = bus
        self._subscription = Subscription()
        self._subscription.subscribe(bus, MissingMessage,
                                     self.process_missing_message)
        self._subscription.subscribe(bus, Ordered, self.process_ordered)
        self._subscription.subscribe(bus, ViewChangeStarted,
                                     self.process_view_change_started)
        self._subscription.subscribe(bus, CheckpointStabilized,
                                     self.process_checkpoint_stabilized)

        self._network = network
        self._subscription.subscribe(network, MessageReq,
                                     self.process_message_req)
        self._subscription.subscribe(network, MessageRep,
                                     self.process_message_rep)

        self.metrics = metrics
        self.handlers = {
            PREPREPARE: PreprepareHandler(self._data),
            PREPARE: PrepareHandler(self._data),
            COMMIT: CommitHandler(self._data),
            VIEW_CHANGE: ViewChangeHandler(self._data)
        }
        self.three_pc_handlers = {PREPREPARE, PREPARE, COMMIT}

    @measure_time(MetricsName.PROCESS_MESSAGE_REQ_TIME)
    def process_message_req(self, msg: MessageReq, frm):
        # Assumes a shared-memory architecture. With a multiprocessing or RPC
        # architecture, use deques to communicate messages, and the node would
        # maintain a unique internal message id to correlate responses.
        msg_type = msg.msg_type
        if msg_type not in self.handlers.keys():
            self.discard(msg, "Unknown message type {}".format(msg_type),
                         self._logger.warning)
            return
        handler = self.handlers[msg_type]
        try:
            resp = handler.process_message_req(msg)
        except IncorrectMessageForHandlingException as e:
            self.discard(e.msg, e.reason, e.log_method)
            return

        if not resp:
            return

        with self.metrics.measure_time(MetricsName.SEND_MESSAGE_REP_TIME):
            self._network.send(MessageRep(
                **{
                    f.MSG_TYPE.nm: msg_type,
                    f.PARAMS.nm: msg.params,
                    f.MSG.nm: resp
                }),
                               dst=[
                                   frm.split(":")[0],
                               ])

    @measure_time(MetricsName.PROCESS_MESSAGE_REP_TIME)
    def process_message_rep(self, msg: MessageRep, frm):
        msg_type = msg.msg_type
        if msg_type not in self.handlers.keys():
            self.discard(msg, "Unknown message type {}".format(msg_type),
                         self._logger.warning)
            return
        if msg.msg is None:
            self._logger.debug(
                '{} got null response for requested {} from {}'.format(
                    self, msg_type, frm))
            return
        handler = self.handlers[msg_type]
        try:
            validated_msg, frm = handler.extract_message(msg, frm)
            self._network.process_incoming(validated_msg, frm)
        except IncorrectMessageForHandlingException as e:
            self.discard(e.msg, e.reason, e.log_method)

    @measure_time(MetricsName.SEND_MESSAGE_REQ_TIME)
    def process_missing_message(self, msg: MissingMessage):
        if msg.inst_id != self._data.inst_id:
            return
        # TODO: Using a timer to retry would be a better thing to do
        self._logger.trace('{} requesting {} for {} from {}'.format(
            self, msg.msg_type, msg.key, msg.dst))
        handler = self.handlers[msg.msg_type]

        try:
            params = handler.prepare_msg_to_request(msg.key, msg.stash_data)
        except IncorrectMessageForHandlingException as e:
            self.discard(e.msg, e.reason, e.log_method)
            return
        if params:
            self._network.send(
                MessageReq(**{
                    f.MSG_TYPE.nm: msg.msg_type,
                    f.PARAMS.nm: params
                }),
                dst=msg.dst)

    def process_ordered(self, msg: Ordered):
        for msg_type, handler in self.handlers.items():
            if msg_type not in self.three_pc_handlers:
                continue
            handler.requested_messages.pop((msg.viewNo, msg.ppSeqNo), None)

    def process_view_change_started(self, msg: ViewChangeStarted):
        for msg_type, handler in self.handlers.items():
            handler.cleanup()

    def process_checkpoint_stabilized(self, msg: CheckpointStabilized):
        for msg_type, handler in self.handlers.items():
            if msg_type not in self.three_pc_handlers:
                continue
            for key in list(handler.requested_messages.keys()):
                if compare_3PC_keys(key, msg.last_stable_3pc) >= 0:
                    handler.requested_messages.pop(key)

    def discard(self, msg, reason, logMethod=logging.error, cliOutput=False):
        """
        Discard a message and log a reason using the specified `logMethod`.

        :param msg: the message to discard
        :param reason: the reason why this message is being discarded
        :param logMethod: the logging function to be used
        :param cliOutput: if truthy, informs a CLI that the logged msg should
        be printed
        """
        reason = "" if not reason else " because {}".format(reason)
        logMethod("{} discarding message {}{}".format(self, msg, reason),
                  extra={"cli": cliOutput})

    @property
    def name(self):
        return self._data.name

    def __str__(self) -> str:
        return "{} - MessageReq3pcService".format(self._data.name)
Example #5
class CheckpointService:
    STASHED_CHECKPOINTS_BEFORE_CATCHUP = 1

    # TODO: Remove view_no from key after implementing INDY-1336
    CheckpointKey = NamedTuple('CheckpointKey', [('view_no', int),
                                                 ('pp_seq_no', int),
                                                 ('digest', str)])

    def __init__(
            self,
            data: ConsensusSharedData,
            bus: InternalBus,
            network: ExternalBus,
            stasher: StashingRouter,
            db_manager: DatabaseManager,
            metrics: MetricsCollector = NullMetricsCollector(),
    ):
        self._data = data
        self._bus = bus
        self._network = network
        self._stasher = stasher
        self._subscription = Subscription()
        self._validator = CheckpointMsgValidator(self._data)
        self._db_manager = db_manager
        self.metrics = metrics

        # Received checkpoints, mapping CheckpointKey -> Set(node_alias)
        self._received_checkpoints = defaultdict(
            set)  # type: Dict[CheckpointService.CheckpointKey, Set[str]]

        self._config = getConfig()
        self._logger = getlogger()

        self._subscription.subscribe(stasher, Checkpoint,
                                     self.process_checkpoint)

        self._subscription.subscribe(bus, Ordered, self.process_ordered)
        self._subscription.subscribe(bus, BackupSetupLastOrdered,
                                     self.process_backup_setup_last_ordered)
        self._subscription.subscribe(bus, NewViewAccepted,
                                     self.process_new_view_accepted)

    def cleanup(self):
        self._subscription.unsubscribe_all()

    @property
    def view_no(self):
        return self._data.view_no

    @property
    def is_master(self):
        return self._data.is_master

    @property
    def last_ordered_3pc(self):
        return self._data.last_ordered_3pc

    @measure_consensus_time(MetricsName.PROCESS_CHECKPOINT_TIME,
                            MetricsName.BACKUP_PROCESS_CHECKPOINT_TIME)
    def process_checkpoint(self, msg: Checkpoint, sender: str) -> (bool, str):
        """
        Process a checkpoint message.
        :return: a (result, reason) tuple if the message fails validation,
            None if it was processed
        """
        self._logger.info('{} processing checkpoint {} from {}'.format(
            self, msg, sender))
        result, reason = self._validator.validate(msg)
        if result != PROCESS:
            return result, reason

        key = self._checkpoint_key(msg)
        self._received_checkpoints[key].add(sender)
        self._try_to_stabilize_checkpoint(key)
        self._start_catchup_if_needed(key)

    def process_backup_setup_last_ordered(self, msg: BackupSetupLastOrdered):
        if msg.inst_id != self._data.inst_id:
            return
        self.update_watermark_from_3pc()

    def process_ordered(self, ordered: Ordered):
        for batch_id in reversed(self._data.preprepared):
            if batch_id.pp_seq_no == ordered.ppSeqNo:
                self._add_to_checkpoint(batch_id.pp_seq_no, batch_id.view_no,
                                        ordered.auditTxnRootHash)
                return
        raise LogicError(
            "CheckpointService | Can't process Ordered msg because "
            "ppSeqNo {} not in preprepared".format(ordered.ppSeqNo))

    def _start_catchup_if_needed(self, key: CheckpointKey):
        if self._have_own_checkpoint(key):
            return

        unknown_stabilized = self._unknown_stabilized_checkpoints()
        lag_in_checkpoints = len(unknown_stabilized)
        if lag_in_checkpoints <= self.STASHED_CHECKPOINTS_BEFORE_CATCHUP:
            return

        last_key = sorted(unknown_stabilized,
                          key=lambda v: (v.view_no, v.pp_seq_no))[-1]

        if self.is_master:
            # TODO: This code doesn't seem to be needed, but it was there. Leaving just in case
            #  tests explain why it was really needed.
            # self._logger.display(
            #     '{} has lagged for {} checkpoints so updating watermarks to {}'.format(
            #         self, lag_in_checkpoints, last_key.pp_seq_no))
            # self.set_watermarks(low_watermark=last_key.pp_seq_no)

            if not self._data.is_primary:
                self._logger.display(
                    '{} has lagged for {} checkpoints so the catchup procedure starts'
                    .format(self, lag_in_checkpoints))
                self._bus.send(NeedMasterCatchup())
        else:
            self._logger.info(
                '{} has lagged for {} checkpoints so adjust last_ordered_3pc to {}, '
                'shift watermarks and clean collections'.format(
                    self, lag_in_checkpoints, last_key.pp_seq_no))
            # Adjust last_ordered_3pc, shift watermarks, clean operational
            # collections and process stashed messages which now fit between
            # watermarks
            # TODO: Actually we might need to process view_no from last_key as well, however
            #  it wasn't processed before, and it will go away when INDY-1336 gets implemented
            key_3pc = (self.view_no, last_key.pp_seq_no)
            self._bus.send(
                NeedBackupCatchup(inst_id=self._data.inst_id,
                                  caught_up_till_3pc=key_3pc))
            self.caught_up_till_3pc(key_3pc)

    def gc_before_new_view(self):
        self._reset_checkpoints()
        # ToDo: till_3pc_key should be None?
        self._remove_received_checkpoints(till_3pc_key=(self.view_no, 0))

    def caught_up_till_3pc(self, caught_up_till_3pc):
        # TODO: Add checkpoint using audit ledger
        cp_seq_no = caught_up_till_3pc[
            1] // self._config.CHK_FREQ * self._config.CHK_FREQ
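        # e.g. with CHK_FREQ = 100, catching up to pp_seq_no 250 floors the
        # stable checkpoint to 200 (250 // 100 * 100), the last full
        # checkpoint boundary at or below the caught-up point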
        self._mark_checkpoint_stable(cp_seq_no)

    def catchup_clear_for_backup(self):
        self._reset_checkpoints()
        self._remove_received_checkpoints()
        self.set_watermarks(low_watermark=0, high_watermark=sys.maxsize)

    def _add_to_checkpoint(self, ppSeqNo, view_no, audit_txn_root_hash):
        if ppSeqNo % self._config.CHK_FREQ != 0:
            return

        key = self.CheckpointKey(view_no=view_no,
                                 pp_seq_no=ppSeqNo,
                                 digest=audit_txn_root_hash)

        self._do_checkpoint(ppSeqNo, view_no, audit_txn_root_hash)
        self._try_to_stabilize_checkpoint(key)

    @measure_consensus_time(MetricsName.SEND_CHECKPOINT_TIME,
                            MetricsName.BACKUP_SEND_CHECKPOINT_TIME)
    def _do_checkpoint(self, pp_seq_no, view_no, audit_txn_root_hash):
        self._logger.info(
            "{} sending Checkpoint {} view {} audit txn root hash {}".format(
                self, pp_seq_no, view_no, audit_txn_root_hash))

        checkpoint = Checkpoint(self._data.inst_id, view_no, 0, pp_seq_no,
                                audit_txn_root_hash)
        self._network.send(checkpoint)
        self._data.checkpoints.add(checkpoint)

    def _try_to_stabilize_checkpoint(self, key: CheckpointKey):
        if not self._have_quorum_on_received_checkpoint(key):
            return

        if not self._have_own_checkpoint(key):
            return

        self._mark_checkpoint_stable(key.pp_seq_no)

    def _mark_checkpoint_stable(self, pp_seq_no):
        self._data.stable_checkpoint = pp_seq_no

        stable_checkpoints = self._data.checkpoints.irange_key(
            min_key=pp_seq_no, max_key=pp_seq_no)
        if len(list(stable_checkpoints)) == 0:
            # TODO: Is it okay to get view_no like this?
            view_no = self._data.last_ordered_3pc[0]
            checkpoint = Checkpoint(instId=self._data.inst_id,
                                    viewNo=view_no,
                                    seqNoStart=0,
                                    seqNoEnd=pp_seq_no,
                                    digest=self._audit_txn_root_hash(
                                        view_no, pp_seq_no))
            self._data.checkpoints.add(checkpoint)

        for cp in self._data.checkpoints.copy():
            if cp.seqNoEnd < pp_seq_no:
                self._logger.trace("{} removing previous checkpoint {}".format(
                    self, cp))
                self._data.checkpoints.remove(cp)

        self.set_watermarks(low_watermark=pp_seq_no)

        self._remove_received_checkpoints(till_3pc_key=(self.view_no,
                                                        pp_seq_no))
        self._bus.send(CheckpointStabilized(
            (self.view_no, pp_seq_no)))  # call OrderingService.gc()
        self._logger.info("{} marked stable checkpoint {}".format(
            self, pp_seq_no))

    def set_watermarks(self, low_watermark: int, high_watermark: int = None):
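        # When no explicit high watermark is given it defaults to
        # low_watermark + LOG_SIZE; messages previously stashed because they
        # fell outside the watermark window are re-processed afterwards.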
        self._data.low_watermark = low_watermark
        self._data.high_watermark = self._data.low_watermark + self._config.LOG_SIZE \
            if high_watermark is None else \
            high_watermark

        self._logger.info('{} set watermarks as {} {}'.format(
            self, self._data.low_watermark, self._data.high_watermark))
        self._stasher.process_all_stashed(STASH_WATERMARKS)

    def update_watermark_from_3pc(self):
        last_ordered_3pc = self.last_ordered_3pc
        if (last_ordered_3pc is not None) and (last_ordered_3pc[0]
                                               == self.view_no):
            self._logger.info(
                "update_watermark_from_3pc to {}".format(last_ordered_3pc))
            self.set_watermarks(last_ordered_3pc[1])
        else:
            self._logger.info(
                "try to update_watermark_from_3pc but last_ordered_3pc "
                "is None or from an older view"
            )

    def _remove_received_checkpoints(self, till_3pc_key=None):
        """
        Remove received checkpoints up to `till_3pc_key` if provided,
        otherwise remove all received checkpoints
        """
        if till_3pc_key is None:
            self._received_checkpoints.clear()
            self._logger.info(
                '{} removing all received checkpoints'.format(self))
            return

        for cp in list(self._received_checkpoints.keys()):
            if self._is_below_3pc_key(cp, till_3pc_key):
                self._logger.info(
                    '{} removing received checkpoints: {}'.format(self, cp))
                del self._received_checkpoints[cp]

    def _reset_checkpoints(self):
        # This function is most probably redundant in the PBFT approach,
        # because according to the paper checkpoints are cleared only when the
        # next one is stabilized. Avoid using it while implementing other services.
        self._data.checkpoints.clear()
        self._data.checkpoints.append(self._data.initial_checkpoint)

    def __str__(self) -> str:
        return "{} - checkpoint_service".format(self._data.name)

    def discard(self, msg, reason, sender):
        self._logger.trace("{} discard message {} from {} "
                           "with the reason: {}".format(
                               self, msg, sender, reason))

    def _have_own_checkpoint(self, key: CheckpointKey) -> bool:
        own_checkpoints = self._data.checkpoints.irange_key(
            min_key=key.pp_seq_no, max_key=key.pp_seq_no)
        return any(cp.viewNo == key.view_no and cp.digest == key.digest
                   for cp in own_checkpoints)

    def _have_quorum_on_received_checkpoint(self, key: CheckpointKey) -> bool:
        votes = self._received_checkpoints[key]
        return self._data.quorums.checkpoint.is_reached(len(votes))

    def _unknown_stabilized_checkpoints(self) -> List[CheckpointKey]:
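        # A checkpoint is "unknown stabilized" when a quorum of other replicas
        # has reported it, we have not produced it ourselves, and it lies above
        # our last ordered 3PC key, i.e. this replica is lagging behind.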
        return [
            key for key in self._received_checkpoints
            if self._have_quorum_on_received_checkpoint(key)
            and not self._have_own_checkpoint(key)
            and not self._is_below_3pc_key(key, self.last_ordered_3pc)
        ]

    @staticmethod
    def _is_below_3pc_key(cp: CheckpointKey, key: Tuple[int, int]) -> bool:
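        # compare_3PC_keys orders by view_no first, then pp_seq_no; the check
        # is inclusive, so a checkpoint exactly at `key` also counts as below it.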
        return compare_3PC_keys((cp.view_no, cp.pp_seq_no), key) >= 0

    @staticmethod
    def _checkpoint_key(checkpoint: Checkpoint) -> CheckpointKey:
        return CheckpointService.CheckpointKey(view_no=checkpoint.viewNo,
                                               pp_seq_no=checkpoint.seqNoEnd,
                                               digest=checkpoint.digest)

    @staticmethod
    def _audit_seq_no_from_3pc_key(audit_ledger: Ledger, view_no: int,
                                   pp_seq_no: int) -> int:
        # TODO: Should we put it into some common code?
        seq_no = audit_ledger.size
        while seq_no > 0:
            txn = audit_ledger.getBySeqNo(seq_no)
            txn_data = get_payload_data(txn)
            audit_view_no = txn_data[AUDIT_TXN_VIEW_NO]
            audit_pp_seq_no = txn_data[AUDIT_TXN_PP_SEQ_NO]
            if audit_view_no == view_no and audit_pp_seq_no == pp_seq_no:
                break
            seq_no -= 1
        return seq_no

    def _audit_txn_root_hash(self, view_no: int,
                             pp_seq_no: int) -> Optional[str]:
        audit_ledger = self._db_manager.get_ledger(AUDIT_LEDGER_ID)
        # TODO: Should we remove view_no at some point?
        seq_no = self._audit_seq_no_from_3pc_key(audit_ledger, view_no,
                                                 pp_seq_no)
        # TODO: What should we do if txn not found or audit ledger is empty?
        if seq_no == 0:
            return None
        root_hash = audit_ledger.tree.merkle_tree_hash(0, seq_no)
        return audit_ledger.hashToStr(root_hash)

    def process_new_view_accepted(self, msg: NewViewAccepted):
        # 1. update shared data
        cp = msg.checkpoint
        if cp not in self._data.checkpoints:
            self._data.checkpoints.append(cp)
        self._mark_checkpoint_stable(cp.seqNoEnd)
        self.set_watermarks(low_watermark=cp.seqNoEnd)

        # 2. send NewViewCheckpointsApplied
        self._bus.send(
            NewViewCheckpointsApplied(view_no=msg.view_no,
                                      view_changes=msg.view_changes,
                                      checkpoint=msg.checkpoint,
                                      batches=msg.batches))
        return PROCESS, None
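
The watermark handling in set_watermarks above can be illustrated with a minimal, self-contained sketch. It assumes the usual convention that a batch is processable when low_watermark < pp_seq_no <= high_watermark; LOG_SIZE is an illustrative value here, while the real one comes from getConfig().

LOG_SIZE = 300  # illustrative; plenum takes this value from its config


def in_watermark_window(pp_seq_no: int, low_watermark: int,
                        high_watermark: int = None) -> bool:
    # Mirror set_watermarks: default the high watermark to low + LOG_SIZE.
    if high_watermark is None:
        high_watermark = low_watermark + LOG_SIZE
    return low_watermark < pp_seq_no <= high_watermark


assert in_watermark_window(1, low_watermark=0)
assert not in_watermark_window(301, low_watermark=0)
assert in_watermark_window(301, low_watermark=0, high_watermark=10 ** 9)
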
Example #6
class CheckpointService:
    STASHED_CHECKPOINTS_BEFORE_CATCHUP = 1

    def __init__(
            self,
            data: ConsensusSharedData,
            bus: InternalBus,
            network: ExternalBus,
            stasher: StashingRouter,
            db_manager: DatabaseManager,
            metrics: MetricsCollector = NullMetricsCollector(),
    ):
        self._data = data
        self._bus = bus
        self._network = network
        self._checkpoint_state = SortedDict(lambda k: k[1])
        self._stasher = stasher
        self._subscription = Subscription()
        self._validator = CheckpointMsgValidator(self._data)
        self._db_manager = db_manager
        self.metrics = metrics

        # Stashed checkpoints for each view. The key of the outermost
        # dictionary is the view_no, value being a dictionary with key as the
        # range of the checkpoint and its value again being a mapping between
        # senders and their sent checkpoint
        # Dict[view_no, Dict[(seqNoStart, seqNoEnd),  Dict[sender, Checkpoint]]]
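        # e.g. {0: {(1, 100): {'Beta': <Checkpoint>, 'Gamma': <Checkpoint>}}}
        # (the sender names here are purely illustrative)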
        self._stashed_recvd_checkpoints = {}

        self._config = getConfig()
        self._logger = getlogger()

        self._subscription.subscribe(stasher, Checkpoint,
                                     self.process_checkpoint)

        self._subscription.subscribe(bus, Ordered, self.process_ordered)
        self._subscription.subscribe(bus, BackupSetupLastOrdered,
                                     self.process_backup_setup_last_ordered)
        self._subscription.subscribe(bus, NewViewAccepted,
                                     self.process_new_view_accepted)

    def cleanup(self):
        self._subscription.unsubscribe_all()

    @property
    def view_no(self):
        return self._data.view_no

    @property
    def is_master(self):
        return self._data.is_master

    @property
    def last_ordered_3pc(self):
        return self._data.last_ordered_3pc

    @measure_consensus_time(MetricsName.PROCESS_CHECKPOINT_TIME,
                            MetricsName.BACKUP_PROCESS_CHECKPOINT_TIME)
    def process_checkpoint(self, msg: Checkpoint, sender: str) -> (bool, str):
        """
        Process checkpoint messages
        :return: whether processed (True) or stashed (False)
        """
        if msg.instId != self._data.inst_id:
            return None, None
        self._logger.info('{} processing checkpoint {} from {}'.format(
            self, msg, sender))
        result, reason = self._validator.validate(msg)
        if result == PROCESS:
            self._do_process_checkpoint(msg, sender)
        return result, reason

    def _do_process_checkpoint(self, msg: Checkpoint, sender: str) -> bool:
        """
        Process checkpoint messages

        :return: whether processed (True) or stashed (False)
        """
        seqNoEnd = msg.seqNoEnd
        seqNoStart = msg.seqNoStart
        key = (seqNoStart, seqNoEnd)

        if key not in self._checkpoint_state or not self._checkpoint_state[
                key].digest:
            self._stash_checkpoint(msg, sender)
            self._remove_stashed_checkpoints(self.last_ordered_3pc)
            self._start_catchup_if_needed()
            return False

        checkpoint_state = self._checkpoint_state[key]
        # Report the digest mismatch only on the master, since only the
        # master's last ordered 3PC is communicated during view change
        if self.is_master and checkpoint_state.digest != msg.digest:
            self._logger.warning("{} received an incorrect digest {} for "
                                 "checkpoint {} from {}".format(
                                     self, msg.digest, key, sender))
            return True

        checkpoint_state.receivedDigests[sender] = msg.digest
        self._check_if_checkpoint_stable(key)
        return True

    def process_backup_setup_last_ordered(self, msg: BackupSetupLastOrdered):
        if msg.inst_id != self._data.inst_id:
            return
        self.update_watermark_from_3pc()

    def process_ordered(self, ordered: Ordered):
        if ordered.instId != self._data.inst_id:
            return
        for batch_id in reversed(self._data.preprepared):
            if batch_id.pp_seq_no == ordered.ppSeqNo:
                self._add_to_checkpoint(batch_id.pp_seq_no, batch_id.pp_digest,
                                        ordered.ledgerId, batch_id.view_no,
                                        ordered.auditTxnRootHash)
                return
        raise LogicError(
            "CheckpointService | Can't process Ordered msg because "
            "ppSeqNo {} not in preprepared".format(ordered.ppSeqNo))

    def _start_catchup_if_needed(self):
        stashed_checkpoint_ends = self._stashed_checkpoints_with_quorum()
        lag_in_checkpoints = len(stashed_checkpoint_ends)
        if self._checkpoint_state:
            (s, e) = firstKey(self._checkpoint_state)
            # If the first stored own checkpoint has an unaligned lower bound
            # (meaning it was started after a catch-up), is complete, and there
            # is a quorumed stashed checkpoint from other replicas with the
            # same end, then don't count that stashed checkpoint towards the lag
            if s % self._config.CHK_FREQ != 0 \
                    and self._checkpoint_state[(s, e)].seqNo == e \
                    and e in stashed_checkpoint_ends:
                lag_in_checkpoints -= 1
        is_stashed_enough = \
            lag_in_checkpoints > self.STASHED_CHECKPOINTS_BEFORE_CATCHUP
        if not is_stashed_enough:
            return

        if self.is_master:
            self._logger.display(
                '{} has lagged for {} checkpoints so updating watermarks to {}'
                .format(self, lag_in_checkpoints, stashed_checkpoint_ends[-1]))
            self.set_watermarks(low_watermark=stashed_checkpoint_ends[-1])
            if not self._data.is_primary:
                self._logger.display(
                    '{} has lagged for {} checkpoints so the catchup procedure starts'
                    .format(self, lag_in_checkpoints))
                self._bus.send(NeedMasterCatchup())
        else:
            self._logger.info(
                '{} has lagged for {} checkpoints so adjust last_ordered_3pc to {}, '
                'shift watermarks and clean collections'.format(
                    self, lag_in_checkpoints, stashed_checkpoint_ends[-1]))
            # Adjust last_ordered_3pc, shift watermarks, clean operational
            # collections and process stashed messages which now fit between
            # watermarks
            key_3pc = (self.view_no, stashed_checkpoint_ends[-1])
            self._bus.send(
                NeedBackupCatchup(inst_id=self._data.inst_id,
                                  caught_up_till_3pc=key_3pc))
            self.caught_up_till_3pc(key_3pc)

    def gc_before_new_view(self):
        self._reset_checkpoints()
        self._remove_stashed_checkpoints(till_3pc_key=(self.view_no, 0))

    def caught_up_till_3pc(self, caught_up_till_3pc):
        self._reset_checkpoints()
        self._remove_stashed_checkpoints(till_3pc_key=caught_up_till_3pc)
        self.update_watermark_from_3pc()

    def catchup_clear_for_backup(self):
        self._reset_checkpoints()
        self._remove_stashed_checkpoints()
        self.set_watermarks(low_watermark=0, high_watermark=sys.maxsize)

    def _add_to_checkpoint(self, ppSeqNo, digest, ledger_id, view_no,
                           audit_txn_root_hash):
        for (s, e) in self._checkpoint_state.keys():
            if s <= ppSeqNo <= e:
                state = self._checkpoint_state[s, e]  # type: CheckpointState
                state.digests.append(digest)
                state = updateNamedTuple(state, seqNo=ppSeqNo)
                self._checkpoint_state[s, e] = state
                break
        else:
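            # No existing window covers ppSeqNo: open a new one starting at
            # ppSeqNo and ending at the next CHK_FREQ boundary, e.g. ppSeqNo
            # 101 with CHK_FREQ == 100 gives the window (101, 200).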
            s, e = ppSeqNo, math.ceil(
                ppSeqNo / self._config.CHK_FREQ) * self._config.CHK_FREQ
            self._logger.debug("{} adding new checkpoint state for {}".format(
                self, (s, e)))
            state = CheckpointState(ppSeqNo, [
                digest,
            ], None, {}, False)
            self._checkpoint_state[s, e] = state

        if state.seqNo == e:
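            # The last batch of this window has been ordered; if we have also
            # collected CHK_FREQ digests, send our own Checkpoint, then try to
            # apply any checkpoints stashed from other replicas for this window.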
            if len(state.digests) == self._config.CHK_FREQ:
                self._do_checkpoint(state, s, e, ledger_id, view_no,
                                    audit_txn_root_hash)
            self._process_stashed_checkpoints((s, e), view_no)

    @measure_consensus_time(MetricsName.SEND_CHECKPOINT_TIME,
                            MetricsName.BACKUP_SEND_CHECKPOINT_TIME)
    def _do_checkpoint(self, state, s, e, ledger_id, view_no,
                       audit_txn_root_hash):
        # TODO CheckpointState/Checkpoint is not a namedtuple anymore
        # 1. check if updateNamedTuple works for the new message type
        # 2. choose another name

        # TODO: This is hack of hacks, should be removed when refactoring is complete
        if not self.is_master and audit_txn_root_hash is None:
            audit_txn_root_hash = "7RJ5bkAKRy2CCvarRij2jiHC16SVPjHcrpVdNsboiQGv"

        state = updateNamedTuple(state, digest=audit_txn_root_hash, digests=[])
        self._checkpoint_state[s, e] = state
        self._logger.info(
            "{} sending Checkpoint {} view {} checkpointState digest {}. Ledger {} "
            "txn root hash {}. Committed state root hash {} Uncommitted state root hash {}"
            .format(
                self, (s, e), view_no, state.digest, ledger_id,
                self._db_manager.get_txn_root_hash(ledger_id),
                self._db_manager.get_state_root_hash(ledger_id,
                                                     committed=True),
                self._db_manager.get_state_root_hash(ledger_id,
                                                     committed=False)))
        checkpoint = Checkpoint(self._data.inst_id, view_no, s, e,
                                state.digest)
        self._network.send(checkpoint)
        self._data.checkpoints.append(checkpoint)

    def _mark_checkpoint_stable(self, seqNo):
        previousCheckpoints = []
        for (s, e), state in self._checkpoint_state.items():
            if e == seqNo:
                # TODO CheckpointState/Checkpoint is not a namedtuple anymore
                # 1. check if updateNamedTuple works for the new message type
                # 2. choose another name
                state = updateNamedTuple(state, isStable=True)
                self._checkpoint_state[s, e] = state
                self._set_stable_checkpoint(e)
                break
            else:
                previousCheckpoints.append((s, e))
        else:
            self._logger.debug("{} could not find {} in checkpoints".format(
                self, seqNo))
            return
        self.set_watermarks(low_watermark=seqNo)
        for k in previousCheckpoints:
            self._logger.trace("{} removing previous checkpoint {}".format(
                self, k))
            self._checkpoint_state.pop(k)
        self._remove_stashed_checkpoints(till_3pc_key=(self.view_no, seqNo))
        self._bus.send(
            CheckpointStabilized(self._data.inst_id, (self.view_no, seqNo)))
        self._logger.info("{} marked stable checkpoint {}".format(
            self, (s, e)))

    def _check_if_checkpoint_stable(self, key: Tuple[int, int]):
        ckState = self._checkpoint_state[key]
        if self._data.quorums.checkpoint.is_reached(
                len(ckState.receivedDigests)):
            self._mark_checkpoint_stable(ckState.seqNo)
            return True
        else:
            self._logger.debug('{} has state.receivedDigests as {}'.format(
                self, ckState.receivedDigests.keys()))
            return False

    def _stash_checkpoint(self, ck: Checkpoint, sender: str):
        self._logger.debug('{} stashing {} from {}'.format(self, ck, sender))
        seqNoStart, seqNoEnd = ck.seqNoStart, ck.seqNoEnd
        if ck.viewNo not in self._stashed_recvd_checkpoints:
            self._stashed_recvd_checkpoints[ck.viewNo] = {}
        stashed_for_view = self._stashed_recvd_checkpoints[ck.viewNo]
        if (seqNoStart, seqNoEnd) not in stashed_for_view:
            stashed_for_view[seqNoStart, seqNoEnd] = {}
        stashed_for_view[seqNoStart, seqNoEnd][sender] = ck

    def _stashed_checkpoints_with_quorum(self):
        end_pp_seq_numbers = []
        quorum = self._data.quorums.checkpoint
        for (_, seq_no_end), senders in self._stashed_recvd_checkpoints.get(
                self.view_no, {}).items():
            if quorum.is_reached(len(senders)):
                end_pp_seq_numbers.append(seq_no_end)
        return sorted(end_pp_seq_numbers)

    def _process_stashed_checkpoints(self, key, view_no):
        # Remove all checkpoints from previous views if any
        self._remove_stashed_checkpoints(till_3pc_key=(self.view_no, 0))

        if key not in self._stashed_recvd_checkpoints.get(view_no, {}):
            self._logger.trace("{} have no stashed checkpoints for {}")
            return

        # Get a snapshot of all the senders of stashed checkpoints for `key`
        senders = list(self._stashed_recvd_checkpoints[view_no][key].keys())
        total_processed = 0
        consumed = 0

        for sender in senders:
            # Check if the checkpoint from `sender` is still in
            # `stashed_recvd_checkpoints` because it might be removed from there
            # in case own checkpoint was stabilized when we were processing
            # stashed checkpoints from previous senders in this loop
            if view_no in self._stashed_recvd_checkpoints \
                    and key in self._stashed_recvd_checkpoints[view_no] \
                    and sender in self._stashed_recvd_checkpoints[view_no][key]:
                if self.process_checkpoint(
                        self._stashed_recvd_checkpoints[view_no][key].pop(
                            sender), sender):
                    consumed += 1
                # Note that if `process_checkpoint` returned False then the
                # checkpoint from `sender` was re-stashed back to
                # `stashed_recvd_checkpoints`
                total_processed += 1

        # If we have consumed stashed checkpoints for `key` from all the
        # senders then remove entries which have become empty
        if view_no in self._stashed_recvd_checkpoints \
                and key in self._stashed_recvd_checkpoints[view_no] \
                and len(self._stashed_recvd_checkpoints[view_no][key]) == 0:
            del self._stashed_recvd_checkpoints[view_no][key]
            if len(self._stashed_recvd_checkpoints[view_no]) == 0:
                del self._stashed_recvd_checkpoints[view_no]

        restashed = total_processed - consumed
        self._logger.info('{} processed {} stashed checkpoints for {}, '
                          '{} of them were stashed again'.format(
                              self, total_processed, key, restashed))

        return total_processed

    def reset_watermarks_before_new_view(self):
        # Reset any previous view watermarks since for view change to
        # successfully complete, the node must have reached the same state
        # as other nodes
        self.set_watermarks(low_watermark=0)

    def should_reset_watermarks_before_new_view(self):
        if self.view_no <= 0:
            return False
        if self.last_ordered_3pc[
                0] == self.view_no and self.last_ordered_3pc[1] > 0:
            return False
        return True

    def set_watermarks(self, low_watermark: int, high_watermark: int = None):
        self._data.low_watermark = low_watermark
        self._data.high_watermark = self._data.low_watermark + self._config.LOG_SIZE \
            if high_watermark is None else \
            high_watermark

        self._logger.info('{} set watermarks as {} {}'.format(
            self, self._data.low_watermark, self._data.high_watermark))
        self._stasher.process_all_stashed(STASH_WATERMARKS)

    def update_watermark_from_3pc(self):
        last_ordered_3pc = self.last_ordered_3pc
        if (last_ordered_3pc is not None) and (last_ordered_3pc[0]
                                               == self.view_no):
            self._logger.info(
                "update_watermark_from_3pc to {}".format(last_ordered_3pc))
            self.set_watermarks(last_ordered_3pc[1])
        else:
            self._logger.info(
                "try to update_watermark_from_3pc but last_ordered_3pc "
                "is None or from an older view"
            )

    def _remove_stashed_checkpoints(self, till_3pc_key=None):
        """
        Remove stashed received checkpoints up to `till_3pc_key` if provided,
        otherwise remove all stashed received checkpoints
        """
        if till_3pc_key is None:
            self._stashed_recvd_checkpoints.clear()
            self._logger.info(
                '{} removing all stashed checkpoints'.format(self))
            return

        for view_no in list(self._stashed_recvd_checkpoints.keys()):

            if view_no < till_3pc_key[0]:
                self._logger.info(
                    '{} removing stashed checkpoints for view {}'.format(
                        self, view_no))
                del self._stashed_recvd_checkpoints[view_no]

            elif view_no == till_3pc_key[0]:
                for (s, e) in list(
                        self._stashed_recvd_checkpoints[view_no].keys()):
                    if e <= till_3pc_key[1]:
                        self._logger.info(
                            '{} removing stashed checkpoints: '
                            'viewNo={}, seqNoStart={}, seqNoEnd={}'.format(
                                self, view_no, s, e))
                        del self._stashed_recvd_checkpoints[view_no][(s, e)]
                if len(self._stashed_recvd_checkpoints[view_no]) == 0:
                    del self._stashed_recvd_checkpoints[view_no]

    def _reset_checkpoints(self):
        # This function is most probably redundant in the PBFT approach,
        # because according to the paper checkpoints are cleared only when the
        # next one is stabilized. Avoid using it while implementing other services.
        self._checkpoint_state.clear()
        self._data.checkpoints.clear()
        # TODO: change to = 1 in ViewChangeService integration.
        self._data.stable_checkpoint = 0

    def _set_stable_checkpoint(self, end_seq_no):
        if not list(self._data.checkpoints.irange_key(end_seq_no, end_seq_no)):
            raise LogicError('Stable checkpoint must be in checkpoints')
        self._data.stable_checkpoint = end_seq_no

        self._data.checkpoints = \
            SortedListWithKey([c for c in self._data.checkpoints if c.seqNoEnd >= end_seq_no],
                              key=lambda checkpoint: checkpoint.seqNoEnd)

    def __str__(self) -> str:
        return "{} - checkpoint_service".format(self._data.name)

    # TODO: move to OrderingService as a handler for Cleanup messages
    # def _clear_batch_till_seq_no(self, seq_no):
    #     self._data.preprepared = [pp for pp in self._data.preprepared if pp.ppSeqNo >= seq_no]
    #     self._data.prepared = [p for p in self._data.prepared if p.ppSeqNo >= seq_no]

    def discard(self, msg, reason, sender):
        self._logger.trace("{} discard message {} from {} "
                           "with the reason: {}".format(
                               self, msg, sender, reason))

    def process_new_view_accepted(self, msg: NewViewAccepted):
        # 1. update shared data
        cp = msg.checkpoint
        if cp not in self._data.checkpoints:
            self._data.checkpoints.append(cp)
        self._set_stable_checkpoint(cp.seqNoEnd)
        self.set_watermarks(low_watermark=cp.seqNoEnd)

        # 2. send NewViewCheckpointsApplied
        self._bus.send(
            NewViewCheckpointsApplied(view_no=msg.view_no,
                                      view_changes=msg.view_changes,
                                      checkpoint=msg.checkpoint,
                                      batches=msg.batches))
        return PROCESS, None
Example #7
class ViewChangeTriggerService:
    def __init__(self,
                 data: ConsensusSharedData,
                 timer: TimerService,
                 bus: InternalBus,
                 network: ExternalBus,
                 db_manager: DatabaseManager,
                 stasher: StashingRouter,
                 is_master_degraded: Callable[[], bool],
                 metrics: MetricsCollector = NullMetricsCollector()):
        self._data = data
        self._timer = timer
        self._bus = bus
        self._network = network
        self._stasher = stasher
        self._is_master_degraded = is_master_degraded
        self.metrics = metrics

        self._config = getConfig()

        self._instance_changes = \
            InstanceChangeProvider(outdated_ic_interval=self._config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                                   node_status_db=db_manager.get_store(NODE_STATUS_DB_LABEL),
                                   time_provider=timer.get_current_time)

        self._subscription = Subscription()
        self._subscription.subscribe(bus, VoteForViewChange, self.process_vote_for_view_change)
        self._subscription.subscribe(bus, NewViewAccepted, self.process_new_view_accepted)
        self._subscription.subscribe(stasher, InstanceChange, self.process_instance_change)

    def cleanup(self):
        self._subscription.unsubscribe_all()

    @property
    def name(self):
        return replica_name_to_node_name(self._data.name)

    def __repr__(self):
        return self.name

    def process_vote_for_view_change(self, msg: VoteForViewChange):
        proposed_view_no = self._data.view_no
        # TODO: Some time ago it was proposed that view_no should not be increased during proposal
        #  if view change is already in progress, unless suspicion code is "view change is taking too long".
        #  Idea was to improve stability of view change triggering, however for some reason this change lead
        #  to lots of failing/flaky tests. This still needs to be investigated.
        # if suspicion == Suspicions.INSTANCE_CHANGE_TIMEOUT or not self.view_change_in_progress:
        if msg.suspicion != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self._data.waiting_for_new_view:
            proposed_view_no += 1
        self._send_instance_change(proposed_view_no, msg.suspicion)

    def process_instance_change(self, msg: InstanceChange, frm: str):
        frm = replica_name_to_node_name(frm)

        # TODO: Do we really need this?
        if frm not in self._network.connecteds:
            return DISCARD, "instance change request: {} from {} which is not in connected list: {}".\
                format(msg, frm, self._network.connecteds)

        if not self._data.is_participating:
            return STASH_CATCH_UP, CATCHING_UP

        logger.info("{} received instance change request: {} from {}".format(self, msg, frm))

        if msg.viewNo <= self._data.view_no:
            return DISCARD, "instance change request with view no {} which is not more than its view no {}".\
                format(msg.viewNo, self._data.view_no)

        # Record instance changes for views, but send an instance change only
        # when the master is found to be degraded. If a quorum of instance
        # changes is found, change the view even if the master is not degraded.
        self._on_verified_instance_change_msg(msg, frm)

        if self._instance_changes.has_inst_chng_from(msg.viewNo, self.name):
            logger.info("{} received instance change message {} "
                        "but has already sent an instance change message".format(self, msg))
        elif not self._is_master_degraded():
            logger.info("{} received instance change message {} "
                        "but did not find the master to be slow".format(self, msg))
        else:
            logger.display("{}{} found master degraded after "
                           "receiving instance change message from {}".format(VIEW_CHANGE_PREFIX, self, frm))
            self._send_instance_change(msg.viewNo, Suspicions.PRIMARY_DEGRADED)

    def process_new_view_accepted(self, msg: NewViewAccepted):
        self._instance_changes.remove_view(self._data.view_no)

    def _send_instance_change(self, view_no: int, suspicion: Suspicion):
        logger.info("{}{} sending an instance change with view_no {} since {}".
                    format(VIEW_CHANGE_PREFIX, self, view_no, suspicion.reason))
        msg = InstanceChange(view_no, suspicion.code)
        self._network.send(msg)
        # record instance change vote for self and try to change the view if quorum is reached
        self._on_verified_instance_change_msg(msg, self.name)

    def _on_verified_instance_change_msg(self, msg: InstanceChange, frm: str):
        view_no = msg.viewNo

        if not self._instance_changes.has_inst_chng_from(view_no, frm):
            self._instance_changes.add_vote(msg, frm)
            if view_no > self._data.view_no:
                self._try_start_view_change_by_instance_change(view_no)

    def _try_start_view_change_by_instance_change(self, proposed_view_no: int) -> bool:
        # TODO: Need to handle skewed distributions which can arise due to
        #  malicious nodes sending messages early on
        can, why_not = self._can_view_change(proposed_view_no)
        if can:
            logger.display("{}{} initiating a view change to {} from {}".
                           format(VIEW_CHANGE_PREFIX, self, proposed_view_no, self._data.view_no))
            self._bus.send(NodeNeedViewChange(view_no=proposed_view_no))
        else:
            logger.info(why_not)
        return can

    def _can_view_change(self, proposed_view_no: int) -> (bool, str):
        quorum = self._data.quorums.view_change.value
        if not self._instance_changes.has_quorum(proposed_view_no, quorum):
            return False, '{} has no quorum for view {}'.format(self, proposed_view_no)
        if not proposed_view_no > self._data.view_no:
            return False, '{} is already in view {} which is not lower than {}'.format(
                self, self._data.view_no, proposed_view_no)
        return True, ''
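
The vote bookkeeping behind _on_verified_instance_change_msg and _try_start_view_change_by_instance_change can be sketched with a minimal, self-contained example: a view change to view V is proposed once a quorum of distinct nodes has voted for V. The InstanceChangeVotes class and the quorum value below are purely illustrative; the real implementation uses InstanceChangeProvider and self._data.quorums.view_change.

from collections import defaultdict


class InstanceChangeVotes:
    def __init__(self, quorum: int):
        self.quorum = quorum
        self._votes = defaultdict(set)  # view_no -> set of voter names

    def add_vote(self, view_no: int, frm: str) -> bool:
        """Record a vote and report whether the quorum for view_no is reached."""
        self._votes[view_no].add(frm)
        return len(self._votes[view_no]) >= self.quorum


votes = InstanceChangeVotes(quorum=3)
assert not votes.add_vote(1, "Alpha")
assert not votes.add_vote(1, "Beta")
assert votes.add_vote(1, "Gamma")
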
Example #8
class PrimaryConnectionMonitorService:
    def __init__(self,
                 data: ConsensusSharedData,
                 timer: TimerService,
                 bus: InternalBus,
                 network: ExternalBus,
                 metrics: MetricsCollector = NullMetricsCollector()):
        self._data = data
        self._timer = timer
        self._bus = bus
        self._network = network
        self.metrics = metrics

        self._config = getConfig()

        self._primary_disconnection_time = \
            timer.get_current_time()  # type: Optional[float]

        self._propose_view_change_timer = RepeatingTimer(
            timer=timer,
            interval=self._config.NEW_VIEW_TIMEOUT,
            callback=self._propose_view_change_if_needed,
            active=False)

        self._subscription = Subscription()
        self._subscription.subscribe(network, ExternalBus.Connected,
                                     self.process_connected)
        self._subscription.subscribe(network, ExternalBus.Disconnected,
                                     self.process_disconnected)
        self._subscription.subscribe(bus, PrimarySelected,
                                     self.process_primary_selected)
        self._subscription.subscribe(bus, NodeStatusUpdated,
                                     self.process_node_status_updated)

        if self._data.is_master:
            self._schedule_primary_connection_check(
                delay=self._config.INITIAL_PROPOSE_VIEW_CHANGE_TIMEOUT)

    def __str__(self):
        return self._data.name

    def cleanup(self):
        self._subscription.unsubscribe_all()
        self._propose_view_change_timer.stop()

    def process_connected(self, msg: ExternalBus.Connected, frm: str):
        if frm == replica_name_to_node_name(self._data.primary_name):
            self._primary_connected()

    def process_disconnected(self, msg: ExternalBus.Disconnected, frm: str):
        if frm == replica_name_to_node_name(self._data.primary_name):
            self._primary_disconnected()

    def process_primary_selected(self, msg: PrimarySelected):
        primary_name = replica_name_to_node_name(self._data.primary_name)
        primary_connected = self._data.is_primary or primary_name in self._network.connecteds
        if primary_connected:
            self._primary_connected()
        else:
            self._primary_disconnected()

    def process_node_status_updated(self, msg: NodeStatusUpdated):
        # TODO: This was ported from old code, but there is quite high chance
        #  that this functionality is not needed anymore
        if msg.old_status == Status.starting \
                and msg.new_status == Status.started_hungry \
                and self._primary_disconnection_time is not None \
                and self._data.primary_name is not None:
            """
            Such situation may occur if the pool has come back to reachable consensus but
            primary is still disconnected, so view change proposal makes sense now.
            """
            self._schedule_primary_connection_check()

    def _primary_connected(self):
        self._primary_disconnection_time = None

        if self._data.is_master:
            logger.display(
                '{} restored connection to primary of master'.format(self))
            self._timer.cancel(self._check_primary_connection)
            self._propose_view_change_timer.stop()

    def _primary_disconnected(self):
        self._primary_disconnection_time = self._timer.get_current_time()
        self._bus.send(PrimaryDisconnected(inst_id=self._data.inst_id))

        if self._data.is_master:
            logger.display(
                '{} lost connection to primary of master'.format(self))
            self._schedule_primary_connection_check()

    def _schedule_primary_connection_check(self,
                                           delay: Optional[float] = None):
        if delay is None:
            delay = self._config.ToleratePrimaryDisconnection

        logger.info('{} scheduling primary connection check in {} sec'.format(
            self, delay))
        self._timer.schedule(delay=delay,
                             callback=self._check_primary_connection)

    def _check_primary_connection(self):
        if self._primary_disconnection_time is None:
            logger.info('{} The primary is already connected '
                        'so view change will not be proposed'.format(self))
            return

        logger.display(
            "{} primary has been disconnected for too long".format(self))

        if not self._data.is_ready or not self._data.is_synced:
            logger.info(
                '{} The node is not ready yet so '
                'view change will not be proposed now, but re-scheduled.'.
                format(self))
            self._schedule_primary_connection_check()
            return

        self._propose_view_change()
        self._propose_view_change_timer.start()

    def _propose_view_change(self):
        self._bus.send(
            VoteForViewChange(suspicion=Suspicions.PRIMARY_DISCONNECTED))

    def _propose_view_change_if_needed(self):
        if self._primary_disconnection_time is not None:
            self._propose_view_change()
Example #9
class ViewChangeService:
    def __init__(self, data: ConsensusSharedData, timer: TimerService,
                 bus: InternalBus, network: ExternalBus,
                 stasher: StashingRouter):
        self._config = getConfig()
        self._logger = getlogger()

        self._data = data
        self._new_view_builder = NewViewBuilder(self._data)
        self._timer = timer
        self._bus = bus
        self._network = network
        self._router = stasher
        self._votes = ViewChangeVotesForView(self._data.quorums)
        self._new_view = None  # type: Optional[NewView]

        self._router.subscribe(ViewChange, self.process_view_change_message)
        self._router.subscribe(ViewChangeAck,
                               self.process_view_change_ack_message)
        self._router.subscribe(NewView, self.process_new_view_message)

        self._old_prepared = {}  # type: Dict[int, BatchID]
        self._old_preprepared = {}  # type: Dict[int, List[BatchID]]
        self._primaries_selector = RoundRobinPrimariesSelector()

        self._subscription = Subscription()
        self._subscription.subscribe(self._bus, NeedViewChange,
                                     self.process_need_view_change)

    def __repr__(self):
        return self._data.name

    def process_need_view_change(self, msg: NeedViewChange):
        # 1. calculate new viewno
        view_no = msg.view_no
        if view_no is None:
            view_no = self._data.view_no + 1

        # 2. Do cleanup before new view change starts
        self._clean_on_view_change_start()

        # 3. Update shared data
        self._data.view_no = view_no
        self._data.waiting_for_new_view = True
        self._data.primaries = self._primaries_selector.select_primaries(
            view_no=self._data.view_no,
            instance_count=self._data.quorums.f + 1,
            validators=self._data.validators)
        self._data.primary_name = self._data.primaries[self._data.inst_id]

        # 4. Build ViewChange message
        vc = self._build_view_change_msg()

        # 5. Send ViewChangeStarted via internal bus to update other services
        self._bus.send(ViewChangeStarted(view_no=self._data.view_no))

        # 6. Send ViewChange msg to other nodes (via external bus)
        self._network.send(vc)
        self._votes.add_view_change(vc, self._data.name)

        # 7. Unstash messages for new view
        self._router.process_all_stashed()

    def _clean_on_view_change_start(self):
        self._clear_old_batches(self._old_prepared)
        self._clear_old_batches(self._old_preprepared)
        self._votes.clear()
        self._new_view = None

    def _clear_old_batches(self, batches: Dict[int, Any]):
        for pp_seq_no in list(batches.keys()):
            if pp_seq_no <= self._data.stable_checkpoint:
                del batches[pp_seq_no]

    def _build_view_change_msg(self):
        for batch_id in self._data.prepared:
            self._old_prepared[batch_id.pp_seq_no] = batch_id
        prepared = sorted([tuple(bid) for bid in self._old_prepared.values()])

        for new_bid in self._data.preprepared:
            pretenders = self._old_preprepared.get(new_bid.pp_seq_no, [])
            pretenders = [
                bid for bid in pretenders if bid.pp_digest != new_bid.pp_digest
            ]
            pretenders.append(new_bid)
            self._old_preprepared[new_bid.pp_seq_no] = pretenders
        preprepared = sorted([
            tuple(bid) for bids in self._old_preprepared.values()
            for bid in bids
        ])

        return ViewChange(viewNo=self._data.view_no,
                          stableCheckpoint=self._data.stable_checkpoint,
                          prepared=prepared,
                          preprepared=preprepared,
                          checkpoints=list(self._data.checkpoints))

    def process_view_change_message(self, msg: ViewChange, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        self._votes.add_view_change(msg, frm)

        if self._data.is_primary:
            self._send_new_view_if_needed()
            return PROCESS, None

        vca = ViewChangeAck(viewNo=msg.viewNo,
                            name=frm,
                            digest=view_change_digest(msg))
        self._network.send(vca, self._data.primary_name)

        self._finish_view_change_if_needed()
        return PROCESS, None

    def process_view_change_ack_message(self, msg: ViewChangeAck, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        if not self._data.is_primary:
            return PROCESS, None

        self._votes.add_view_change_ack(msg, frm)
        self._send_new_view_if_needed()
        return PROCESS, None

    def process_new_view_message(self, msg: NewView, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        if frm != self._data.primary_name:
            self._logger.info(
                "{} Received NewView {} for view {} from non-primary {}; expected primary {}"
                .format(self._data.name, msg, self._data.view_no, frm,
                        self._data.primary_name))
            return DISCARD, "New View from non-Primary"

        self._new_view = msg
        self._finish_view_change_if_needed()
        return PROCESS, None

    def _validate(self, msg: Union[ViewChange, ViewChangeAck, NewView],
                  frm: str) -> int:
        # TODO: Proper validation

        if msg.viewNo < self._data.view_no:
            return DISCARD

        if msg.viewNo == self._data.view_no and not self._data.waiting_for_new_view:
            return DISCARD

        if msg.viewNo > self._data.view_no:
            return STASH_VIEW

        return PROCESS

    def _send_new_view_if_needed(self):
        confirmed_votes = self._votes.confirmed_votes
        if not self._data.quorums.view_change.is_reached(len(confirmed_votes)):
            return

        view_changes = [
            self._votes.get_view_change(*v) for v in confirmed_votes
        ]
        cp = self._new_view_builder.calc_checkpoint(view_changes)
        if cp is None:
            return

        batches = self._new_view_builder.calc_batches(cp, view_changes)
        if batches is None:
            return

        nv = NewView(viewNo=self._data.view_no,
                     viewChanges=sorted(confirmed_votes, key=itemgetter(0)),
                     checkpoint=cp,
                     batches=batches)
        self._network.send(nv)
        self._new_view = nv
        self._finish_view_change()

    def _finish_view_change_if_needed(self):
        if self._new_view is None:
            return

        view_changes = []
        for name, vc_digest in self._new_view.viewChanges:
            vc = self._votes.get_view_change(name, vc_digest)
            # We don't have needed ViewChange, so we cannot validate NewView
            if vc is None:
                return
            view_changes.append(vc)

        cp = self._new_view_builder.calc_checkpoint(view_changes)
        if cp is None or cp != self._new_view.checkpoint:
            # New primary is malicious
            self._logger.info(
                "{} Received invalid NewView {} for view {}: expected checkpoint {}"
                .format(self._data.name, self._new_view, self._data.view_no,
                        cp))
            self._bus.send(NeedViewChange())
            return

        batches = self._new_view_builder.calc_batches(cp, view_changes)
        if batches != self._new_view.batches:
            # New primary is malicious
            self._logger.info(
                "{} Received invalid NewView {} for view {}: expected batches {}"
                .format(self._data.name, self._new_view, self._data.view_no,
                        batches))
            self._bus.send(NeedViewChange())
            return

        self._finish_view_change()

    def _finish_view_change(self):
        # Update shared data
        self._data.waiting_for_new_view = False

        # send message to other services
        self._bus.send(
            NewViewAccepted(view_no=self._new_view.viewNo,
                            view_changes=self._new_view.viewChanges,
                            checkpoint=self._new_view.checkpoint,
                            batches=self._new_view.batches))
Example #10
class ViewChangeService:
    def __init__(self, data: ConsensusSharedData, timer: TimerService,
                 bus: InternalBus, network: ExternalBus,
                 stasher: StashingRouter,
                 primaries_selector: PrimariesSelector):
        self._config = getConfig()

        self._data = data
        self._new_view_builder = NewViewBuilder(self._data)
        self._timer = timer
        self._bus = bus
        self._network = network
        self._router = stasher

        # Last successfully completed viewNo.
        # In some cases the view change process may not complete in time;
        # in that case we want to know which viewNo was the last successfully
        # completed one (last completed view_change).
        self.last_completed_view_no = self._data.view_no

        self._resend_inst_change_timer = RepeatingTimer(
            self._timer,
            self._config.NEW_VIEW_TIMEOUT,
            self._propose_view_change_not_complete_in_time,
            active=False)

        self._old_prepared = {}  # type: Dict[int, BatchID]
        self._old_preprepared = {}  # type: Dict[int, List[BatchID]]
        self._stashed_vc_msgs = {}  # type: Dict[int, int]
        self._primaries_selector = primaries_selector

        self._subscription = Subscription()
        self._subscription.subscribe(self._router, ViewChange,
                                     self.process_view_change_message)
        self._subscription.subscribe(self._router, ViewChangeAck,
                                     self.process_view_change_ack_message)
        self._subscription.subscribe(self._router, NewView,
                                     self.process_new_view_message)
        self._subscription.subscribe(self._bus, NeedViewChange,
                                     self.process_need_view_change)

    def cleanup(self):
        self._subscription.unsubscribe_all()

    def __repr__(self):
        return self._data.name

    @property
    def view_change_votes(self):
        return self._data.view_change_votes

    def process_need_view_change(self, msg: NeedViewChange):
        logger.info("{} processing {}".format(self, msg))

        # 1. calculate new viewno
        view_no = msg.view_no
        if view_no is None:
            view_no = self._data.view_no + 1

        # 2. Do cleanup before new view change starts
        self._clean_on_view_change_start()

        # 3. Update shared data
        self._data.view_no = view_no
        self._data.waiting_for_new_view = True
        old_primary = self._data.primary_name
        self._data.primary_name = None
        if not self._data.is_master:
            self._data.master_reordered_after_vc = False
            return

        # Only the master primary is selected at the beginning of a view change,
        # as we need to get a NEW_VIEW and do re-ordering on the master first.
        # Backup primaries will not be selected (and backups will not order) until
        # re-ordering of txns from the previous view on the master is finished.
        # More precisely, this is done after the first batch in the new view is committed.
        # This is because N and F may change as a result of NODE txns ordered in the last view,
        # so we need a synchronous point for updating N, F, the number of replicas and the backup primaries.
        # The beginning of a view (when the first batch in the view is ordered) is such a point.
        self._data.primary_name = generateName(
            self._primaries_selector.select_master_primary(self._data.view_no),
            self._data.inst_id)

        if old_primary and self._data.primary_name == old_primary:
            logger.info("Selected master primary is the same with the "
                        "current master primary (new_view {}). "
                        "Propose a new view {}".format(self._data.view_no,
                                                       self._data.view_no + 1))
            self._propose_view_change(Suspicions.INCORRECT_NEW_PRIMARY)

        logger.info(
            "{} started view change to view {}. Expected Master Primary: {}".
            format(self._data.name, self._data.view_no,
                   self._data.primary_name))

        # 4. Build ViewChange message
        vc = self._build_view_change_msg()

        # 5. Send ViewChangeStarted via internal bus to update other services
        logger.info("{} sending {}".format(self, vc))
        self._bus.send(ViewChangeStarted(view_no=self._data.view_no))

        # 6. Send ViewChange msg to other nodes (via external bus)
        self._network.send(vc)
        self.view_change_votes.add_view_change(vc, self._data.name)

        # 7. Unstash messages for view change
        self._router.process_all_stashed(STASH_WAITING_VIEW_CHANGE)
        self._stashed_vc_msgs.clear()

        # 8. Restart instance change timer
        self._resend_inst_change_timer.stop()
        self._resend_inst_change_timer.start()

    def _clean_on_view_change_start(self):
        self._clear_old_batches(self._old_prepared)
        self._clear_old_batches(self._old_preprepared)
        self.view_change_votes.clear()
        self._data.new_view_votes.clear()

    def _clear_old_batches(self, batches: Dict[int, Any]):
        for pp_seq_no in list(batches.keys()):
            if pp_seq_no <= self._data.stable_checkpoint:
                del batches[pp_seq_no]

    def _build_view_change_msg(self):
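        # The ViewChange message carries everything the next primary needs:
        # prepared and pre-prepared batch ids accumulated across view changes
        # (keyed by pp_seq_no and de-duplicated by pp_digest), plus our stable
        # checkpoint and the checkpoints we know about.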
        for batch_id in self._data.prepared:
            self._old_prepared[batch_id.pp_seq_no] = batch_id
        prepared = sorted(list(self._old_prepared.values()))

        for new_bid in self._data.preprepared:
            pretenders = self._old_preprepared.get(new_bid.pp_seq_no, [])
            pretenders = [
                bid for bid in pretenders if bid.pp_digest != new_bid.pp_digest
            ]
            pretenders.append(new_bid)
            self._old_preprepared[new_bid.pp_seq_no] = pretenders
        preprepared = sorted(
            [bid for bids in self._old_preprepared.values() for bid in bids])

        return ViewChange(viewNo=self._data.view_no,
                          stableCheckpoint=self._data.stable_checkpoint,
                          prepared=prepared,
                          preprepared=preprepared,
                          checkpoints=list(self._data.checkpoints))

    def process_view_change_message(self, msg: ViewChange, frm: str):
        result, reason = self._validate(msg, frm)
        if result == STASH_WAITING_VIEW_CHANGE:
            self._stashed_vc_msgs.setdefault(msg.viewNo, 0)
            self._stashed_vc_msgs[msg.viewNo] += 1
            if self._data.quorums.view_change.is_reached(self._stashed_vc_msgs[msg.viewNo]) and \
                    not self._data.waiting_for_new_view:
                self._bus.send(NodeNeedViewChange(msg.viewNo))
        if result != PROCESS:
            return result, reason

        logger.info("{} processing {} from {}".format(self, msg, frm))

        self.view_change_votes.add_view_change(msg, frm)

        vca = ViewChangeAck(viewNo=msg.viewNo,
                            name=replica_name_to_node_name(frm),
                            digest=view_change_digest(msg))
        self.view_change_votes.add_view_change_ack(
            vca, replica_name_to_node_name(self._data.name))

        if self._data.is_primary:
            self._send_new_view_if_needed()
            return PROCESS, None

        primary_node_name = replica_name_to_node_name(self._data.primary_name)
        logger.info("{} sending {}".format(self, vca))
        self._network.send(vca, [primary_node_name])

        self._finish_view_change_if_needed()
        return PROCESS, None

    def process_view_change_ack_message(self, msg: ViewChangeAck, frm: str):
        result, reason = self._validate(msg, frm)
        if result != PROCESS:
            return result, reason

        logger.info("{} processing {} from {}".format(self, msg, frm))

        if not self._data.is_primary:
            return PROCESS, None

        self.view_change_votes.add_view_change_ack(msg, frm)
        self._send_new_view_if_needed()
        return PROCESS, None

    def process_new_view_message(self, msg: NewView, frm: str):
        result, reason = self._validate(msg, frm)
        if result != PROCESS:
            return result, reason

        logger.info("{} processing {} from {}".format(self, msg, frm))
        if frm != self._data.primary_name:
            logger.warning("Received NEW_VIEW message from non-primary node. "
                           "Selected primaries may differ on other nodes")

        self._data.new_view_votes.add_new_view(msg, frm)
        self._finish_view_change_if_needed()
        return PROCESS, None

    def _validate(self, msg: Union[ViewChange, ViewChangeAck, NewView],
                  frm: str) -> (int, str):
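        """
        Run basic checks shared by all view change messages and return a
        (result, reason) pair where result is one of PROCESS, DISCARD,
        STASH_CATCH_UP or STASH_WAITING_VIEW_CHANGE.
        """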
        # TODO: Proper validation
        if not self._data.is_master:
            return DISCARD, "not master instance"

        if msg.viewNo < self._data.view_no:
            return DISCARD, "message has old view {}, current view {}".format(
                msg.viewNo, self._data.view_no)

        if msg.viewNo == self._data.view_no and not self._data.waiting_for_new_view:
            return DISCARD, "message has current view {}, but we already finished view change".format(
                msg.viewNo)

        if not self._data.is_participating:
            return STASH_CATCH_UP, "we're not participating yet"

        if msg.viewNo > self._data.view_no:
            return STASH_WAITING_VIEW_CHANGE, "message is from future view {}".format(
                msg.viewNo)

        return PROCESS, None

    def _send_new_view_if_needed(self):
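        """
        Executed on the primary of the new view: if there is a quorum of
        confirmed VIEW_CHANGE votes, a checkpoint and batches can be
        calculated from them, and that checkpoint is present locally,
        broadcast NEW_VIEW and finish the view change.
        """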
        confirmed_votes = self.view_change_votes.confirmed_votes
        if not self._data.quorums.view_change.is_reached(len(confirmed_votes)):
            logger.debug(
                "{} can not send NEW_VIEW: no quorum. Has {}, expects {} votes"
                .format(self._data.name, len(confirmed_votes),
                        self._data.quorums.view_change.value))
            return

        view_changes = [
            self.view_change_votes.get_view_change(*v) for v in confirmed_votes
        ]
        cp = self._new_view_builder.calc_checkpoint(view_changes)
        if cp is None:
            logger.info(
                "{} can not send NEW_VIEW: can not calculate Checkpoint".
                format(self._data.name))
            return

        batches = self._new_view_builder.calc_batches(cp, view_changes)
        if batches is None:
            logger.info(
                "{} can not send NEW_VIEW: can not calculate Batches".format(
                    self._data.name))
            return

        if cp not in self._data.checkpoints:
            logger.info(
                "{} can not send NEW_VIEW: does not have Checkpoint {}.".
                format(self._data.name, str(cp)))
            return

        nv = NewView(viewNo=self._data.view_no,
                     viewChanges=sorted(confirmed_votes, key=itemgetter(0)),
                     checkpoint=cp,
                     batches=batches)
        logger.info("{} sending {}".format(self, nv))
        self._network.send(nv)
        self._data.new_view_votes.add_new_view(nv, self._data.name)
        self._finish_view_change()

    def _finish_view_change_if_needed(self):
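        """
        Validate the received NEW_VIEW against the locally collected
        VIEW_CHANGE messages. Missing VIEW_CHANGEs are requested first; a
        mismatch in the checkpoint or batches means the new primary is
        considered malicious, and another view change is proposed.
        """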
        if self._data.new_view is None:
            return

        view_changes = []
        for name, vc_digest in self._data.new_view.viewChanges:
            vc = self.view_change_votes.get_view_change(name, vc_digest)
            # We don't have needed ViewChange, so we cannot validate NewView
            if vc is None:
                self._request_view_change_message((name, vc_digest))
                return
            view_changes.append(vc)

        cp = self._new_view_builder.calc_checkpoint(view_changes)
        if cp is None or cp != self._data.new_view.checkpoint:
            # New primary is malicious
            logger.info(
                "{} Received invalid NewView {} for view {}: expected checkpoint {}"
                .format(self._data.name, self._data.new_view,
                        self._data.view_no, cp))
            self._propose_view_change(Suspicions.NEW_VIEW_INVALID_CHECKPOINTS)
            return

        batches = self._new_view_builder.calc_batches(cp, view_changes)
        if batches != self._data.new_view.batches:
            # New primary is malicious
            logger.info(
                "{} Received invalid NewView {} for view {}: expected batches {}"
                .format(self._data.name, self._data.new_view,
                        self._data.view_no, batches))
            self._propose_view_change(Suspicions.NEW_VIEW_INVALID_BATCHES)
            return

        self._finish_view_change()

    def _finish_view_change(self):
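        """
        Complete the view change: update shared consensus data, stop the
        view change timeout timer and notify other services via
        NewViewAccepted.
        """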
        logger.info(
            "{} finished view change to view {}. Master Primary: {}".format(
                self._data.name, self._data.view_no, self._data.primary_name))
        # Update shared data
        self._data.waiting_for_new_view = False
        self._data.prev_view_prepare_cert = self._data.new_view.batches[-1].pp_seq_no \
            if self._data.new_view.batches else self._data.new_view.checkpoint.seqNoEnd

        # Cancel View Change timeout task
        self._resend_inst_change_timer.stop()
        # send message to other services
        self._bus.send(
            NewViewAccepted(view_no=self._data.new_view.viewNo,
                            view_changes=self._data.new_view.viewChanges,
                            checkpoint=self._data.new_view.checkpoint,
                            batches=self._data.new_view.batches))
        self.last_completed_view_no = self._data.view_no

    def _propose_view_change_not_complete_in_time(self):
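        """
        Triggered when the view change does not complete within the timeout:
        vote for another view change and, if no NEW_VIEW has been received
        yet, request it from other nodes.
        """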
        self._propose_view_change(Suspicions.INSTANCE_CHANGE_TIMEOUT)
        if self._data.new_view is None:
            self._request_new_view_message(self._data.view_no)

    def _propose_view_change(self, suspicion: Suspicion):
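        """Vote for a view change with the given suspicion."""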
        self._bus.send(VoteForViewChange(suspicion))

    def _request_new_view_message(self, view_no):
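        """Request a missing NEW_VIEW message for the given view."""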
        self._bus.send(
            MissingMessage(msg_type=NEW_VIEW,
                           key=view_no,
                           inst_id=self._data.inst_id,
                           dst=None,
                           stash_data=None))

    def _request_view_change_message(self, key):
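        """Request a missing VIEW_CHANGE identified by (node name, digest)."""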
        self._bus.send(
            MissingMessage(msg_type=VIEW_CHANGE,
                           key=key,
                           inst_id=self._data.inst_id,
                           dst=None,
                           stash_data=None))