def __init__(self, data: ConsensusSharedData, timer: TimerService, bus: InternalBus,
             network: ExternalBus, stasher: StashingRouter):
    self._config = getConfig()
    self._logger = getlogger()

    self._data = data
    self._new_view_builder = NewViewBuilder(self._data)
    self._timer = timer
    self._bus = bus
    self._network = network
    self._router = stasher
    self._new_view = None  # type: Optional[NewView]
    self._resend_inst_change_timer = RepeatingTimer(self._timer,
                                                    self._config.NEW_VIEW_TIMEOUT,
                                                    partial(self._propose_view_change,
                                                            Suspicions.INSTANCE_CHANGE_TIMEOUT.code),
                                                    active=False)

    self._router.subscribe(ViewChange, self.process_view_change_message)
    self._router.subscribe(ViewChangeAck, self.process_view_change_ack_message)
    self._router.subscribe(NewView, self.process_new_view_message)

    self._old_prepared = {}  # type: Dict[int, BatchID]
    self._old_preprepared = {}  # type: Dict[int, List[BatchID]]
    self._primaries_selector = RoundRobinPrimariesSelector()

    self._subscription = Subscription()
    self._subscription.subscribe(self._bus, NeedViewChange, self.process_need_view_change)
def test_timer_model_works_with_repeating_timers(data):
    timer = TimerModel()
    cb1 = Callback()
    cb2 = Callback()
    RepeatingTimer(timer, 5, cb1)
    RepeatingTimer(timer, 2, cb2)
    assert cb1.call_count == 0
    assert cb2.call_count == 0

    event_log = process_model(data.draw, timer, 1)
    assert cb1.call_count == 0
    assert cb2.call_count == 1
    assert len(event_log) == 1
    assert event_log[0].timestamp == 2

    event_log = process_model(data.draw, timer, 1)
    assert cb1.call_count == 0
    assert cb2.call_count == 2
    assert len(event_log) == 1
    assert event_log[0].timestamp == 4

    event_log = process_model(data.draw, timer, 1)
    assert cb1.call_count == 1
    assert cb2.call_count == 2
    assert len(event_log) == 1
    assert event_log[0].timestamp == 5

    event_log = process_model(data.draw, timer, 1)
    assert cb1.call_count == 1
    assert cb2.call_count == 3
    assert len(event_log) == 1
    assert event_log[0].timestamp == 6
def __init__(self, data: ConsensusSharedData, timer: TimerService, bus: InternalBus,
             network: ExternalBus, metrics: MetricsCollector = NullMetricsCollector()):
    self._data = data
    self._timer = timer
    self._bus = bus
    self._network = network
    self.metrics = metrics

    self._config = getConfig()

    self._primary_disconnection_time = timer.get_current_time()  # type: Optional[float]
    self._propose_view_change_timer = RepeatingTimer(timer=timer,
                                                     interval=self._config.NEW_VIEW_TIMEOUT,
                                                     callback=self._propose_view_change_if_needed,
                                                     active=False)

    self._subscription = Subscription()
    self._subscription.subscribe(network, ExternalBus.Connected, self.process_connected)
    self._subscription.subscribe(network, ExternalBus.Disconnected, self.process_disconnected)
    self._subscription.subscribe(bus, PrimarySelected, self.process_primary_selected)
    self._subscription.subscribe(bus, NodeStatusUpdated, self.process_node_status_updated)

    if self._data.is_master:
        self._schedule_primary_connection_check(delay=self._config.INITIAL_PROPOSE_VIEW_CHANGE_TIMEOUT)
def __init__(self, data: ConsensusSharedData, timer: TimerService, bus: InternalBus,
             network: ExternalBus, stasher: StashingRouter, primaries_selector: PrimariesSelector):
    self._config = getConfig()

    self._data = data
    self._new_view_builder = NewViewBuilder(self._data)
    self._timer = timer
    self._bus = bus
    self._network = network
    self._router = stasher

    # Last successful viewNo.
    # In some cases the view change process may not complete in time,
    # so we want to know which viewNo was the last successfully completed one.
    self.last_completed_view_no = self._data.view_no

    self._resend_inst_change_timer = RepeatingTimer(self._timer,
                                                    self._config.NEW_VIEW_TIMEOUT,
                                                    self._propose_view_change_not_complete_in_time,
                                                    active=False)

    self._old_prepared = {}  # type: Dict[int, BatchID]
    self._old_preprepared = {}  # type: Dict[int, List[BatchID]]
    self._stashed_vc_msgs = {}  # type: Dict[int, int]
    self._primaries_selector = primaries_selector

    self._subscription = Subscription()
    self._subscription.subscribe(self._router, ViewChange, self.process_view_change_message)
    self._subscription.subscribe(self._router, ViewChangeAck, self.process_view_change_ack_message)
    self._subscription.subscribe(self._router, NewView, self.process_new_view_message)
    self._subscription.subscribe(self._bus, NeedViewChange, self.process_need_view_change)
def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
    self.provider = provider
    self._timer = timer

    self.inBox = deque()
    self.outBox = deque()
    self.inBoxRouter = Router(
        (InstanceChange, self.process_instance_change_msg)
    )

    self.instance_changes = InstanceChangeProvider(self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
                                                   node_status_db=self.provider.node_status_db)

    self.previous_view_no = None

    # Action for _schedule instanceChange messages
    self.instance_change_action = None
    # Count of instance change rounds
    self.instance_change_rounds = 0
    # Time for view_change_starting
    self.start_view_change_ts = 0

    # Force periodic view change if enabled in config
    force_view_change_freq = self.config.ForceViewChangeFreq
    if force_view_change_freq > 0:
        RepeatingTimer(self._timer, force_view_change_freq, self.on_master_degradation)

    # Start periodic freshness check
    state_freshness_update_interval = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
    if state_freshness_update_interval > 0:
        RepeatingTimer(self._timer, state_freshness_update_interval, self.check_freshness)
def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
    self.provider = provider
    self._timer = timer

    self.pre_vc_strategy = None

    self._view_no = 0  # type: int

    self.inBox = deque()
    self.outBox = deque()
    self.inBoxRouter = Router(
        (InstanceChange, self.process_instance_change_msg),
        (ViewChangeDone, self.process_vchd_msg),
        (FutureViewChangeDone, self.process_future_view_vchd_msg))

    self.instance_changes = InstanceChangeProvider(
        self.config.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL,
        node_status_db=self.provider.node_status_db)

    # Tracks whether other nodes are indicating that this node is in a lower
    # view than they are. Keeps a map of view no to senders.
    # TODO: Consider whether, if sufficient ViewChangeDone for 2 different (and
    # higher) views are received, one view change should be interrupted in between.
    self._next_view_indications = {}

    self._view_change_in_progress = False
    self.pre_view_change_in_progress = False

    self.previous_view_no = None
    self.previous_master_primary = None

    self.set_defaults()

    self.initInsChngThrottling()

    # Action for _schedule instanceChange messages
    self.instance_change_action = None
    # Count of instance change rounds
    self.instance_change_rounds = 0
    # Time for view_change_starting
    self.start_view_change_ts = 0

    # Last successful viewNo.
    # In some cases the view change process may not complete in time,
    # so we want to know which viewNo was the last successfully completed one.
    self.last_completed_view_no = 0

    # Force periodic view change if enabled in config
    force_view_change_freq = self.config.ForceViewChangeFreq
    if force_view_change_freq > 0:
        RepeatingTimer(self._timer, force_view_change_freq, self.on_master_degradation)

    # Start periodic freshness check
    state_freshness_update_interval = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
    if state_freshness_update_interval > 0:
        RepeatingTimer(self._timer, state_freshness_update_interval, self.check_freshness)
class TestService:
    def __init__(self, interval: int, timer: TimerService):
        self.counter = 0
        self._timer = RepeatingTimer(timer, interval, self._on_timer)
        self._timer.stop()
        self._timer.start()

    def _on_timer(self):
        self.counter += 1
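# A possible way to exercise TestService above (a sketch, not an existing test).
# It assumes MockTimer and its advance() helper behave the same way as in the
# RepeatingTimer tests elsewhere in this collection.
def test_service_counts_timer_ticks():
    timer = MockTimer()
    service = TestService(interval=5, timer=timer)
    assert service.counter == 0

    # The stop()/start() pair in __init__ should not break periodic callbacks
    timer.advance(5)
    assert service.counter == 1

    # Consistent with test_repeating_timer_doesnt_repeat_too_much below:
    # a single large advance() triggers at most one callback per repeating timer
    timer.advance(10)
    assert service.counter == 2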
class FreshnessMonitorService:
    def __init__(self, data: ConsensusSharedData, timer: TimerService, bus: InternalBus,
                 network: ExternalBus, freshness_checker: FreshnessChecker,
                 get_time_for_3pc_batch: Optional[Callable[[], int]] = None,
                 metrics: MetricsCollector = NullMetricsCollector()):
        self._data = data
        self._timer = timer
        self._bus = bus
        self._network = network
        self.metrics = metrics

        self._freshness_checker = freshness_checker
        self._get_time_for_3pc_batch = get_time_for_3pc_batch if get_time_for_3pc_batch is not None \
            else get_utc_epoch

        self._config = getConfig()

        # Start periodic freshness check
        state_freshness_update_interval = self._config.STATE_FRESHNESS_UPDATE_INTERVAL
        if state_freshness_update_interval > 0:
            self._check_freshness_timer = RepeatingTimer(
                self._timer, state_freshness_update_interval, self._check_freshness)

    def cleanup(self):
        self._check_freshness_timer.stop()

    def _check_freshness(self):
        if self._is_state_fresh_enough():
            logger.debug("{} not sending instance change because found state to be fresh enough"
                         .format(self))
            return
        self._bus.send(VoteForViewChange(Suspicions.STATE_SIGS_ARE_NOT_UPDATED))

    def _is_state_fresh_enough(self) -> bool:
        # Don't trigger view change while catching up
        # TODO: INDY-2324 Do we really need to check waiting_for_new_view as well now?
        if not self._data.is_synced and not self._data.waiting_for_new_view:
            return True
        threshold = self._config.ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * self._config.STATE_FRESHNESS_UPDATE_INTERVAL
        return self._state_freshness() < threshold

    def _state_freshness(self) -> float:
        timestamps_per_ledger = self._get_ledgers_last_update_time().values()
        return self._get_time_for_3pc_batch() - min(timestamps_per_ledger)

    def _get_ledgers_last_update_time(self) -> dict:
        return self._freshness_checker.get_last_update_time()
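# Illustration only: the freshness rule above, isolated and simplified (the
# catchup check is omitted). The config values and the helper name below are
# made up for the example, not production defaults: a vote for view change is
# sent only once no ledger has been updated for at least
# ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * STATE_FRESHNESS_UPDATE_INTERVAL seconds.
STATE_FRESHNESS_UPDATE_INTERVAL = 300     # assumed value, seconds between checks
ACCEPTABLE_FRESHNESS_INTERVALS_COUNT = 3  # assumed value

def should_vote_for_view_change(now: int, last_update_times: dict) -> bool:
    threshold = ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * STATE_FRESHNESS_UPDATE_INTERVAL
    staleness = now - min(last_update_times.values())
    return staleness >= threshold

assert not should_vote_for_view_change(1000, {'pool': 900, 'domain': 500})  # staleness 500 < 900
assert should_vote_for_view_change(2000, {'pool': 1100, 'domain': 950})     # staleness 1050 >= 900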
def __init__(self, name: str, node_names: List[str], connections: PoolConnections):
    self._name = name
    self._node_names = node_names
    self._quorum = Quorums(len(node_names))
    self._ts = 0
    self._corrupted_name = None
    self._timer = TimerModel(name)
    self._connections = connections
    self._view_changer = create_view_changer(self)
    self._internal_outbox = ListEventStream()
    self.outbox = CompositeEventStream(self._internal_outbox, self._timer.outbox())
    self._check_performance_timer = RepeatingTimer(self._timer, 30, self._check_performance)
def test_view_change_while_ordering_with_real_msgs(seed):
    # 1. Setup pool
    requests_count = REQUEST_COUNT
    batches_count = requests_count // MAX_BATCH_SIZE
    random = DefaultSimRandom(seed)
    pool = setup_pool(random, requests_count)

    # 2. Send 3pc batches
    random_interval = 1000
    RepeatingTimer(pool.timer, random_interval, partial(order_requests, pool))
    for node in pool.nodes:
        pool.timer.schedule(3000,
                            partial(node._view_changer.process_need_view_change,
                                    NeedViewChange(view_no=1)))

    # 3. Make sure that view_change is completed
    for node in pool.nodes:
        pool.timer.wait_for(lambda: node._view_changer._data.view_no == 1)

    # 4. Make sure all nodes ordered all the requests
    for node in pool.nodes:
        pool.timer.wait_for(partial(check_batch_count, node, batches_count))

    # 5. Check data consistency
    check_consistency(pool)
def do_test(seed):
    # 1. Setup pool
    requests_count = REQUEST_COUNT
    batches_count = requests_count // MAX_BATCH_SIZE
    random = DefaultSimRandom(seed)
    reqs = create_requests(requests_count)
    pool = setup_pool(random)
    pool.sim_send_requests(reqs)
    initial_ledger_size = get_pools_ledger_size(pool)

    # 2. Send 3pc batches
    random_interval = 1000
    RepeatingTimer(pool.timer, random_interval, partial(order_requests, pool))
    for node in pool.nodes:
        pool.timer.schedule(3000,
                            partial(node._view_changer.process_need_view_change,
                                    NeedViewChange(view_no=1)))

    # 3. Make sure that view_change is completed
    for node in pool.nodes:
        pool.timer.wait_for(lambda: node._view_changer._data.view_no == 1)

    # 4. Make sure all nodes ordered all the requests
    for node in pool.nodes:
        pool.timer.wait_for(partial(check_batch_count, node, batches_count))
        pool.timer.wait_for(partial(check_ledger_size, node, initial_ledger_size + REQUEST_COUNT))

    # 5. Check data consistency
    pool.timer.wait_for(lambda: check_no_asserts(check_consistency, pool))
def __init__(self, provider: ViewChangerDataProvider, timer: TimerService):
    self.provider = provider
    self._timer = timer

    self.previous_view_no = None

    # Force periodic view change if enabled in config
    force_view_change_freq = self.config.ForceViewChangeFreq
    if force_view_change_freq > 0:
        RepeatingTimer(self._timer, force_view_change_freq, self.on_master_degradation)

    # Start periodic freshness check
    state_freshness_update_interval = self.config.STATE_FRESHNESS_UPDATE_INTERVAL
    if state_freshness_update_interval > 0:
        RepeatingTimer(self._timer, state_freshness_update_interval, self.check_freshness)
def test_node_txn_add_new_node(node_req_add, sim_pool, random):
    # Step 1. Prepare NODE requests and some parameters to check.
    # The count of NODE requests is random but less than the pool size.
    pool_reqs = [node_req_add(sim_pool.size + i)
                 for i in range(random.integer(1, sim_pool.size - 1))]
    domain_reqs = create_requests(DOMAIN_REQ_COUNT)
    reqs = pool_reqs + domain_reqs
    shuffle(reqs)
    sim_pool.sim_send_requests(reqs)
    current_view_no = sim_pool.view_no
    current_pool_ledger_size = get_pools_ledger_size(sim_pool, ledger_id=POOL_LEDGER_ID)
    current_domain_ledger_size = get_pools_ledger_size(sim_pool, ledger_id=DOMAIN_LEDGER_ID)
    expected_view_no = current_view_no + 1
    expected_node_reg = sim_pool.validators + [Greeks[sim_pool.size + i][0]
                                               for i in range(len(pool_reqs))]

    # Step 2. Start requests ordering
    random_interval = random.integer(10, 20) * 100
    RepeatingTimer(sim_pool.timer, random_interval, partial(order_requests, sim_pool))

    # Step 3. Initiate the view change process during request ordering
    for node in sim_pool.nodes:
        sim_pool.timer.schedule(random_interval + 1000,
                                partial(node._view_changer.process_need_view_change,
                                        NeedViewChange(view_no=1)))

    # Step 4. Wait for the view change to complete
    for node in sim_pool.nodes:
        sim_pool.timer.wait_for(lambda: node._view_changer._data.view_no == 1)
    for node in sim_pool.nodes:
        sim_pool.timer.wait_for(lambda: not node._data.waiting_for_new_view)

    # Step 5. Check parameters such as the count of ordered txns and the node_reg state.
    # For now these checks can only be run for the old nodes, not the newly added ones,
    # because the catchup process cannot be run here.
    # ToDo: extend these checks to the whole pool after INDY-2148 is implemented
    sim_pool.timer.wait_for(lambda: all([n._data.view_no == expected_view_no for n in sim_pool.nodes]))
    sim_pool.timer.wait_for(partial(check_node_reg, sim_pool, expected_node_reg))
    for node in sim_pool.nodes:
        sim_pool.timer.wait_for(partial(check_ledger_size, node,
                                        current_pool_ledger_size + len(pool_reqs),
                                        POOL_LEDGER_ID))
        sim_pool.timer.wait_for(partial(check_ledger_size, node,
                                        current_domain_ledger_size + len(domain_reqs),
                                        DOMAIN_LEDGER_ID))
class ForcedViewChangeService:
    def __init__(self, timer: TimerService, bus: InternalBus):
        self._timer = timer
        self._bus = bus

        self._config = getConfig()

        # Force periodic view change if enabled in config
        force_view_change_freq = self._config.ForceViewChangeFreq
        if force_view_change_freq > 0:
            self._force_view_change_timer = RepeatingTimer(self._timer,
                                                           force_view_change_freq,
                                                           self._force_view_change)

    def cleanup(self):
        self._force_view_change_timer.stop()

    def _force_view_change(self):
        self._bus.send(VoteForViewChange(Suspicions.DEBUG_FORCE_VIEW_CHANGE))
def test_repeating_timer_doesnt_repeat_too_much():
    timer = MockTimer()
    cb = Callback()
    RepeatingTimer(timer, 5, cb)

    timer.advance(12)
    assert cb.call_count == 1

    timer.advance(12)
    assert cb.call_count == 2
def test_repeating_timer_can_be_stopped():
    timer = MockTimer()
    cb = Callback()
    repeating_timer = RepeatingTimer(timer, 5, cb)
    assert cb.call_count == 0

    timer.advance(4)
    assert cb.call_count == 0

    timer.advance(4)
    assert cb.call_count == 1

    repeating_timer.stop()

    timer.advance(4)
    assert cb.call_count == 1

    timer.advance(4)
    assert cb.call_count == 1
def __init__(self, name, config, prepare_to_send, metrics, mt_outgoing_size, timer=None, listener=None):
    self._name = name
    self.metrics = metrics
    self.listener = listener
    self._prepare_to_send = prepare_to_send
    self._mt_outgoing_size = mt_outgoing_size
    self._config = config
    self._timer = QueueTimer() if timer is None else timer
    self._pending_client_messages = OrderedDict()
    RepeatingTimer(self._timer, self._config.RESEND_CLIENT_MSG_TIMEOUT, self._send_pending_messages)
    RepeatingTimer(self._timer, self._config.REMOVE_CLIENT_MSG_TIMEOUT, self._remove_old_messages)
def test_repeating_timer_triggers_callback_on_time():
    timer = MockTimer()
    cb = Callback()
    RepeatingTimer(timer, 5, cb)
    assert cb.call_count == 0

    timer.advance(5)
    assert cb.call_count == 1

    timer.advance(5)
    assert cb.call_count == 2
def test_multiple_repeating_timers_can_work_together():
    timer = MockTimer()
    cb1 = Callback()
    cb2 = Callback()
    RepeatingTimer(timer, 5, cb1)
    RepeatingTimer(timer, 2, cb2)

    timer.advance(3)
    assert cb1.call_count == 0
    assert cb2.call_count == 1

    timer.advance(3)
    assert cb1.call_count == 1
    assert cb2.call_count == 2

    timer.advance(3)
    assert cb1.call_count == 1
    assert cb2.call_count == 3

    timer.advance(3)
    assert cb1.call_count == 2
    assert cb2.call_count == 4
def test_node_txn_edge_primaries(pool_with_edge_primaries, node_req):
    pool = pool_with_edge_primaries
    current_view_no = pool.view_no
    expected_view_no = current_view_no + 1
    expected_primaries = RoundRobinConstantNodesPrimariesSelector.select_primaries_round_robin(
        expected_view_no, pool.validators, pool.f + 1)
    req = node_req(pool.size)
    pool.sim_send_requests([req])

    random_interval = 1000
    RepeatingTimer(pool.timer, random_interval, partial(order_requests, pool))

    pool.timer.wait_for(lambda: all([n._data.view_no == expected_view_no for n in pool.nodes]))
    pool.timer.wait_for(lambda: all([n._data.primaries == expected_primaries for n in pool.nodes]))
def test_ordering_with_real_msgs(seed):
    # 1. Setup pool
    requests_count = REQUEST_COUNT
    batches_count = requests_count // MAX_BATCH_SIZE
    random = DefaultSimRandom(seed)
    pool = setup_pool(random, requests_count)

    # 2. Send 3pc batches
    random_interval = 1000
    RepeatingTimer(pool.timer, random_interval, partial(order_requests, pool))

    # 3. Make sure all nodes ordered all the requests
    for node in pool.nodes:
        pool.timer.wait_for(partial(check_batch_count, node, batches_count), timeout=20000)

    # 4. Check data consistency
    check_consistency(pool)
def test_repeating_timer_is_started_active():
    timer = MockTimer()
    cb = Callback()
    RepeatingTimer(timer, 5, cb)
    assert cb.call_count == 0

    timer.advance(3)
    assert cb.call_count == 0

    timer.advance(3)
    assert cb.call_count == 1

    timer.advance(3)
    assert cb.call_count == 1

    timer.advance(3)
    assert cb.call_count == 2
def do_order_and_vc(pool, random, initial_view_no):
    # Step 1. Start requests ordering
    random_interval = random.float(1.0, 2.0)
    RepeatingTimer(pool.timer, random_interval, partial(order_requests, pool))

    # Step 2. Initiate the view change process during request ordering
    for node in pool.nodes:
        pool.timer.schedule(random_interval + 1.0,
                            partial(node._internal_bus.send,
                                    VoteForViewChange(Suspicions.DEBUG_FORCE_VIEW_CHANGE)))

    # Step 3. Wait for the view change to complete
    pool.timer.wait_for(lambda: all(not node._data.waiting_for_new_view and
                                    node._data.view_no > initial_view_no
                                    for node in pool.nodes),
                        max_iterations=100000)
def test_repeating_timer_can_be_started_inactive():
    timer = MockTimer()
    cb = Callback()
    RepeatingTimer(timer, 5, cb, active=False)
    assert cb.call_count == 0

    timer.advance(3)
    assert cb.call_count == 0

    timer.advance(3)
    assert cb.call_count == 0

    timer.advance(3)
    assert cb.call_count == 0

    timer.advance(3)
    assert cb.call_count == 0
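# A minimal sketch of how a repeating timer can be built on top of a one-shot
# scheduler, consistent with the tests above. This is an illustration only, not
# the real RepeatingTimer implementation: it assumes only that the timer service
# offers schedule(delay, callback) and cancel(callback), as used elsewhere in
# these snippets.
class SketchRepeatingTimer:
    def __init__(self, timer, interval, callback, active: bool = True):
        self._timer = timer
        self._interval = interval
        self._callback = callback
        self._active = False
        if active:
            self.start()

    def start(self):
        # Schedule the first firing; subsequent firings re-arm themselves.
        if not self._active and self._interval > 0:
            self._active = True
            self._timer.schedule(self._interval, self._fire)

    def stop(self):
        # Cancel the pending one-shot event so the timer stops repeating.
        if self._active:
            self._active = False
            self._timer.cancel(self._fire)

    def update_interval(self, interval):
        # The real timer also exposes update_interval (used by ConsProofService
        # below); in this sketch it simply takes effect on the next (re)start.
        self._interval = interval

    def _fire(self):
        if not self._active:
            return
        self._callback()
        # Re-arm relative to the current time only after the callback ran, so a
        # single large time jump triggers at most one call
        # (see test_repeating_timer_doesnt_repeat_too_much above).
        self._timer.schedule(self._interval, self._fire)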
def do_test(random):
    # 1. Setup pool
    requests_count = REQUEST_COUNT
    batches_count = requests_count // MAX_BATCH_SIZE
    reqs = create_requests(requests_count)
    pool = setup_pool(random)
    pool.sim_send_requests(reqs)

    # 2. Send 3pc batches
    random_interval = 1
    RepeatingTimer(pool.timer, random_interval, partial(order_requests, pool))

    # 3. Make sure all nodes ordered all the requests
    for node in pool.nodes:
        pool.timer.wait_for(partial(check_batch_count, node, batches_count))

    # 4. Check data consistency
    check_consistency(pool)
class ViewChangeService:
    def __init__(self, data: ConsensusSharedData, timer: TimerService, bus: InternalBus,
                 network: ExternalBus, stasher: StashingRouter):
        self._config = getConfig()
        self._logger = getlogger()

        self._data = data
        self._new_view_builder = NewViewBuilder(self._data)
        self._timer = timer
        self._bus = bus
        self._network = network
        self._router = stasher

        # Last successful viewNo.
        # In some cases the view change process may not complete in time,
        # so we want to know which viewNo was the last successfully completed one.
        self.last_completed_view_no = self._data.view_no

        self._resend_inst_change_timer = RepeatingTimer(self._timer,
                                                        self._config.NEW_VIEW_TIMEOUT,
                                                        self._propose_view_change_not_complete_in_time,
                                                        active=False)

        self._router.subscribe(ViewChange, self.process_view_change_message)
        self._router.subscribe(ViewChangeAck, self.process_view_change_ack_message)
        self._router.subscribe(NewView, self.process_new_view_message)

        self._old_prepared = {}  # type: Dict[int, BatchID]
        self._old_preprepared = {}  # type: Dict[int, List[BatchID]]
        self._primaries_selector = RoundRobinPrimariesSelector()

        self._subscription = Subscription()
        self._subscription.subscribe(self._bus, NeedViewChange, self.process_need_view_change)

    def __repr__(self):
        return self._data.name

    @property
    def view_change_votes(self):
        return self._data.view_change_votes

    def process_need_view_change(self, msg: NeedViewChange):
        self._logger.info("{} processing {}".format(self, msg))

        # 1. Calculate new viewNo
        view_no = msg.view_no
        if view_no is None:
            view_no = self._data.view_no + 1

        # 2. Do cleanup before the new view change starts
        self._clean_on_view_change_start()

        # 3. Update shared data
        self._data.view_no = view_no
        self._data.waiting_for_new_view = True
        self._data.primaries = self._primaries_selector.select_primaries(view_no=self._data.view_no,
                                                                         instance_count=self._data.quorums.f + 1,
                                                                         validators=self._data.validators)
        for i, primary_name in enumerate(self._data.primaries):
            self._logger.display("{} selected primary {} for instance {} (view {})"
                                 .format(PRIMARY_SELECTION_PREFIX, primary_name, i, self._data.view_no),
                                 extra={"cli": "ANNOUNCE", "tags": ["node-election"]})

        old_primary = self._data.primary_name
        self._data.primary_name = generateName(self._data.primaries[self._data.inst_id],
                                               self._data.inst_id)

        if not self._data.is_master:
            return

        if old_primary and self._data.primary_name == old_primary:
            self._logger.info("Selected master primary is the same as the "
                              "current master primary (new_view {}). "
                              "Propose a new view {}".format(self._data.view_no, self._data.view_no + 1))
            self._propose_view_change(Suspicions.INCORRECT_NEW_PRIMARY.code)

        # 4. Build ViewChange message
        vc = self._build_view_change_msg()

        # 5. Send ViewChangeStarted via internal bus to update other services
        self._logger.info("{} sending {}".format(self, vc))
        self._bus.send(ViewChangeStarted(view_no=self._data.view_no))

        # 6. Send ViewChange msg to other nodes (via external bus)
        self._network.send(vc)
        self.view_change_votes.add_view_change(vc, self._data.name)

        # 7. Unstash messages for view change
        self._router.process_all_stashed(STASH_WAITING_VIEW_CHANGE)

        # 8. Restart instance change timer
        self._resend_inst_change_timer.stop()
        self._resend_inst_change_timer.start()

    def _clean_on_view_change_start(self):
        self._clear_old_batches(self._old_prepared)
        self._clear_old_batches(self._old_preprepared)
        self.view_change_votes.clear()
        self._data.new_view = None

    def _clear_old_batches(self, batches: Dict[int, Any]):
        for pp_seq_no in list(batches.keys()):
            if pp_seq_no <= self._data.stable_checkpoint:
                del batches[pp_seq_no]

    def _build_view_change_msg(self):
        for batch_id in self._data.prepared:
            self._old_prepared[batch_id.pp_seq_no] = batch_id
        prepared = sorted(list(self._old_prepared.values()))

        for new_bid in self._data.preprepared:
            pretenders = self._old_preprepared.get(new_bid.pp_seq_no, [])
            pretenders = [bid for bid in pretenders
                          if bid.pp_digest != new_bid.pp_digest]
            pretenders.append(new_bid)
            self._old_preprepared[new_bid.pp_seq_no] = pretenders
        preprepared = sorted([bid for bids in self._old_preprepared.values() for bid in bids])

        return ViewChange(viewNo=self._data.view_no,
                          stableCheckpoint=self._data.stable_checkpoint,
                          prepared=prepared,
                          preprepared=preprepared,
                          checkpoints=list(self._data.checkpoints))

    def process_view_change_message(self, msg: ViewChange, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        self._logger.info("{} processing {} from {}".format(self, msg, frm))

        self.view_change_votes.add_view_change(msg, frm)

        vca = ViewChangeAck(viewNo=msg.viewNo,
                            name=getNodeName(frm),
                            digest=view_change_digest(msg))
        self.view_change_votes.add_view_change_ack(vca, getNodeName(self._data.name))

        if self._data.is_primary:
            self._send_new_view_if_needed()
            return PROCESS, None

        primary_node_name = getNodeName(self._data.primary_name)
        self._logger.info("{} sending {}".format(self, vca))
        self._network.send(vca, [primary_node_name])

        self._finish_view_change_if_needed()
        return PROCESS, None

    def process_view_change_ack_message(self, msg: ViewChangeAck, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        self._logger.info("{} processing {} from {}".format(self, msg, frm))

        if not self._data.is_primary:
            return PROCESS, None

        self.view_change_votes.add_view_change_ack(msg, frm)
        self._send_new_view_if_needed()
        return PROCESS, None

    def process_new_view_message(self, msg: NewView, frm: str):
        result = self._validate(msg, frm)
        if result != PROCESS:
            return result, None

        self._logger.info("{} processing {} from {}".format(self, msg, frm))

        if frm != self._data.primary_name:
            self._logger.info("{} Received NewView {} for view {} from non-primary {}; expected primary {}"
                              .format(self._data.name, msg, self._data.view_no, frm, self._data.primary_name))
            return DISCARD, "New View from non-Primary"

        self._data.new_view = msg
        self._finish_view_change_if_needed()
        return PROCESS, None

    def _validate(self, msg: Union[ViewChange, ViewChangeAck, NewView], frm: str) -> int:
        # TODO: Proper validation
        if not self._data.is_master:
            return DISCARD

        if msg.viewNo < self._data.view_no:
            return DISCARD

        if msg.viewNo == self._data.view_no and not self._data.waiting_for_new_view:
            return DISCARD

        if msg.viewNo > self._data.view_no:
            return STASH_WAITING_VIEW_CHANGE

        return PROCESS

    def _send_new_view_if_needed(self):
        confirmed_votes = self.view_change_votes.confirmed_votes
        if not self._data.quorums.view_change.is_reached(len(confirmed_votes)):
            return

        view_changes = [self.view_change_votes.get_view_change(*v) for v in confirmed_votes]
        cp = self._new_view_builder.calc_checkpoint(view_changes)
        if cp is None:
            return

        batches = self._new_view_builder.calc_batches(cp, view_changes)
        if batches is None:
            return

        if cp not in self._data.checkpoints:
            return

        nv = NewView(viewNo=self._data.view_no,
                     viewChanges=sorted(confirmed_votes, key=itemgetter(0)),
                     checkpoint=cp,
                     batches=batches)
        self._logger.info("{} sending {}".format(self, nv))
        self._network.send(nv)
        self._data.new_view = nv
        self._finish_view_change()

    def _finish_view_change_if_needed(self):
        if self._data.new_view is None:
            return

        view_changes = []
        for name, vc_digest in self._data.new_view.viewChanges:
            vc = self.view_change_votes.get_view_change(name, vc_digest)
            # We don't have the needed ViewChange, so we cannot validate NewView
            if vc is None:
                self._request_view_change_message((name, vc_digest))
                return
            view_changes.append(vc)

        cp = self._new_view_builder.calc_checkpoint(view_changes)
        if cp is None or cp != self._data.new_view.checkpoint:
            # New primary is malicious
            self._logger.info("{} Received invalid NewView {} for view {}: expected checkpoint {}"
                              .format(self._data.name, self._data.new_view, self._data.view_no, cp))
            self._propose_view_change(Suspicions.NEW_VIEW_INVALID_CHECKPOINTS.code)
            return

        batches = self._new_view_builder.calc_batches(cp, view_changes)
        if batches != self._data.new_view.batches:
            # New primary is malicious
            self._logger.info("{} Received invalid NewView {} for view {}: expected batches {}"
                              .format(self._data.name, self._data.new_view, self._data.view_no, batches))
            self._propose_view_change(Suspicions.NEW_VIEW_INVALID_BATCHES.code)
            return

        self._finish_view_change()

    def _finish_view_change(self):
        # Update shared data
        self._data.waiting_for_new_view = False
        self._data.prev_view_prepare_cert = self._data.new_view.batches[-1].pp_seq_no \
            if self._data.new_view.batches else 0

        # Cancel View Change timeout task
        self._resend_inst_change_timer.stop()

        # Send message to other services
        self._bus.send(NewViewAccepted(view_no=self._data.new_view.viewNo,
                                       view_changes=self._data.new_view.viewChanges,
                                       checkpoint=self._data.new_view.checkpoint,
                                       batches=self._data.new_view.batches))
        self.last_completed_view_no = self._data.view_no

    def _propose_view_change_not_complete_in_time(self):
        self._propose_view_change(Suspicions.INSTANCE_CHANGE_TIMEOUT.code)
        if self._data.new_view is None:
            self._request_new_view_message(self._data.view_no)

    def _propose_view_change(self, suspision_code):
        proposed_view_no = self._data.view_no
        # TODO: For some reason not incrementing view_no in most cases leads to lots of failing/flaky tests
        # if suspicion == Suspicions.INSTANCE_CHANGE_TIMEOUT or not self.view_change_in_progress:
        if suspision_code != Suspicions.STATE_SIGS_ARE_NOT_UPDATED or not self._data.waiting_for_new_view:
            proposed_view_no += 1
        self._logger.info("{} proposing a view change to {} with code {}"
                          .format(self, proposed_view_no, suspision_code))
        msg = InstanceChange(proposed_view_no, suspision_code)
        self._network.send(msg)

    def _request_new_view_message(self, view_no):
        self._bus.send(MissingMessage(msg_type=NEW_VIEW,
                                      key=view_no,
                                      inst_id=self._data.inst_id,
                                      dst=[getNodeName(self._data.primary_name)],
                                      stash_data=None))

    def _request_view_change_message(self, key):
        self._bus.send(MissingMessage(msg_type=VIEW_CHANGE,
                                      key=key,
                                      inst_id=self._data.inst_id,
                                      dst=None,
                                      stash_data=None))
class ConsProofService:
    def __init__(self, ledger_id: int, config: object, input: RxChannel, output: TxChannel,
                 timer: TimerService, metrics: MetricsCollector, provider: CatchupDataProvider):
        router = Router(input)
        router.add(LedgerStatus, self.process_ledger_status)
        router.add(ConsistencyProof, self.process_consistency_proof)

        self._ledger_id = ledger_id
        self._ledger = provider.ledger(ledger_id)
        self._config = config
        self._output = output
        self._timer = timer
        self.metrics = metrics
        self._provider = provider
        self._is_working = False

        self._quorum = Quorums(len(self._provider.all_nodes_names()))
        self._same_ledger_status = set()
        self._cons_proofs = {}
        self._already_asked_for_cons_proofs_without_timeout = False
        self._last_txn_3PC_key = {}
        self._ledger_status_timer = \
            RepeatingTimer(self._timer,
                           self._config.LedgerStatusTimeout * (len(self._provider.all_nodes_names()) - 1),
                           self._reask_for_ledger_status,
                           active=False)
        self._consistency_proof_timer = \
            RepeatingTimer(self._timer,
                           self._config.ConsistencyProofsTimeout * (len(self._provider.all_nodes_names()) - 1),
                           self._reask_for_last_consistency_proof,
                           active=False)

    def __repr__(self) -> str:
        return "{}:ConsProofService:{}".format(self._provider.node_name(), self._ledger_id)

    def start(self, request_ledger_statuses: bool):
        logger.info("{} starts".format(self))

        self._is_working = True
        self._quorum = Quorums(len(self._provider.all_nodes_names()))
        self._same_ledger_status = set()
        self._cons_proofs = {}
        self._already_asked_for_cons_proofs_without_timeout = False
        self._last_txn_3PC_key = {}

        if request_ledger_statuses:
            self._request_ledger_status_from_nodes()
            self._schedule_reask_ledger_status()

    def process_ledger_status(self, ledger_status: LedgerStatus, frm: str):
        if not self._can_process_ledger_status(ledger_status):
            return

        if self._is_ledger_old(ledger_status):
            self._process_newer_ledger_status(ledger_status, frm)
        else:
            self._process_same_ledger_status(ledger_status, frm)

    @measure_time(MetricsName.PROCESS_CONSISTENCY_PROOF_TIME)
    def process_consistency_proof(self, proof: ConsistencyProof, frm: str):
        if not self._can_process_consistency_proof(proof):
            return

        logger.info("{} received consistency proof: {} from {}".format(self, proof, frm))
        self._cons_proofs[frm] = ConsistencyProof(*proof)

        if not self._is_catchup_needed():
            self._finish_no_catchup()
            return

        self._try_asking_for_cons_proofs_without_timeout()

        if self._should_schedule_reask_cons_proofs():
            self._schedule_reask_cons_proof()

        cp = self._get_cons_proof_for_catchup()
        if not cp:
            return

        self._finish(cp)

    def _finish(self, cons_proof: Optional[ConsistencyProof] = None):
        logger.info("{} finished with consistency proof {}".format(self, cons_proof))

        till = CatchupTill(start_size=cons_proof.seqNoStart,
                           final_size=cons_proof.seqNoEnd,
                           final_hash=cons_proof.newMerkleRoot) if cons_proof else None

        nodes_ledger_sizes = {frm: proof.seqNoEnd
                              for frm, proof in self._cons_proofs.items()
                              if proof is not None}

        # Stop requesting last consistency proofs and ledger statuses.
        self._is_working = False
        self._same_ledger_status = set()
        self._cons_proofs = {}
        self._cancel_reask()

        self._output.put_nowait(LedgerCatchupStart(ledger_id=self._ledger_id,
                                                   catchup_till=till,
                                                   nodes_ledger_sizes=nodes_ledger_sizes))

    def _finish_no_catchup(self):
        root = Ledger.hashToStr(self._ledger.tree.root_hash)
        last_3pc = self._get_last_txn_3PC_key()
        if not last_3pc:
            self._finish()
            return

        view_no, pp_seq_no = last_3pc
        cons_proof = ConsistencyProof(self._ledger_id,
                                      self._ledger.size,
                                      self._ledger.size,
                                      view_no,
                                      pp_seq_no,
                                      root,
                                      root,
                                      [])
        self._finish(cons_proof)

    def _get_last_txn_3PC_key(self):
        quorumed_3PC_keys = \
            [
                most_common_element
                for most_common_element, freq in Counter(self._last_txn_3PC_key.values()).most_common()
                if self._quorum.ledger_status_last_3PC.is_reached(freq) and
                most_common_element[0] is not None and
                most_common_element[1] is not None
            ]

        if len(quorumed_3PC_keys) == 0:
            return None

        min_quorumed_3PC_key = min_3PC_key(quorumed_3PC_keys)
        return min_quorumed_3PC_key

    @measure_time(MetricsName.SEND_MESSAGE_REQ_TIME)
    def _request_ledger_status_from_nodes(self, nodes=None):
        logger.info("{} asking for ledger status of ledger {}".format(self, self._ledger_id))
        ledger_status_req = MessageReq(LEDGER_STATUS, {f.LEDGER_ID.nm: self._ledger_id})
        self._provider.send_to_nodes(ledger_status_req, nodes=nodes)

    def _can_process_ledger_status(self, ledger_status: LedgerStatus):
        if ledger_status.ledgerId != self._ledger_id:
            return False
        if not self._is_working:
            logger.info('{} ignoring {} since it is not gathering ledger statuses'
                        .format(self, ledger_status))
            return False
        return True

    def _can_process_consistency_proof(self, proof: ConsistencyProof):
        if proof.ledgerId != self._ledger_id:
            return False
        if not self._is_working:
            logger.info('{} ignoring {} since it is not gathering consistency proofs'
                        .format(self, proof))
            return False

        start = getattr(proof, f.SEQ_NO_START.nm)
        end = getattr(proof, f.SEQ_NO_END.nm)
        # TODO: Should we discard where start is older than the ledger size
        ledger_size = self._ledger.size
        if start > ledger_size:
            self._provider.discard(proof,
                                   reason="Start {} is greater than "
                                          "ledger size {}".format(start, ledger_size),
                                   logMethod=logger.warning)
            return False
        if end <= start:
            self._provider.discard(proof,
                                   reason="End {} is not greater than "
                                          "start {}".format(end, start),
                                   logMethod=logger.warning)
            return False
        return True

    def _process_newer_ledger_status(self, ledger_status: LedgerStatus, frm: str):
        # If we are behind the node which has sent the ledger status,
        # then send our ledger status to it
        # in order to get the consistency proof from it
        my_ledger_status = build_ledger_status(self._ledger_id, self._provider)
        self._provider.send_to(my_ledger_status, frm)
        self._schedule_reask_last_cons_proof()

    def _process_same_ledger_status(self, ledger_status: LedgerStatus, frm: str):
        # We are not behind the node which has sent the ledger status,
        # so our ledger is OK in comparison with that node
        # and we will not get a consistency proof from it
        self._same_ledger_status.add(frm)
        self._cons_proofs[frm] = None

        # If we are even with the node which has sent the ledger status,
        # then save the last txn master 3PC-key from it
        if self._is_ledger_same(ledger_status):
            self._last_txn_3PC_key[frm] = (ledger_status.viewNo, ledger_status.ppSeqNo)

        if not self._is_catchup_needed():
            self._finish_no_catchup()

    def _is_ledger_old(self, status: LedgerStatus):
        # Is self ledger older than the `LedgerStatus`
        return self._compare_ledger(status) < 0

    def _is_ledger_same(self, status: LedgerStatus):
        # Is self ledger the same as the `LedgerStatus`
        return self._compare_ledger(status) == 0

    def _compare_ledger(self, status: LedgerStatus):
        ledgerId = getattr(status, f.LEDGER_ID.nm)
        seqNo = getattr(status, f.TXN_SEQ_NO.nm)
        ledger = self._provider.ledger(self._ledger_id)
        logger.info("{} comparing its ledger {} of size {} with {}"
                    .format(self, ledgerId, ledger.seqNo, seqNo))
        return ledger.seqNo - seqNo

    def _is_catchup_needed(self):
        # If we gathered the quorum of ledger statuses indicating
        # that our ledger is OK, then we do not need to perform actual
        # synchronization of our ledger (however, we still need to do
        # the ledger pre-catchup and post-catchup procedures)
        if self._quorum.ledger_status.is_reached(len(self._same_ledger_status)):
            return False

        # If at least n-f-1 nodes were found to be at the same state,
        # then this node's state is good too
        if not self._quorum.consistency_proof.is_reached(len(self._cons_proofs)):
            return True
        grpd_prf, null_proofs_count = self._get_group_consistency_proofs(self._cons_proofs)
        if self._quorum.ledger_status.is_reached(null_proofs_count):
            return False

        return True

    def _try_asking_for_cons_proofs_without_timeout(self):
        if self._already_asked_for_cons_proofs_without_timeout:
            return

        if len(self._cons_proofs) < self._quorum.n - self._quorum.f - 1:
            return

        sizes = [proof.seqNoEnd if proof else self._ledger.size
                 for proof in self._cons_proofs.values()]
        sizes.sort(reverse=True)

        start = self._ledger.size
        end = sizes[self._quorum.f]  # This is actually the (f+1)-th reply
        if start >= end:
            logger.info("{} not asking for equal cons proofs because "
                        "it appears that less than f+1 nodes have more transactions".format(self))
            return

        logger.info("{} asking for consistency proof request: ledger {}, from {} to {}"
                    .format(self, self._ledger_id, start, end))
        cons_proof_req = MessageReq(CONSISTENCY_PROOF,
                                    {f.LEDGER_ID.nm: self._ledger_id,
                                     f.SEQ_NO_START.nm: start,
                                     f.SEQ_NO_END.nm: end})
        self._provider.send_to_nodes(cons_proof_req, nodes=self._provider.eligible_nodes())
        self._already_asked_for_cons_proofs_without_timeout = True

    def _should_schedule_reask_cons_proofs(self):
        return len([v for v in self._cons_proofs.values() if v is not None]) == self._quorum.f + 1

    def _get_cons_proof_for_catchup(self):
        if not self._quorum.consistency_proof.is_reached(len(self._cons_proofs)):
            return None

        grpd_prf, _ = self._get_group_consistency_proofs(self._cons_proofs)
        result = self._get_latest_reliable_proof(grpd_prf)
        if not result:
            logger.info("{} cannot start catchup since received only {} "
                        "consistency proofs but need at least {}"
                        .format(self, len(self._cons_proofs), self._quorum.consistency_proof.value))
            return None

        return ConsistencyProof(self._ledger_id, *result)

    def _get_group_consistency_proofs(self, proofs):
        logger.info("{} deciding on the basis of CPs {} and f {}".format(self, proofs, self._quorum.f))

        recvdPrf = {}
        # For the case where the other node is at the same state as this node
        nullProofs = 0
        for nodeName, proof in proofs.items():
            if proof:
                start, end = getattr(proof, f.SEQ_NO_START.nm), getattr(proof, f.SEQ_NO_END.nm)
                if (start, end) not in recvdPrf:
                    recvdPrf[(start, end)] = {}
                key = (getattr(proof, f.VIEW_NO.nm),
                       getattr(proof, f.PP_SEQ_NO.nm),
                       getattr(proof, f.OLD_MERKLE_ROOT.nm),
                       getattr(proof, f.NEW_MERKLE_ROOT.nm),
                       tuple(getattr(proof, f.HASHES.nm)))
                recvdPrf[(start, end)][key] = recvdPrf[(start, end)].get(key, 0) + 1
            else:
                logger.info("{} found proof by {} null".format(self, nodeName))
                nullProofs += 1
        return recvdPrf, nullProofs

    def _get_reliable_proofs(self, grouped_proofs):
        result = {}
        for (start, end), val in grouped_proofs.items():
            for (view_no, lastPpSeqNo, oldRoot, newRoot, hashes), count in val.items():
                if self._quorum.same_consistency_proof.is_reached(count):
                    result[(start, end)] = (view_no, lastPpSeqNo, oldRoot, newRoot, hashes)
                    # There would be only one correct proof for a range of sequence numbers
                    break
        return result

    def _get_latest_reliable_proof(self, grouped_proofs):
        reliableProofs = self._get_reliable_proofs(grouped_proofs)
        latest = None
        for (start, end), (view_no, last_pp_seq_no, oldRoot, newRoot, hashes) in reliableProofs.items():
            # TODO: Can we do something where consistency proof's start is older
            # than the current ledger's size and proof's end is larger
            # than the current ledger size.
            # Ignore if proof's start is not the same as the ledger's end
            if start != self._ledger.size:
                continue
            if latest is None or latest[1] < end:
                latest = (start, end) + (view_no, last_pp_seq_no, oldRoot, newRoot, hashes)
        return latest

    def _reask_for_ledger_status(self):
        nodes = [node_name for node_name in self._provider.all_nodes_names()
                 if node_name not in self._same_ledger_status and
                 node_name != self._provider.node_name()]
        self._request_ledger_status_from_nodes(nodes)

    def _reask_for_last_consistency_proof(self):
        ledger_status = build_ledger_status(self._ledger_id, self._provider)
        nodes = [node_name for node_name in self._provider.all_nodes_names()
                 if node_name not in self._cons_proofs and
                 node_name != self._provider.node_name()]
        self._provider.send_to_nodes(ledger_status, nodes=nodes)

    def _request_CPs_if_needed(self):
        if not self._is_working:
            return

        proofs = self._cons_proofs
        # no ConsistencyProofs have been received yet
        if not proofs:
            return

        logger.info("{} requesting consistency proofs after timeout".format(self))

        if not self._is_catchup_needed():
            return
        grouped_proofs, null_proofs_count = self._get_group_consistency_proofs(proofs)
        if self._quorum.ledger_status.is_reached(null_proofs_count) \
                or len(grouped_proofs) == 0:
            return
        if self._get_latest_reliable_proof(grouped_proofs):
            return

        with self.metrics.measure_time(MetricsName.SEND_MESSAGE_REQ_TIME):
            start, end = self._get_consistency_proof_request_params(grouped_proofs)
            logger.info("{} sending consistency proof request: ledger {}, from {} to {}"
                        .format(self, self._ledger_id, start, end))
            cons_proof_req = MessageReq(CONSISTENCY_PROOF,
                                        {f.LEDGER_ID.nm: self._ledger_id,
                                         f.SEQ_NO_START.nm: start,
                                         f.SEQ_NO_END.nm: end})
            self._provider.send_to_nodes(cons_proof_req, nodes=self._provider.eligible_nodes())

    def _get_consistency_proof_request_params(self, groupedProofs):
        # Choose the consistency proof which occurs a median number of times in
        # grouped proofs. Not choosing the highest since some malicious nodes
        # might be sending non-existent sequence numbers, and not choosing the
        # lowest since that might not be enough, as some nodes may be lagging
        # behind a lot or some malicious nodes might send low sequence numbers.
        proofs = sorted(groupedProofs.items(), key=lambda t: max(t[1].values()))
        return self._ledger.size, proofs[len(proofs) // 2][0][1]

    def _schedule_reask_cons_proof(self):
        # At least one correct node believes that this node is behind.
        # Stop requesting last consistency proofs and ledger statuses.
        self._cancel_reask()

        # Start a timer that will expire after some time; if by then
        # enough CPs have not been received, explicitly request CPs
        # from other nodes, see `_request_CPs_if_needed`
        self._timer.schedule(delay=self._config.ConsistencyProofsTimeout * (len(self._provider.all_nodes_names()) - 1),
                             callback=self._request_CPs_if_needed)

    def _schedule_reask_ledger_status(self):
        self._ledger_status_timer.update_interval(
            self._config.LedgerStatusTimeout * (len(self._provider.all_nodes_names()) - 1))
        self._ledger_status_timer.start()

    def _schedule_reask_last_cons_proof(self):
        self._consistency_proof_timer.update_interval(
            self._config.ConsistencyProofsTimeout * (len(self._provider.all_nodes_names()) - 1))
        self._consistency_proof_timer.start()

    def _cancel_reask(self):
        if self._consistency_proof_timer:
            self._consistency_proof_timer.stop()
        if self._ledger_status_timer:
            self._ledger_status_timer.stop()
        self._timer.cancel(self._request_CPs_if_needed)