def start_heartbeat(self):
    """Run the HA heartbeat loop for as long as ``self.running`` is true.

    Every round:

    * asks the storage to clear dead members, using ``self.ttl_ms``;
    * refreshes this server's own record (``self.server_uri``,
      ``self.uuid``);
    * reloads ``self.living_members`` from the storage.

    Until ``self.notified_others_after_start`` is set, the round also
    sends a ``notifyNewMember`` RPC to every living member whose URI
    differs from ``self.server_uri``. A ``grpc.RpcError`` from one peer
    is logged and does not stop the remaining peers from being notified;
    the flag is set once the loop over the members completes.

    Any other exception raised during a round is logged and the loop
    carries on. Between rounds the method waits through
    ``sleep_and_detecting_running`` with half of ``self.ttl_ms``.
    """

    def still_running():
        # Predicate handed to the sleep helper.
        return self.running

    def announce_self_to(peer_uri):
        # Register a stub for this peer before calling it.
        peer_stub = notification_service_pb2_grpc.NotificationServiceStub(
            grpc.insecure_channel(peer_uri))
        self.member_connections[peer_uri] = peer_stub
        try:
            myself = Member(
                1, self.server_uri, int(time.time_ns() / 1000000))
            request = NotifyNewMemberRequest(member=member_to_proto(myself))
            self.member_connections[peer_uri].notifyNewMember(request)
        except grpc.RpcError:
            # Best effort: a peer that cannot be reached is only logged.
            failure_message = "Notify new member to '%s' failed." % peer_uri
            logging.error(failure_message, exc_info=True)

    while self.running:
        try:
            # do heartbeat
            self.storage.clear_dead_members(self.ttl_ms)
            self.storage.update_member(self.server_uri, self.uuid)
            self.living_members = self.storage.list_living_members(
                self.ttl_ms)

            if not self.notified_others_after_start:
                for peer in self.living_members:
                    peer_uri = peer.server_uri
                    if peer_uri == self.server_uri:
                        # Do not notify ourselves.
                        continue
                    announce_self_to(peer_uri)
                self.notified_others_after_start = True
        except Exception:
            logging.error(
                "Exception thrown when send heartbeat to the HA storage.",
                exc_info=True)
        sleep_and_detecting_running(self.ttl_ms / 2, still_running)
def call_with_retry(*args, **kwargs):
    """Call the wrapped RPC, failing over to another living member on error.

    ``self`` and ``func`` are taken from the enclosing scope, which is not
    part of this block. The callable actually invoked is the ``inner_func``
    of the attribute named ``func.__name__`` on ``self.notification_stub``.

    On ``grpc.RpcError`` the error is logged and, under
    ``self.ha_change_lock``:

    * if this call is still bound to ``self.current_uri``, that URI is
      recorded as failed and a not-yet-failed URI is picked at random from
      ``self.living_members``; a new wrapped stub is installed on ``self``
      and the call is retried against it. If every living member has
      already failed, the failed set is cleared and the call waits
      ``self.retry_interval_ms`` before retrying;
    * otherwise the connection was already changed elsewhere (presumably
      by another thread), so the callable is re-read from the current
      stub instead of retrying the server that just failed.

    :param args: positional arguments forwarded to the RPC.
    :param kwargs: keyword arguments forwarded to the RPC.
    :return: whatever the RPC returns on its first successful attempt.
    :raises Exception: "HA has been disabled." if ``self.ha_running`` is
        false after a failed attempt, or "Rpc retry timeout!" once more
        than ``self.retry_timeout_ms`` milliseconds have passed since the
        first attempt. Exceptions other than ``grpc.RpcError`` raised by
        the RPC propagate unchanged.
    """
    current_func = getattr(self.notification_stub, func.__name__).inner_func
    start_time = time.time_ns() / 1000000
    failed_members = set()
    while True:
        try:
            return current_func(*args, **kwargs)
        except grpc.RpcError:
            logging.error(
                "Exception thrown when calling rpc, change the connection.",
                exc_info=True)
            with self.ha_change_lock:
                # check the current_uri to ensure thread safety
                if current_func.server_uri == self.current_uri:
                    failed_members.add(self.current_uri)
                    living_members = list(self.living_members)
                    shuffle(living_members)
                    candidates = [
                        server_uri for server_uri in living_members
                        if server_uri not in failed_members
                    ]
                    if candidates:
                        # Connect to exactly one candidate. Opening a
                        # channel to every remaining member and keeping
                        # only the last would leave the others unused
                        # and never closed.
                        next_uri = candidates[0]
                        channel = grpc.insecure_channel(next_uri)
                        self.notification_stub = self._wrap_rpcs(
                            notification_service_pb2_grpc.
                            NotificationServiceStub(channel),
                            next_uri)
                        current_func = getattr(
                            self.notification_stub,
                            current_func.__name__).inner_func
                        self.current_uri = next_uri
                    else:
                        logging.error(
                            "No available living members currently. Sleep and retry."
                        )
                        failed_members.clear()
                        # NOTE(review): this wait happens while
                        # ha_change_lock is held, as in the original.
                        sleep_and_detecting_running(
                            self.retry_interval_ms,
                            lambda: self.ha_running)
                else:
                    # The connection has already been switched; use the
                    # current stub rather than the server that just failed.
                    current_func = getattr(
                        self.notification_stub,
                        current_func.__name__).inner_func

        # break if stopped or timeout
        if not self.ha_running:
            raise Exception("HA has been disabled.")
        if time.time_ns() / 1000000 > start_time + self.retry_timeout_ms:
            raise Exception("Rpc retry timeout!")