def start_heartbeat(self):
    """Run the heartbeat loop until ``self.running`` becomes falsy.

    Each round of the loop:

    1. asks ``self.storage`` to clear members considered dead under
       ``self.ttl_ms``;
    2. refreshes this server's own record (``self.server_uri``,
       ``self.uuid``);
    3. stores the result of ``list_living_members`` on
       ``self.living_members``.

    Until ``self.notified_others_after_start`` is set, the round also opens
    an insecure gRPC channel to every other living member, keeps the stub in
    ``self.member_connections`` keyed by the member's URI, and sends it a
    ``notifyNewMember`` request describing this server. A ``grpc.RpcError``
    from one member is logged and does not stop the remaining notifications;
    the flag is set once the pass over all members completes.

    Any other exception raised inside a round is logged and swallowed so the
    loop keeps running. In that case the flag is left unset and the
    notification pass is attempted again on the next round.

    Between rounds the method waits through
    ``sleep_and_detecting_running(self.ttl_ms / 2, ...)``.
    """
    while self.running:
        try:
            self.storage.clear_dead_members(self.ttl_ms)
            self.storage.update_member(self.server_uri, self.uuid)
            self.living_members = self.storage.list_living_members(
                self.ttl_ms)
            if not self.notified_others_after_start:
                for member in self.living_members:
                    # Never notify ourselves.
                    if member.server_uri == self.server_uri:
                        continue
                    channel = grpc.insecure_channel(member.server_uri)
                    stub = notification_service_pb2_grpc.NotificationServiceStub(
                        channel)
                    # Keep the stub even if the notification below fails.
                    self.member_connections[member.server_uri] = stub
                    try:
                        # Integer floor division gives epoch milliseconds
                        # without a round-trip through float.
                        now_ms = time.time_ns() // 1000000
                        stub.notifyNewMember(
                            NotifyNewMemberRequest(
                                member=member_to_proto(
                                    Member(1, self.server_uri, now_ms))))
                    except grpc.RpcError:
                        # Best effort: one unreachable member must not
                        # prevent notifying the others.
                        logging.error("Notify new member to '%s' failed.",
                                      member.server_uri,
                                      exc_info=True)
                self.notified_others_after_start = True
        except Exception:
            # Top-level boundary of the heartbeat loop: log and keep going.
            logging.error(
                "Exception thrown when send heartbeat to the HA storage.",
                exc_info=True)
        sleep_and_detecting_running(self.ttl_ms / 2, lambda: self.running)
        def call_with_retry(*args, **kwargs):
            """Call the wrapped RPC, failing over to another member on error.

            The RPC is looked up on ``self.notification_stub`` by
            ``func.__name__`` and invoked with ``*args``/``**kwargs``; its
            result is returned as soon as one attempt succeeds.

            On ``grpc.RpcError`` the error is logged and, under
            ``self.ha_change_lock``:

            * if this call is still bound to ``self.current_uri``, that URI
              is recorded as failed and the first not-yet-failed entry of a
              shuffled copy of ``self.living_members`` becomes the new
              shared stub / ``self.current_uri``;
            * if no such entry exists, the failed set is cleared and the
              call waits ``self.retry_interval_ms`` before trying again;
            * if another caller has already switched ``self.current_uri``,
              this call simply picks up the shared stub.

            Exceptions other than ``grpc.RpcError`` propagate unchanged.

            Raises:
                Exception: ``"HA has been disabled."`` when
                    ``self.ha_running`` is falsy after a failed attempt, or
                    ``"Rpc retry timeout!"`` when more than
                    ``self.retry_timeout_ms`` milliseconds have passed since
                    the call started.
            """
            current_func = getattr(self.notification_stub,
                                   func.__name__).inner_func
            start_time = time.time_ns() / 1000000
            # URIs that already failed during this particular call.
            failed_members = set()
            while True:
                try:
                    return current_func(*args, **kwargs)
                except grpc.RpcError:
                    logging.error(
                        "Exception thrown when calling rpc, change the connection.",
                        exc_info=True)
                    with self.ha_change_lock:
                        # check the current_uri to ensure thread safety
                        if current_func.server_uri == self.current_uri:
                            living_members = list(self.living_members)
                            failed_members.add(self.current_uri)
                            shuffle(living_members)
                            for next_uri in living_members:
                                if next_uri in failed_members:
                                    continue
                                channel = grpc.insecure_channel(next_uri)
                                self.notification_stub = self._wrap_rpcs(
                                    notification_service_pb2_grpc.
                                    NotificationServiceStub(channel), next_uri)
                                current_func = getattr(
                                    self.notification_stub,
                                    current_func.__name__).inner_func
                                self.current_uri = next_uri
                                # Stop at the first usable member; carrying
                                # on would open a channel per remaining
                                # member and discard all but the last.
                                break
                            else:
                                logging.error(
                                    "No available living members currently. Sleep and retry."
                                )
                                failed_members.clear()
                                # NOTE(review): this wait happens while
                                # ha_change_lock is still held.
                                sleep_and_detecting_running(
                                    self.retry_interval_ms,
                                    lambda: self.ha_running)
                        else:
                            # Another caller already replaced the shared
                            # connection. Remember the endpoint that just
                            # failed and switch to the shared stub instead
                            # of retrying the stale function.
                            failed_members.add(current_func.server_uri)
                            current_func = getattr(
                                self.notification_stub,
                                current_func.__name__).inner_func

                # break if stopped or timeout
                if not self.ha_running:
                    raise Exception("HA has been disabled.")
                if time.time_ns() / 1000000 > start_time + self.retry_timeout_ms:
                    raise Exception("Rpc retry timeout!")