def start_heartbeat(self): while self.running: try: # do heartbeat self.storage.clear_dead_members(self.ttl_ms) self.storage.update_member(self.server_uri, self.uuid) self.living_members = self.storage.list_living_members(self.ttl_ms) if not self.notified_others_after_start: for member in self.living_members: if member.server_uri == self.server_uri: continue channel = grpc.insecure_channel(member.server_uri) self.member_connections[member.server_uri] = \ HighAvailabilityManagerStub(channel) try: self.member_connections[member.server_uri].notifyNewMember( NotifyNewMemberRequest(member=member_to_proto( Member(1, self.server_uri, int(time.time_ns() / 1000000)) ))) except grpc.RpcError: logging.error("Notify new member to '%s' failed." % member.server_uri, exc_info=True) self.notified_others_after_start = True except Exception: logging.error("Exception thrown when send heartbeat to the HA storage.", exc_info=True) sleep_and_detecting_running(self.ttl_ms / 2, lambda: self.running)
def _replace_aiflow_stubs(self, server_uri): high_availability_channel = grpc.insecure_channel(server_uri) high_availability_stub = self._wrap_aiflow_rpcs( HighAvailabilityManagerStub(high_availability_channel), server_uri, "high_availability_stub") self.high_availability_stub = high_availability_stub metadata_channel = grpc.insecure_channel(server_uri) metadata_store_stub = self._wrap_aiflow_rpcs( MetadataServiceStub(metadata_channel), server_uri, "metadata_store_stub") self.metadata_store_stub = metadata_store_stub model_center_channel = grpc.insecure_channel(server_uri) model_center_stub = self._wrap_aiflow_rpcs( ModelCenterServiceStub(model_center_channel), server_uri, "model_center_stub") self.model_center_stub = model_center_stub deploy_channel = grpc.insecure_channel(server_uri) deploy_stub = self._wrap_aiflow_rpcs(DeployServiceStub(deploy_channel), server_uri, "deploy_stub") self.deploy_stub = deploy_stub metric_channel = grpc.insecure_channel(server_uri) metric_stub = self._wrap_aiflow_rpcs(MetricServiceStub(metric_channel), server_uri, "metric_stub") self.metric_stub = metric_stub
def __init__(self, server_uri=_SERVER_URI, notification_service_uri=None, project_config: ProjectConfig = None): MetadataClient.__init__(self, server_uri) ModelCenterClient.__init__(self, server_uri) DeployClient.__init__(self, server_uri) MetricClient.__init__(self, server_uri) self.enable_ha = False self.list_member_interval_ms = 5000 self.retry_interval_ms = 1000 self.retry_timeout_ms = 10000 if project_config is not None: if server_uri is None: server_uri = project_config.get_master_uri() if notification_service_uri is None: notification_service_uri = project_config.get_notification_service_uri( ) self.enable_ha = project_config.get_enable_ha() self.list_member_interval_ms = project_config.get_list_member_interval_ms( ) self.retry_interval_ms = project_config.get_retry_interval_ms() self.retry_timeout_ms = project_config.get_retry_timeout_ms() if notification_service_uri is None: NotificationClient.__init__( self, server_uri, enable_ha=self.enable_ha, list_member_interval_ms=self.list_member_interval_ms, retry_interval_ms=self.retry_interval_ms, retry_timeout_ms=self.retry_timeout_ms) else: NotificationClient.__init__( self, notification_service_uri, enable_ha=self.enable_ha, list_member_interval_ms=self.list_member_interval_ms, retry_interval_ms=self.retry_interval_ms, retry_timeout_ms=self.retry_timeout_ms) if self.enable_ha: server_uris = server_uri.split(",") self.living_aiflow_members = [] self.current_aiflow_uri = None last_error = None for server_uri in server_uris: channel = grpc.insecure_channel(server_uri) high_availability_stub = HighAvailabilityManagerStub(channel) try: request = ListMembersRequest(timeout_seconds=0) response = high_availability_stub.listMembers(request) if response.return_code == ReturnStatus.CALL_SUCCESS: self.living_aiflow_members = [ proto_to_member(proto).server_uri for proto in response.members ] else: raise Exception(response.return_msg) self.current_aiflow_uri = server_uri self.high_availability_stub = high_availability_stub break except grpc.RpcError as e: last_error = e if self.current_aiflow_uri is None: raise Exception( "No available aiflow server uri!") from last_error self.aiflow_ha_change_lock = threading.Lock() self.aiflow_ha_running = True self._replace_aiflow_stubs(self.current_aiflow_uri) self.list_aiflow_member_thread = threading.Thread( target=self._list_aiflow_members, daemon=True) self.list_aiflow_member_thread.start()