async def _send_to_state_service(
    self,
    request: ReportStatesRequest,
) -> None:
    state_client = self._grpc_client_manager.get_client()
    try:
        response = await grpc_async_wrapper(
            state_client.ReportStates.future(
                request,
                self._mconfig.checkin_timeout,
            ),
            self._loop,
        )
        for idAndError in response.unreportedStates:
            logging.error(
                "Failed to report state for (%s,%s): %s",
                idAndError.type, idAndError.deviceID, idAndError.error,
            )
        # Report that the gateway successfully connected to the cloud
        CHECKIN_STATUS.set(1)
        self._error_handler.num_failed_state_reporting = 0
        logging.info(
            "Checkin Successful! "
            "Successfully sent states to the cloud!",
        )
    except grpc.RpcError as err:
        self._error_handler.report_to_cloud_error(err)
    finally:
        # reset timeout to config-specified + some buffer
        self.set_timeout(self._interval * 2)
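
# Context sketch: `grpc_async_wrapper` is imported from elsewhere and not
# shown in this section. It bridges a blocking grpc.Future into asyncio so
# the coroutine above can `await` the RPC. A minimal version, assuming the
# helper only needs to relay completion across threads (this is an
# illustrative sketch, not the actual Magma implementation), could look
# like this:
import asyncio


async def grpc_async_wrapper_sketch(grpc_future, loop=None):
    """Await a grpc.Future by relaying its outcome into an asyncio future."""
    loop = loop or asyncio.get_event_loop()
    async_result = loop.create_future()

    def _relay(gf):
        # gRPC invokes done-callbacks on its own thread, so hop back onto
        # the event loop thread before touching the asyncio future.
        def _set():
            err = gf.exception()
            if err is not None:
                async_result.set_exception(err)
            else:
                async_result.set_result(gf.result())
        loop.call_soon_threadsafe(_set)

    grpc_future.add_done_callback(_relay)
    return await async_result
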
def _checkin_error(self, err):
    logging.error("Checkin Error! [%s] %s", err.code(), err.details())
    CHECKIN_STATUS.set(0)
    self.num_failed_checkins += 1
    if self.num_failed_checkins == self.CHECKIN_FAIL_THRESHOLD:
        logging.info('Checkin failure threshold met, remediating...')
        if self._checkin_failure_cb is not None:
            asyncio.ensure_future(
                self._checkin_failure_cb(err.code()),
                loop=self._loop,
            )
    self._try_reuse_checkin_client(err.code())
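
# Context sketch: `_try_reuse_checkin_client` is called above but not shown.
# Given MAX_CLIENT_REUSE in __init__ and the way the success path drops the
# cached stub, it plausibly resets `self._checkin_client` when the channel
# looks dead or has been reused too many times. The exact policy and the
# `_num_client_uses` counter are assumptions for illustration.
def _try_reuse_checkin_client(self, err_code):
    self._num_client_uses = getattr(self, '_num_client_uses', 0) + 1
    if (err_code == grpc.StatusCode.UNAVAILABLE
            or self._num_client_uses > self.MAX_CLIENT_REUSE):
        # Drop the cached stub so the next checkin builds a fresh channel
        # instead of reusing a possibly-broken one.
        self._checkin_client = None
        self._num_client_uses = 0
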
def __init__(self, service, service_poller):
    super().__init__(max(5, service.mconfig.checkin_interval), service.loop)
    self._service = service
    self._service_poller = service_poller
    # Number of consecutive failed checkins before we check for an
    # outdated cert
    self.CHECKIN_FAIL_THRESHOLD = 10
    # Current number of consecutive failed checkins
    self.num_failed_checkins = 0
    self._checkin_failure_cb = None
    # cloud controller's client stub
    self._checkin_client = None
    self.MAX_CLIENT_REUSE = 60
    # Number of checkins skipped because status meta was missing
    self.num_skipped_checkins = 0
    # Initially set status to 1, otherwise on the first checkin we report
    # a failure. This is particularly an issue if magmad restarts
    # frequently.
    CHECKIN_STATUS.set(1)
    # One-time status info
    self._boot_time = psutil.boot_time()
    self._kernel_version = platform.uname().release
    cpu_info = get_cpu_info()
    if cpu_info.error is not None:
        logging.error('Failed to get cpu info: %s', cpu_info.error)
    self._cpu_info = CPUInfo(
        core_count=cpu_info.core_count,
        threads_per_core=cpu_info.threads_per_core,
        architecture=cpu_info.architecture,
        model_name=cpu_info.model_name,
    )
    self.native_gw_status_generator = GatewayStatusNative(service)
    self._kernel_versions_installed = []
    self._periodically_check_kernel_versions = \
        service.config.get('enable_kernel_version_checking', False)
    # Saved so the state manager can also send it to the state service
    self.gw_status_json = None
    # Set the initial checkin timeout to "large" since no checkins occur
    # until bootstrap succeeds.
    self.set_timeout(60 * 60 * 2)
    # Initially mark the task as alive to wait for bootstrap;
    # try_checkin() will recheck alive status.
    self.heartbeat()
    # Start the try_checkin loop
    self.start()
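
# Context sketch: the base class providing interval scheduling, set_timeout,
# heartbeat, and start is not shown in this section. A simplified periodic
# job matching how it is used above might look like the following; the
# class name and internals are assumptions, not the actual base class.
import asyncio
import time


class PeriodicJobSketch:
    def __init__(self, interval, loop):
        self._interval = interval
        self._loop = loop
        self._timeout = interval
        self._last_heartbeat = time.time()

    def set_timeout(self, timeout):
        # How long the task may go without a heartbeat before a watchdog
        # would consider it stuck.
        self._timeout = timeout

    def heartbeat(self):
        # Mark the task as alive "now".
        self._last_heartbeat = time.time()

    def alive(self):
        return (time.time() - self._last_heartbeat) < self._timeout

    def start(self):
        self._task = self._loop.create_task(self._run_loop())

    async def _run(self):
        # Subclass hook, e.g. try_checkin() in the manager above.
        raise NotImplementedError

    async def _run_loop(self):
        while True:
            await self._run()
            await asyncio.sleep(self._interval)
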
def report_to_cloud_error(self, err):
    """
    report_to_cloud_error checks whether the number of failed reporting
    attempts exceeds the threshold specified in the config. If it does,
    it triggers a bootstrap if the certificate is invalid.
    """
    logging.error(
        "Checkin Error! Failed to report states. [%s] %s",
        err.code(), err.details(),
    )
    CHECKIN_STATUS.set(0)
    self.num_failed_state_reporting += 1
    if self.num_failed_state_reporting >= self.fail_threshold:
        logging.info(
            'StateReporting (Checkin) failure threshold met, '
            'remediating...',
        )
        asyncio.ensure_future(
            self._schedule_bootstrap_if_cert_is_invalid(err.code()),
        )
    self._grpc_client_manager.on_grpc_fail(err.code())
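
# Context sketch: `GRPCClientManager` appears here only through get_client()
# and on_grpc_fail(). A minimal manager consistent with that usage could
# cache a stub and recycle it on errors that suggest a dead channel; the
# recycling condition, constructor parameters, and use of
# grpc.insecure_channel are assumptions for illustration.
import grpc


class GRPCClientManagerSketch:
    def __init__(self, target, stub_class, max_client_reuse=60):
        self._target = target
        self._stub_class = stub_class
        self._max_client_reuse = max_client_reuse
        self._client = None
        self._num_client_uses = 0

    def get_client(self):
        if self._client is None:
            self._client = self._stub_class(grpc.insecure_channel(self._target))
            self._num_client_uses = 0
        self._num_client_uses += 1
        return self._client

    def on_grpc_fail(self, err_code):
        # Recycle the cached stub when the channel looks unusable or has
        # been reused too many times.
        if (err_code == grpc.StatusCode.UNAVAILABLE
                or self._num_client_uses > self._max_client_reuse):
            self._client = None
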
def _checkin_done(self, future):
    err = future.exception()
    if err:
        logging.error("Checkin Error! [%s] %s", err.code(), err.details())
        CHECKIN_STATUS.set(0)
        self.num_failed_checkins += 1
        if self.num_failed_checkins == self.CHECKIN_FAIL_THRESHOLD:
            logging.info('Checkin failure threshold met, remediating...')
            if self._checkin_failure_cb is not None:
                self._checkin_failure_cb(err.code())
        self._try_reuse_checkin_client(err.code())
    else:
        CHECKIN_STATUS.set(1)
        self._checkin_client = None
        self.num_failed_checkins = 0
        logging.info("Checkin Successful!")
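
# Context sketch: `_checkin_done(self, future)` reads like an asyncio
# done-callback. The checkin RPC is plausibly scheduled along these lines
# (the `_checkin` coroutine name is an assumption):
#
#     future = asyncio.ensure_future(self._checkin(), loop=self._loop)
#     future.add_done_callback(self._checkin_done)
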
def __init__(
    self,
    config: Any,
    mconfig: Any,
    loop: asyncio.AbstractEventLoop,
    bootstrap_manager: BootstrapManager,
    gw_status_factory: GatewayStatusFactory,
    grpc_client_manager: GRPCClientManager,
):
    super().__init__(
        interval=max(mconfig.checkin_interval, 5),
        loop=loop,
    )
    self._loop = loop
    # Keep a pointer to mconfig since the stored config can change over
    # time
    self._mconfig = mconfig
    # Manages all metadata and methods for dealing with failures
    # (invalid gateway status, cloud reporting error).
    self._error_handler = StateReporterErrorHandler(
        loop=loop,
        config=config,
        grpc_client_manager=grpc_client_manager,
        bootstrap_manager=bootstrap_manager,
    )
    # Gateway status factory that bundles various information about this
    # gateway into an object.
    self._gw_status_factory = gw_status_factory
    # grpc_client_manager to manage grpc client recycling
    self._grpc_client_manager = grpc_client_manager
    # A dictionary of all services registered with a service303
    # interface. Maps service name to service info gathered from the
    # config.
    self._service_info_by_name = self._construct_service_info_by_name(
        config=config,
    )
    # Initially set status to 1, otherwise on the first round we report
    # a failure. This is particularly an issue if magmad restarts
    # frequently.
    CHECKIN_STATUS.set(1)
    # Set the initial timeout to "large" since no reporting can occur
    # until bootstrap succeeds.
    self.set_timeout(60 * 60 * 2)
    # Initially mark the task as alive to wait for bootstrap
    self.heartbeat()
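
# Context sketch: `_construct_service_info_by_name` is referenced above but
# not shown. Given the comment about mapping service names to info gathered
# from the config, it plausibly reduces to a comprehension over the
# configured service list; the `magma_services` key and `ServiceInfo`
# constructor are assumptions for illustration.
@staticmethod
def _construct_service_info_by_name(config):
    return {
        service_name: ServiceInfo(service_name)
        for service_name in config.get('magma_services', [])
    }
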
def _checkin_done(self):
    CHECKIN_STATUS.set(1)
    self._checkin_client = None
    self.num_failed_checkins = 0
    logging.info("Checkin Successful!")