Example #1
 async def _send_to_state_service(
     self,
     request: ReportStatesRequest,
 ) -> None:
     state_client = self._grpc_client_manager.get_client()
     try:
         response = await grpc_async_wrapper(
             state_client.ReportStates.future(
                 request,
                 self._mconfig.checkin_timeout,
             ),
             self._loop,
         )
         for idAndError in response.unreportedStates:
             logging.error(
                 "Failed to report state for (%s,%s): %s",
                 idAndError.type,
                 idAndError.deviceID,
                 idAndError.error,
             )
         # Report that the gateway successfully connected to the cloud
         CHECKIN_STATUS.set(1)
         self._error_handler.num_failed_state_reporting = 0
         logging.info(
             "Checkin Successful! "
             "Successfully sent states to the cloud!", )
     except grpc.RpcError as err:
         self._error_handler.report_to_cloud_error(err)
     finally:
         # reset timeout to config-specified + some buffer
         self.set_timeout(self._interval * 2)
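
Example #1 awaits a blocking gRPC call through a helper named grpc_async_wrapper, which is not shown here. Below is a minimal sketch of what such a bridge typically does, assuming only the standard gRPC Future and asyncio event-loop APIs; it is an illustration, not the project's implementation.

    def grpc_async_wrapper(grpc_future, loop):
        # Sketch: forward completion of a gRPC future into an asyncio future
        # so the RPC can be awaited on the event loop.
        asyncio_future = loop.create_future()

        def _on_done(fut):
            # gRPC runs this callback on its own thread, so hand the result
            # (or the grpc.RpcError) back to the loop thread safely.
            try:
                result = fut.result()
                loop.call_soon_threadsafe(asyncio_future.set_result, result)
            except Exception as e:
                loop.call_soon_threadsafe(asyncio_future.set_exception, e)

        grpc_future.add_done_callback(_on_done)
        return asyncio_future
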
Example #2
 def _checkin_error(self, err):
     logging.error("Checkin Error! [%s] %s", err.code(), err.details())
     CHECKIN_STATUS.set(0)
     self.num_failed_checkins += 1
     if self.num_failed_checkins == self.CHECKIN_FAIL_THRESHOLD:
         logging.info('Checkin failure threshold met, remediating...')
         if self._checkin_failure_cb is not None:
             asyncio.ensure_future(self._checkin_failure_cb(err.code()),
                                   loop=self._loop)
     self._try_reuse_checkin_client(err.code())
Example #3
    def __init__(self, service, service_poller):
        super().__init__(max(5, service.mconfig.checkin_interval),
                         service.loop)

        self._service = service
        self._service_poller = service_poller

        # Number of consecutive failed checkins before we check for an outdated
        # cert
        self.CHECKIN_FAIL_THRESHOLD = 10
        # Current number of consecutive failed checkins
        self.num_failed_checkins = 0
        self._checkin_failure_cb = None

        # cloud controller's client stub
        self._checkin_client = None
        self.MAX_CLIENT_REUSE = 60

        # Number of checkins skipped due to missing status meta
        self.num_skipped_checkins = 0

        # Initially set status to 1, otherwise on the first checkin we report
        # a failure. This is particularly an issue if magmad restarts frequently.
        CHECKIN_STATUS.set(1)

        # One time status info
        self._boot_time = psutil.boot_time()
        self._kernel_version = platform.uname().release
        cpu_info = get_cpu_info()
        if cpu_info.error is not None:
            logging.error('Failed to get cpu info: %s', cpu_info.error)
        self._cpu_info = CPUInfo(
            core_count=cpu_info.core_count,
            threads_per_core=cpu_info.threads_per_core,
            architecture=cpu_info.architecture,
            model_name=cpu_info.model_name,
        )
        self.native_gw_status_generator = GatewayStatusNative(service)

        self._kernel_versions_installed = []
        self._periodically_check_kernel_versions = \
            service.config.get('enable_kernel_version_checking', False)
        # Save for the state manager to also send to the state service
        self.gw_status_json = None
        # set initial checkin timeout to "large" since no checkins occur until
        #   bootstrap succeeds.
        self.set_timeout(60 * 60 * 2)
        # initially set task as alive to wait for bootstrap, where try_checkin()
        #   will recheck alive status
        self.heartbeat()

        # Start try_checkin loop
        self.start()
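
CPUInfo and get_cpu_info are not defined in this snippet. Judging from the fields used above, CPUInfo is presumably a plain value container along these lines (a sketch, not the source definition):

    from typing import NamedTuple

    class CPUInfo(NamedTuple):
        core_count: int
        threads_per_core: int
        architecture: str
        model_name: str
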
Example #4
 def report_to_cloud_error(self, err):
     """
     report_to_cloud_error checks whether the number of failed state reports
     exceeds the threshold specified in the config. If it does, it schedules
     a bootstrap when the certificate is invalid.
     """
     logging.error("Checkin Error! Failed to report states. [%s] %s",
                   err.code(), err.details())
     CHECKIN_STATUS.set(0)
     self.num_failed_state_reporting += 1
     if self.num_failed_state_reporting >= self.fail_threshold:
         logging.info('StateReporting (Checkin) failure threshold met, '
                      'remediating...')
         asyncio.ensure_future(
             self._schedule_bootstrap_if_cert_is_invalid(err.code()))
     self._grpc_client_manager.on_grpc_fail(err.code())
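
The remediation coroutine _schedule_bootstrap_if_cert_is_invalid is not shown in these examples. Here is a hedged sketch of what the docstring describes, assuming that a PERMISSION_DENIED status signals a rejected gateway certificate and that the bootstrap manager exposes a method to force a new bootstrap (schedule_bootstrap_now is an assumed name):

    import logging

    import grpc

    async def _schedule_bootstrap_if_cert_is_invalid(self, err_code):
        # Sketch only: if the cloud rejects the gateway for authorization
        # reasons, assume the certificate is stale and re-bootstrap.
        if err_code == grpc.StatusCode.PERMISSION_DENIED:
            logging.info('Certificate may be invalid, scheduling bootstrap')
            await self._bootstrap_manager.schedule_bootstrap_now()  # assumed API
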
Example #5
 def _checkin_done(self, future):
     err = future.exception()
     if err:
         logging.error("Checkin Error! [%s] %s", err.code(), err.details())
         CHECKIN_STATUS.set(0)
         self.num_failed_checkins += 1
         if self.num_failed_checkins == self.CHECKIN_FAIL_THRESHOLD:
             logging.info('Checkin failure threshold met, remediating...')
             if self._checkin_failure_cb is not None:
                 self._checkin_failure_cb(err.code())
         self._try_reuse_checkin_client(err.code())
     else:
         CHECKIN_STATUS.set(1)
         self._checkin_client = None
         self.num_failed_checkins = 0
         logging.info("Checkin Successful!")
Example #6
    def __init__(
        self,
        config: Any,
        mconfig: Any,
        loop: asyncio.AbstractEventLoop,
        bootstrap_manager: BootstrapManager,
        gw_status_factory: GatewayStatusFactory,
        grpc_client_manager: GRPCClientManager,
    ):
        super().__init__(
            interval=max(mconfig.checkin_interval, 5),
            loop=loop,
        )
        self._loop = loop
        # keep a pointer to mconfig since the stored config can change over time
        self._mconfig = mconfig

        # Manages all metadata and methods on dealing with failures.
        # (invalid gateway status, cloud reporting error)
        self._error_handler = StateReporterErrorHandler(
            loop=loop,
            config=config,
            grpc_client_manager=grpc_client_manager,
            bootstrap_manager=bootstrap_manager,
        )

        # gateway status factory to bundle various information about this
        # gateway into an object.
        self._gw_status_factory = gw_status_factory

        # grpc_client_manager to manage grpc client recycling
        self._grpc_client_manager = grpc_client_manager

        # A dictionary of all services registered with a service303 interface.
        # Maps service name to the service info gathered from the config.
        self._service_info_by_name = self._construct_service_info_by_name(
            config=config, )

        # Initially set status to 1, otherwise on the first round we report a
        # failure. This is particularly an issue if magmad restarts frequently.
        CHECKIN_STATUS.set(1)
        # set initial timeout to "large" since no reporting can occur until
        # bootstrap succeeds.
        self.set_timeout(60 * 60 * 2)
        # initially set task as alive to wait for bootstrap
        self.heartbeat()
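
For context, here is a sketch of how a reporter built around this constructor could be wired up and started by the surrounding daemon. The StateReporter class name and the service, bootstrap_manager, gw_status_factory, and grpc_client_manager variables are assumptions, not taken from the source.

    # Sketch: construct the reporter with its collaborators and start its
    # periodic job; reporting begins once bootstrap succeeds and the large
    # initial timeout is reset.
    state_reporter = StateReporter(
        config=service.config,
        mconfig=service.mconfig,
        loop=service.loop,
        bootstrap_manager=bootstrap_manager,
        gw_status_factory=gw_status_factory,
        grpc_client_manager=grpc_client_manager,
    )
    state_reporter.start()
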
Example #7
 def _checkin_done(self):
     CHECKIN_STATUS.set(1)
     self._checkin_client = None
     self.num_failed_checkins = 0
     logging.info("Checkin Successful!")