Example #1
class PanoptesPluginScheduler(object):
    """
    This class implements methods to start and manage Celery App and Celery Beat for Plugin Schedulers

    Args:
        panoptes_context(PanoptesContext): The Panoptes Context instance that should be used by the Plugin Scheduler
        plugin_type (str): The type of the plugins the Plugin Scheduler would handle
        plugin_type_display_name (str): The display name that should be used by the Plugin Scheduler in logs and errors
        celery_config (PanoptesCeleryConfig): The Celery Config instance that should be used by the Plugin Scheduler
        lock_timeout (int): The number of seconds to wait before a lock times out and is retried
        plugin_scheduler_task (callable): The callback function that the Plugin Scheduler should call every interval
        plugin_subtype (str, optional): The subtype of the plugins the Plugin Scheduler would handle, if any

    Returns:
        None
    """
    def __init__(self,
                 panoptes_context,
                 plugin_type,
                 plugin_type_display_name,
                 celery_config,
                 lock_timeout,
                 plugin_scheduler_task,
                 plugin_subtype=None):
        assert PanoptesContextValidators.valid_panoptes_context(
            panoptes_context
        ), u'panoptes_context must be an instance of PanoptesContext'
        assert PanoptesValidators.valid_nonempty_string(
            plugin_type), u'plugin_type must be a non-empty str'
        assert PanoptesValidators.valid_nonempty_string(plugin_type_display_name), \
            u'plugin_type_display_name must be a non-empty str'
        assert PanoptesCeleryValidators.valid_celery_config(
            celery_config
        ), u'celery_config must be an instance of PanoptesCeleryConfig'
        assert PanoptesValidators.valid_nonzero_integer(
            lock_timeout), u'lock_timeout must be an int greater than zero'
        assert PanoptesValidators.valid_callback(
            plugin_scheduler_task), u'plugin_scheduler_task must be a callable'
        assert plugin_subtype is None or PanoptesValidators.valid_nonempty_string(plugin_subtype), \
            u'plugin_subtype must be None or a non-empty str'

        self._panoptes_context = panoptes_context
        self._config = self._panoptes_context.config_dict
        self._logger = self._panoptes_context.logger
        self._shutdown_plugin_scheduler = threading.Event()
        self._plugin_scheduler_celery_beat_service = None
        self._celery_config = celery_config
        self._celery = None
        self._t = None
        self._lock = None
        self._plugin_type = plugin_type
        self._plugin_subtype = plugin_subtype
        self._plugin_type_display_name = plugin_type_display_name
        self._lock_timeout = lock_timeout
        self._plugin_scheduler_task = plugin_scheduler_task
        self._tour_of_duty = PanoptesTourOfDuty(splay_percent=50)
        self._cycles_without_lock = 0

    def start(self):
        """
        This function starts the Plugin Scheduler. It installs signal handlers, acquires a distributed lock and then
        returns a Celery application instance

        The flow of the startup process is as follows:
        start -> run (starts) -> _plugin_scheduler_task_thread

        The _plugin_scheduler_task_thread runs the plugin_scheduler_task every "['plugin_type']['plugin_scan_interval']"
        seconds, which comes from the system wide configuration file

        The reason for this slightly convoluted startup is that the plugin_scheduler_task needs the Celery Beat
        Service instance object so that it can update the schedule periodically, and this instance is only available
        after the run callback is invoked by Celery Beat

        Returns:
            celery.app: The Celery App instance to be used by the scheduler

        """

        logger = self._logger

        logger.info(u'%s Plugin Scheduler main thread: OS PID: %d' %
                    (self._plugin_type_display_name, get_os_tid()))

        logger.info(
            u'"Tour Of Duty" adjusted values: tasks: %d count, time: %d seconds, memory: %dMB'
            % (self._tour_of_duty.adjusted_tasks,
               self._tour_of_duty.adjusted_seconds,
               self._tour_of_duty.adjusted_memory_growth_mb))

        logger.info(u'Setting up signal handlers')
        self._install_signal_handlers()

        client_id = get_client_id(str(const.PLUGIN_CLIENT_ID_PREFIX))
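        # Build the lock path from the scheduler lock prefix, the plugin type and the optional subtype,
        # skipping any empty components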
        lock_path = str(
            const.LOCK_PATH_DELIMITER.join([
                _f for _f in [
                    const.PLUGIN_SCHEDULER_LOCK_PATH, self._plugin_type,
                    self._plugin_subtype,
                    str('lock')
                ] if _f
            ]))

        logger.info(
            u'Creating lock object for %s Plugin Scheduler under lock path "%s"'
            % (self._plugin_type, lock_path))
        try:
            self._lock = PanoptesLock(context=self._panoptes_context,
                                      path=lock_path,
                                      timeout=self._lock_timeout,
                                      retries=0,
                                      identifier=client_id)
        except Exception as e:
            sys.exit(u'Failed to create lock object: %s' % repr(e))

        if self._lock.locked:
            logger.info(u'Starting Celery Beat Service')
            try:
                self._celery = PanoptesCeleryInstance(
                    self._panoptes_context, self._celery_config).celery
                self._celery.conf.update(
                    CELERYBEAT_MAX_LOOP_INTERVAL=self._config[
                        self._plugin_type][u'celerybeat_max_loop_interval'])
            except Exception:
                logger.exception(u'Error trying to start Celery Beat Service')

        return self._celery

    def run(self, sender=None, args=None, **kwargs):
        """
        This function is called after the Celery Beat Service has finished initialization.
        The function (re)installs the signal handlers, since they are overwritten by the Celery Beat Service.
        It stores the reference to the Celery Beat Service instance and starts the Plugin Scheduler thread

        Args:
            sender (celery.beat.Service): The Celery Beat Service instance
            args: Variable length argument list
            **kwargs: Arbitrary keyword arguments

        Returns:
            None
        """
        logger = self._logger
        logger.info(
            u'Reinstalling signal handlers after Celery Beat Service setup')
        self._install_signal_handlers()
        self._plugin_scheduler_celery_beat_service = sender
        self._t = threading.Thread(target=self._plugin_scheduler_task_thread)
        self._t.start()

    def _plugin_scheduler_task_thread(self):
        """
        This function is the entry point of the Plugin Scheduler thread. It checks if the Plugin Scheduler is in
        shutdown mode and, if not, calls the plugin_scheduler_task function every 'plugin_scan_interval'
        seconds

        Returns:
            None

        """
        logger = self._logger
        logger.info(u'%s Plugin Scheduler Task thread: OS PID: %d' %
                    (self._plugin_type_display_name, get_os_tid()))
        while not self._shutdown_plugin_scheduler.is_set():
            if self._lock.locked:
                self._cycles_without_lock = 0
                try:
                    self._plugin_scheduler_task(
                        self._plugin_scheduler_celery_beat_service,
                        self._tour_of_duty.iterations)
                    self._tour_of_duty.increment_task_count()
                except Exception:
                    logger.exception(
                        u'Error trying to execute plugin scheduler task')
            else:
                self._cycles_without_lock += 1
                if self._cycles_without_lock < const.PLUGIN_SCHEDULER_MAX_CYCLES_WITHOUT_LOCK:
                    logger.warning(
                        u'%s Plugin Scheduler lock not held, skipping scheduling cycle'
                        % self._plugin_type_display_name)
                else:
                    logger.warning(
                        u'%s Plugin Scheduler lock not held for %d cycles, shutting down'
                        % (self._plugin_type_display_name,
                           self._cycles_without_lock))
                    self._shutdown()

            if self._tour_of_duty.completed:
                why = []
                why += [u'tasks'] if self._tour_of_duty.tasks_completed else []
                why += [u'time'] if self._tour_of_duty.time_completed else []
                why += [
                    u'memory growth'
                ] if self._tour_of_duty.memory_growth_completed else []

                logger.info(
                    u'%s Plugin Scheduler "Tour Of Duty" completed because of %s going to shutdown'
                    % (self._plugin_type_display_name, ', '.join(why)))
                self._shutdown()
            self._shutdown_plugin_scheduler.wait(
                self._config[self._plugin_type][u'plugin_scan_interval'])

        logger.critical(u'%s Plugin Scheduler Task thread shutdown' %
                        self._plugin_type_display_name)

    def _shutdown(self):
        """
        The main shutdown method, which handles two scenarios
        * The Plugin Scheduler thread is alive: sets an event to shutdown the thread
        * The Plugin Scheduler thread is not alive: this can happen if we have not been able to acquire the lock or
        if the Plugin Scheduler thread quits unexpectedly. In this case, this handler proceeds to call the function
        to shutdown other services (e.g. Celery Beat Service)

        Returns:
            None
        """
        logger = self._logger

        if self._shutdown_plugin_scheduler.is_set():
            print(
                u'%s Plugin Scheduler already in the process of shutdown, ignoring redundant call'
                % self._plugin_type_display_name)
            return

        shutdown_interval = int(
            int(self._config[self._plugin_type][u'plugin_scan_interval']) * 2)
        logger.info(u'Shutdown/restart requested - may take up to %s seconds' %
                    shutdown_interval)

        logger.info(
            u'Signalling for %s Plugin Scheduler Task Thread to shutdown' %
            self._plugin_type_display_name)
        self._shutdown_plugin_scheduler.set()

        if self._t != threading.current_thread():
            if (self._t is not None) and (self._t.is_alive()):
                self._t.join()

            if (self._t is None) or (not self._t.is_alive()):
                logger.info(
                    u'%s Plugin Scheduler Task Thread is not active - shutting down other services'
                    % self._plugin_type_display_name)
        else:
            logger.info(
                u'%s Plugin Scheduler shutdown called from plugin scheduler task thread'
                % self._plugin_type_display_name)

        if self._plugin_scheduler_celery_beat_service:
            logger.info(u'Shutting down Celery Beat Service')
            self._plugin_scheduler_celery_beat_service.stop()

        if self._lock:
            logger.info(u'Releasing lock')
            self._lock.release()

        logger.info(u'Plugin Scheduler shutdown complete')
        sys.exit()

    def _install_signal_handlers(self):
        """
        Installs signal handlers for SIGTERM, SIGINT and SIGHUP

        Returns:
            None
        """
        signal.signal(signal.SIGTERM, self._signal_handler)
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGHUP, self._signal_handler)

    def _signal_handler(self, signal_number, _):
        """
        Signal handler - wraps the _shutdown method with some checks

        Args:
            signal_number (int): The received signal number
            _ (frame): Current stack frame object

        Returns:
            None
        """
        print(u'Caught %s, shutting down %s Plugin Scheduler' %
              (const.SIGNALS_TO_NAMES_DICT[signal_number],
               self._plugin_type_display_name))

        # If the Plugin Scheduler is already in the process of shutdown, then do nothing - prevents issues
        # with re-entrancy
        if self._shutdown_plugin_scheduler.is_set():
            print(
                u'%s Plugin Scheduler already in the process of shutdown, ignoring %s'
                % (self._plugin_type_display_name,
                   const.SIGNALS_TO_NAMES_DICT[signal_number]))
            return

        self._shutdown()
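
Usage note (a minimal, hedged sketch, not part of the example above): one plausible way to wire the scheduler up is shown below. The names my_panoptes_context, my_celery_config and the callback schedule_plugins_task are illustrative assumptions, as is the use of Celery's beat_init signal to hand run() the Beat Service instance; the real framework may register this callback differently, and starting the actual Celery Beat process with the returned app is assumed to happen elsewhere.

from celery.signals import beat_init

def schedule_plugins_task(celery_beat_service, iteration_count):
    # Hypothetical callback: recompute the plugin schedule and update the
    # schedule held by celery_beat_service on every scan interval.
    pass

scheduler = PanoptesPluginScheduler(
    panoptes_context=my_panoptes_context,    # assumed PanoptesContext instance
    plugin_type=u'polling',
    plugin_type_display_name=u'Polling',
    celery_config=my_celery_config,          # assumed PanoptesCeleryConfig instance
    lock_timeout=5,
    plugin_scheduler_task=schedule_plugins_task)

beat_init.connect(scheduler.run)             # Celery Beat passes its beat.Service instance as sender
celery_app = scheduler.start()               # acquires the distributed lock and returns the Celery app
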
Example #2
    def test_panoptes_lock(self):
        global kazoo_client
        kazoo_client = self.client

        connected = threading.Event()
        lost_connection = threading.Event()

        def _listener(state):
            if state == KazooState.CONNECTED:
                connected.set()
            else:
                lost_connection.set()

        self.client.add_listener(_listener)

        panoptes_context = PanoptesContext(
            config_file='tests/test_panoptes_config.ini',
            create_zookeeper_client=True)

        # Test that bad parameters fail
        with self.assertRaises(AssertionError):
            PanoptesLock(context=None,
                         path='/lock',
                         timeout=5,
                         retries=0,
                         identifier='test')
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path='/lock',
                         timeout=5,
                         retries=None,
                         identifier='test')
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path='/lock',
                         timeout=5,
                         retries=-1,
                         identifier='test')
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path='/lock',
                         timeout=5,
                         retries='1',
                         identifier='test')
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path=None,
                         timeout=5,
                         retries=0,
                         identifier='test')
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path='lock',
                         timeout=5,
                         retries=0,
                         identifier='test')
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path='/lock',
                         timeout=None,
                         retries=0,
                         identifier='test')
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path='/lock',
                         timeout=-1,
                         retries=0,
                         identifier='test')
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path='/lock',
                         timeout=5,
                         retries=0,
                         identifier=None)
        with self.assertRaises(AssertionError):
            PanoptesLock(context=panoptes_context,
                         path='/lock',
                         timeout=5,
                         retries=0,
                         identifier=1)

        # Acquire lock with unlimited retries
        lock = PanoptesLock(context=panoptes_context,
                            path='/lock',
                            timeout=5,
                            retries=0,
                            identifier='test')
        self.assertEquals(lock.locked, True)

        # Release the lock
        lock.release()
        self.assertEquals(lock.locked, False)

        # Acquire lock with only one retry
        lock1 = PanoptesLock(context=panoptes_context,
                             path='/lock',
                             timeout=5,
                             identifier='test')
        self.assertEquals(lock1.locked, True)

        # Try to acquire an already-acquired lock - this should fail
        lock2 = PanoptesLock(context=panoptes_context,
                             path='/lock',
                             timeout=5,
                             identifier='test')
        self.assertEquals(lock2.locked, False)
        lock1.release()
        self.assertEquals(lock1.locked, False)
        lock2.release()

        # Acquire the lock, lose the connection (and with it the lock), then reacquire it on reconnection
        lock = PanoptesLock(context=panoptes_context,
                            path='/lock',
                            timeout=5,
                            identifier='test')
        self.assertEquals(lock.locked, True)
        self.lose_connection(self.make_event)
        # Block till the client disconnects - or 30 seconds pass
        lost_connection.wait(30)
        # Verify that the client actually lost the connection
        self.assertEquals(lost_connection.is_set(), True)
        # Give it time to cleanup the lock
        # TODO: There is a timing issue here - if we sleep too long before checking the state of the lock, we
        # might get reconnected and reacquire the lock, which is why we check that connected is NOT set before the
        # assert
        sleep(0.1)
        # The lock should not be held after we lose the connection
        if not connected.is_set():
            self.assertEquals(lock.locked, False)
        # Block till the client reconnects - or 30 seconds pass
        connected.wait(30)
        # Verify that the client actually reconnected
        self.assertEquals(connected.is_set(), True)
        # Give it time to reacquire the lock
        sleep(3)
        self.assertEquals(lock.locked, True)
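
A minimal sketch of the lock lifecycle exercised by the test above, outside the test harness. It only relies on the constructor arguments, the locked property and release() shown in the test; my_panoptes_context is an assumed, pre-built PanoptesContext with a ZooKeeper client, and the path and identifier values are illustrative.

# Compete for a well-known lock path; per the test above, retries=0 means retry until acquired
lock = PanoptesLock(context=my_panoptes_context,
                    path='/my_service/lock',
                    timeout=5,
                    retries=0,
                    identifier='scheduler-host-1')

if lock.locked:
    try:
        pass  # do the work that must only run on the current lock holder
    finally:
        lock.release()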