class TimeWindow(SimplePredicate):
    """
    Predicate for comparing the current time to begin/end times.
    It sets the 'met' value based on begin < current_time < end and, when
    weekdays are given, on today's weekday being one of them.
    """
    def __init__(self, comp_name, begin=None, end=None, weekdays=None,
                 operational=False, parent=None, interval=5):
        """
        :type comp_name: str
        :type begin: str or None
        :type end: str or None
        :type weekdays: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, operational=operational,
                                 parent=parent)
        self.begin = self.get_datetime_object(begin)
        self.end = self.get_datetime_object(end)
        self.day_range = self.parse_range(weekdays)
        self.interval = interval
        self._log = logging.getLogger(
            'sent.{0}.pred.timewin'.format(comp_name))
        self._log.info('Registered {0}'.format(self))
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

    def weekday(self):
        """
        :rtype: int
            0=Monday, 1=Tuesday, etc.
        """
        return datetime.date.today().weekday()

    def start(self):
        """
        Start the polling thread unless it was already started.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Signal the polling thread to finish and wait for it to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """
        Thread target: re-evaluate the window every `interval` seconds until
        the operate flag is cleared by stop().
        """
        # `== True` is intentional: _operate is a ThreadSafeObject wrapper,
        # not a plain bool.
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done comparing times.')

    def _process_met(self):
        """
        Evaluate every configured condition (begin, end, weekdays) and set
        'met' to True only if all of them hold. When nothing is configured
        'met' is set to False.
        """
        results = []
        if self.begin is not None:
            compare_to_begin = self._get_comparison(self.begin)
            results.append(self.begin < compare_to_begin)
        if self.end is not None:
            compare_to_end = self._get_comparison(self.end)
            results.append(compare_to_end < self.end)

        if self.day_range is not None:
            results.append(self.weekday() in self.day_range)

        if not results:
            results.append(False)

        self.set_met(all(results))  # every comparison returned True

    def _get_comparison(self, obj):
        """
        Return "now" in the same type as `obj` so the two can be compared.

        :type obj: datetime.datetime or datetime.time
        :rtype: datetime.datetime or datetime.time or None
        """
        if isinstance(obj, datetime.datetime):
            return datetime.datetime.now()
        elif isinstance(obj, datetime.time):
            return datetime.datetime.now().time()

    @staticmethod
    def get_datetime_object(data):
        """
        Create datetime object from string value.
        A string with a date part ("YYYY-MM-DD HH:MM[:SS]") yields a
        datetime.datetime; a time-only string ("HH:MM[:SS]") yields a
        datetime.time. Anything that cannot be parsed is logged and yields
        None.

        :type data: str or None
        :rtype: datetime.datetime or datetime.time or None
        """
        if data is None:
            return None

        log = logging.getLogger('PredicateTime')
        dt_object = None
        try:
            # Parsed inside the try so that non-string data (TypeError from
            # the regex search) is logged instead of escaping to the caller.
            dt_dict = TimeWindow.create_datetime_dict(data)

            # All of year, month and day are not None
            if all(dt_dict.get(i) is not None
                   for i in ('year', 'month', 'day')):
                dt_object = datetime.datetime(year=dt_dict['year'],
                                              month=dt_dict['month'],
                                              day=dt_dict['day'],
                                              hour=dt_dict['hour'],
                                              minute=dt_dict['minute'],
                                              microsecond=0)
                if dt_dict.get('second') is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])
            # both hour and minute are not None
            elif all(dt_dict.get(i) is not None for i in ('hour', 'minute')):
                dt_object = datetime.time(hour=dt_dict['hour'],
                                          minute=dt_dict['minute'],
                                          microsecond=0)
                if dt_dict.get('second') is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])
            else:
                log.error(
                    'data "{0}" did not match regex. This will result in the '
                    'paramter returning as None. The predicate will never be '
                    'met for this parameter. '.format(data))

        except (ValueError, TypeError) as ex:
            log.error('Problem with parsing data "{0}": {1}'.format(data, ex))

        # Returned after the try block rather than from a `finally` clause,
        # which would silently discard any in-flight exception.
        return dt_object

    @staticmethod
    def create_datetime_dict(datetime_string):
        """
        Split a "[YYYY-MM-DD ]H[H]:MM[:SS]" string into its integer parts.
        Parts absent from the string map to None; a string that does not
        match at all yields an empty dict.

        :type datetime_string: str
        :rtype: dict
        """
        datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?")

        regex_dict = dict()
        match = re.search(datetime_regex, datetime_string)
        if match:
            # convert all matched values to integers, leave the rest as None
            for key in ('year', 'month', 'day', 'hour', 'minute', 'second'):
                value = match.group(key)
                regex_dict[key] = int(value) if value is not None else None

        logging.getLogger('PredicateTime').debug(
            'datetime_dict returning {0}'.format(regex_dict))

        return regex_dict

    @staticmethod
    def parse_range(astr):
        """
        https://www.darklaunch.com/2012/11/05/python-parse-range-and-parse-group-range
        Return a range list given a string such as "0-2,4".
        As this is for weekdays, only return 0-6.
        An unparsable string is logged and yields an empty list.

        :type astr: str or None
        :rtype: list or None
        """
        if astr is None:
            return None

        try:
            result = set()
            for part in astr.split(','):
                x = part.split('-')
                result.update(range(int(x[0]), int(x[-1]) + 1))
            # only accept 0-6
            return [i for i in sorted(result) if 0 <= i <= 6]
        except ValueError:
            logging.warning('Error parsing day range. Returning [].')
            return []

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, begin="{3}", '
                'end="{4}", days={5}, started={6}, operational={7}, met={8})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.begin,
                        self.end,
                        self.day_range,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        return all([
            type(self) == type(other),
            self.begin == getattr(other, 'begin', None),
            self.end == getattr(other, 'end', None),
            self.day_range == getattr(other, 'day_range', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.begin != getattr(other, 'begin', None),
            self.end != getattr(other, 'end', None),
            self.day_range != getattr(other, 'day_range', None),
            self.interval != getattr(other, 'interval', None)
        ])
class ZookeeperGoodUntilTime(SimplePredicate):
    """
    Predicate that is met while the current time lies between the 'start'
    and 'stop' values read from a JSON document stored in a Zookeeper node.
    """
    def __init__(self, comp_name, zkclient, nodepath,
                 operational=False, parent=None, interval=5):
        """
        :type comp_name: str
        :type zkclient: kazoo.client.KazooClient
        :type nodepath: str
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, operational=operational,
                                 parent=parent)
        self.node = nodepath
        self.zkclient = zkclient
        self.interval = interval
        self._start = None
        self._stop = None
        self._log = logging.getLogger('sent.{0}.pred.gut'.format(comp_name))
        self._log.info('Registered {0}'.format(self))
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

        # Raw strings: the pattern is full of regex escapes (\d, \s).
        self._datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?"
        )

    @property
    def current_time(self):
        """
        :rtype: datetime.time
        """
        return datetime.datetime.now().time()

    @property
    def current_datetime(self):
        """
        :rtype: datetime.datetime
        """
        return datetime.datetime.now()

    def start(self):
        """
        Read the node (setting the watch) and start the polling thread,
        unless already started.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._watch_node()
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Signal the polling thread to finish and wait for it to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """
        Thread target: re-evaluate the window every `interval` seconds until
        the operate flag is cleared by stop().
        """
        # `== True` is intentional: _operate is a ThreadSafeObject wrapper,
        # not a plain bool.
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done comparing guts.')

    def _process_met(self):
        """
        Set 'met' to True only if every known bound (start, stop) holds for
        the current time. With no bounds known, 'met' is set to False.
        """
        results = []
        if self._start is not None:
            compare_to_start = self._get_comparison(self._start)
            results.append(self._start < compare_to_start)
        if self._stop is not None:
            compare_to_stop = self._get_comparison(self._stop)
            results.append(compare_to_stop < self._stop)

        if not results:
            results.append(False)

        self.set_met(all(results))  # every comparison returned True

    def _get_comparison(self, obj):
        """
        Return "now" in the same type as `obj` so the two can be compared.

        :type obj: datetime.datetime or datetime.time
        :rtype: datetime.datetime or datetime.time or None
        """
        if isinstance(obj, datetime.datetime):
            return self.current_datetime
        elif isinstance(obj, datetime.time):
            return self.current_time

    def _parse_data(self, gut_data):
        """
        Update the start/stop bounds from the decoded node content.
        A bound that is absent from `gut_data` keeps its previous value.

        :type gut_data: dict
        """
        start_data = gut_data.get(u'start', None)
        self._log.debug('raw start from zk is "{0}"'.format(start_data))
        if start_data is not None:
            self._start = TimeWindow.get_datetime_object(start_data)

        stop_data = gut_data.get(u'stop', None)
        self._log.debug('raw stop from zk is "{0}"'.format(stop_data))
        if stop_data is not None:
            self._stop = TimeWindow.get_datetime_object(stop_data)

        if start_data is None and stop_data is None:
            self._log.error('Start and Stop time not specified!')

        self._log.info('The current time is: {0}. Start time is: {1}. '
                       'Stop time is: {2}'
                       .format(self.current_time, self._start, self._stop))

    @connected
    def _watch_node(self, event=None):
        """
        Read the GUT node, re-register this method as its watch, and refresh
        the 'met' value. Invalid node content is logged, never raised.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        try:
            exists = self.zkclient.exists(self.node, watch=self._watch_node)
            if exists:
                data, _ = self.zkclient.get(self.node, watch=self._watch_node)
                j = json.loads(data)
                # Valid JSON is not necessarily an object; a list or scalar
                # has no .get() and would crash the watch callback.
                if not isinstance(j, dict):
                    raise ValueError('expected a JSON object, got {0}'
                                     .format(type(j).__name__))
                self._parse_data(j)
            else:
                self._log.info('No gut node was found. Watcher is set at {0}'
                               .format(self.node))
        except (ValueError, TypeError) as ex:
            # TypeError covers non-string start/stop values and node data
            # that json cannot accept at all.
            self._log.error('Invalid GUT JSON object: {0}'.format(ex))
        finally:
            self._process_met()

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, start="{3}", stop="{4}", '
                'zkpath={5}, started={6}, operational={7}, met={8})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self._start,
                        self._stop,
                        self.node,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        return all([
            type(self) == type(other),
            self.node == getattr(other, 'node', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.node != getattr(other, 'node', None)
        ])
class ChildProcess(object):
    """
    Wraps a threading.Thread, providing a Queue for communication between
    the SentinelDaemon and the ChildProcess.
    """
    def __init__(self, config, system, settings):
        """
        :type config: xml.etree.ElementTree.Element
        :type system: zoom.common.types.PlatformType
        :type settings: dict
        :raises ValueError: if the config's 'type' is not a known
            application type
        """
        self._log = logging.getLogger('sent.child')
        self._action_queue = UniqueQueue()
        self._cancel_flag = ThreadSafeObject(False)
        self.name = verify_attribute(config, 'id')
        self._application_type = verify_attribute(config, 'type')
        self._config = config
        self._system = system  # Linux or Windows
        self._settings = settings
        self._process = self._create_process()

    def add_work(self, work, immediate=False):
        """
        Queue a task for the worker; `immediate` puts it at the front.

        :type work: zoom.agent.task.task.Task
        :type immediate: bool
        :rtype: bool
        """
        return self._action_queue.append_unique(work,
                                                sender=str(self),
                                                first=immediate)

    def cancel_current_task(self):
        """
        Set the cancel flag that is used in the process client.
        """
        # this seems like a hack. There must be a better way of cancelling
        # while still allowing the agent to report up/down status
        DONT_REMOVE = ('register', 'unregister')
        self._log.info('Setting Cancel Flag and clearing queue.')
        self._cancel_flag.set_value(True)
        # iterate over a copy: the queue is mutated inside the loop
        for i in list(self._action_queue):
            if i.name not in DONT_REMOVE:
                self._action_queue.remove(i)
                self._log.info('Removing task {0}'.format(i))

    def stop(self):
        """
        Stops the Process/Thread
        """
        try:
            self._log.info('Terminating {0} child process'.format(self.name))
            self.cancel_current_task()
            self.add_work(Task('terminate', block=True), immediate=True)
        except Exception as e:
            # best effort: a failure to enqueue must not abort shutdown
            self._log.warning(
                'Exception with stopping {0} child process: {1}'.format(
                    self.name, e))

    def join(self):
        """
        Block until underlying process completes.
        """
        self._process.join()
        self._log.info('{0} stopped.'.format(self))

    def _create_process(self):
        """
        Build the worker for this application type and start it in a daemon
        thread.

        :rtype: threading.Thread
        :raises ValueError: if the application type is neither APPLICATION
            nor JOB
        """
        self._log.debug('Starting worker process for %s' % self.name)

        if self._application_type == ApplicationType.APPLICATION:
            worker_cls = Application
        elif self._application_type == ApplicationType.JOB:
            worker_cls = Job
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError that hid the real problem.
            raise ValueError(
                'Unknown application type "{0}" for {1}'.format(
                    self._application_type, self.name))

        s = worker_cls(self._config, self._settings, self._action_queue,
                       self._system, self._application_type,
                       self._cancel_flag)

        t = Thread(target=s.run, name=self.name)
        t.daemon = True
        t.start()

        return t

    def __str__(self):
        return 'ChildProcess(name={0}, type={1})'.format(
            self.name, self._application_type)
class PredicateProcess(SimplePredicate):
    """
    Predicate that polls a process client and sets 'met' to whether the
    process is running, pausing the check while a cancel flag is raised.
    """
    def __init__(self, comp_name, proc_client, interval, operational=False,
                 parent=None):
        """
        :type comp_name: str
        :type proc_client: zoom.agent.client.process_client.ProcessClient
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(self, comp_name, operational=operational,
                                 parent=parent)
        logger_name = 'sent.{0}.pred.process'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        self._proc_client = proc_client

        # lock for synchronous decorator
        self.process_client_lock = (
            proc_client.process_client_lock if proc_client else Lock())

        self.interval = interval
        self._operate = ThreadSafeObject(True)
        poller = Thread(target=self._run_loop, name=str(self))
        poller.daemon = True
        self._thread = poller
        self._started = False

    def running(self):
        """
        With the synchronous decorator, this shares a Lock object with the
        ProcessClient. While ProcessClient.start is running, this will not
        return.

        :rtype: bool
        """
        return self._proc_client.running()

    def start(self):
        """
        Launch the polling thread; a no-op (besides logging) when it was
        already started.
        """
        if self._started is not False:
            self._log.debug('Already started {0}'.format(self))
            return

        self._log.debug('Starting {0}'.format(self))
        self._started = True
        self._thread.start()
        self._block_until_started()

    def stop(self):
        """
        Clear the operate flag and wait for the polling thread to exit; a
        no-op (besides logging) when it is not started.
        """
        if self._started is not True:
            self._log.debug('Already stopped {0}'.format(self))
            return

        self._log.info('Stopping {0}'.format(self))
        self._started = False
        self._operate.set_value(False)
        self._thread.join()
        self._log.info('{0} stopped'.format(self))

    def _run_loop(self):
        """
        Thread target. Each pass either refreshes 'met' from running(), or,
        while the cancel flag is raised, skips the check. After two skipped
        passes the cancel flag is reset.
        """
        skipped = 0
        # the flags are wrapper objects, hence the explicit comparisons
        while self._operate == True:
            if self._proc_client.cancel_flag == False:
                self.set_met(self.running())
                skipped = 0
            elif skipped <= 1:
                skipped += 1
                self._log.info('Cancel Flag detected, skipping status check.')
            else:
                self._log.info('Waited long enough. Resetting cancel flag.')
                self._proc_client.cancel_flag.set_value(False)
                skipped = 0

            sleep(self.interval)

        self._log.info('Done watching process.')

    def __repr__(self):
        template = ('{0}(component={1}, parent={2}, interval={3}, '
                    'started={4}, operational={5}, met={6})')
        return template.format(self.__class__.__name__,
                               self._comp_name,
                               self._parent,
                               self.interval,
                               self.started,
                               self._operational,
                               self._met)

    def __eq__(self, other):
        same_type = type(self) == type(other)
        same_interval = self.interval == getattr(other, 'interval', None)
        return bool(same_type and same_interval)

    def __ne__(self, other):
        other_type = type(self) != type(other)
        other_interval = self.interval != getattr(other, 'interval', None)
        return bool(other_type or other_interval)
class PredicateHealth(SimplePredicate):
    """
    Predicate that periodically runs a check command and sets 'met' from
    its exit status.
    """
    def __init__(self, comp_name, command, interval, system,
                 operational=False, parent=None):
        """
        :type comp_name: str
        :type command: str
        :type interval: int or float
        :type system: zoom.common.types.PlatformType
        :type operational: bool
        :type parent: str or None
        :raises OSError: if the command's executable cannot be found
        """
        SimplePredicate.__init__(self, comp_name, operational=operational,
                                 parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.health'.format(comp_name))
        self.interval = interval
        self.rawcmd = command
        self._runcmd = str()
        self._system = system
        self._verify()

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._log.info('Registered {0}'.format(self))
        self._started = False

    def start(self):
        """
        Start the polling thread unless it was already started.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Signal the polling thread to finish and wait for it to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _verify(self):
        """
        Prepare the command for the target platform and make sure its
        executable exists, either as given or in a directory on PATH.

        :raises OSError: if the executable cannot be found
        """
        if self._system == PlatformType.LINUX:
            self._runcmd = shlex.split(self.rawcmd)
        elif self._system == PlatformType.WINDOWS:
            self._runcmd = self.rawcmd
        else:
            self._runcmd = ""

        parts = shlex.split(self.rawcmd)
        # An empty command has no executable; treat it as "not found"
        # instead of failing with an IndexError.
        exe = parts[0] if parts else ''

        exists = bool(exe) and os.path.exists(exe)
        if exe and not exists:
            # os.pathsep, not ':': Windows separates PATH entries with ';'
            # and its entries contain ':' themselves (drive letters).
            for directory in os.environ.get('PATH', '').split(os.pathsep):
                if os.path.exists(os.path.join(directory, exe)):
                    exists = True
                    break

        if not exists:
            err = ('Cannot register check "{0}". The path does not exist.'.
                   format(exe))
            self._log.error(err)
            raise OSError(err)

    def _run(self):
        """
        Run the check as a subprocess and set 'met' based on its return
        code. (Non-zero equals failure)
        Output on stderr is logged but does not decide the result. A command
        that cannot be launched counts as a failed check.
        """
        try:
            p = Popen(self._runcmd, stdout=PIPE, stderr=PIPE)
            out, err = p.communicate()
        except OSError as ex:
            # Keep the polling thread alive if the executable disappears
            # or cannot be executed.
            self._log.error('Check "{0}" could not be run: {1}'.format(
                self.rawcmd, ex))
            self.set_met(False)
            return

        if err:
            self._log.error(
                'There was some error with the check "{0}"\n{1}'.format(
                    self.rawcmd, err))

        # set_met is called exactly once per run so that a check which
        # writes to stderr but exits 0 does not flip met False then True.
        if p.returncode != 0:
            self._log.error('Check "{0}" has failed.'.format(self.rawcmd))
            self.set_met(False)
        else:
            self._log.debug('Check "{0}" has succeeded.'.format(self.rawcmd))
            self.set_met(True)

    def _run_loop(self):
        """
        Thread target: run the check every `interval` seconds until the
        operate flag is cleared by stop().
        """
        # `== True` is intentional: _operate is a ThreadSafeObject wrapper,
        # not a plain bool.
        while self._operate == True:
            self._run()
            sleep(self.interval)
        self._log.info('Done running {0}'.format(self))

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, cmd="{3}", interval={4} '
                'started={5}, operational={6}, met={7})'.format(
                    self.__class__.__name__,
                    self._comp_name,
                    self._parent,
                    self.rawcmd,
                    self.interval,
                    self.started,
                    self._operational,
                    self._met))

    def __eq__(self, other):
        return all([
            type(self) == type(other),
            self.rawcmd == getattr(other, 'rawcmd', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.rawcmd != getattr(other, 'rawcmd', None),
            self.interval != getattr(other, 'interval', None)
        ])
class PredicateProcess(SimplePredicate):
    """
    Predicate that polls a process client and sets 'met' to whether the
    process is running.
    """
    def __init__(self, comp_name, settings, proc_client, interval,
                 parent=None):
        """
        :type comp_name: str
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type proc_client: zoom.agent.client.process_client.ProcessClient
        :type interval: int or float
        :type parent: str or None
        """
        SimplePredicate.__init__(self, comp_name, settings, parent=parent)
        logger_name = 'sent.{0}.pred.process'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        self._proc_client = proc_client

        # lock for synchronous decorator
        self.process_client_lock = (
            proc_client.process_client_lock if proc_client else Lock())

        self.interval = interval
        self._operate = ThreadSafeObject(True)
        poller = Thread(target=self._run_loop, name=str(self))
        poller.daemon = True
        self._thread = poller
        self._started = False

    def running(self):
        """
        With the synchronous decorator, this shares a Lock object with the
        ProcessClient. While ProcessClient.start is running, this will not
        return.

        :rtype: bool
        """
        return self._proc_client.running()

    def start(self):
        """
        Launch the polling thread; a no-op (besides logging) when it was
        already started.
        """
        if self._started is not False:
            self._log.debug('Already started {0}'.format(self))
            return

        self._log.debug('Starting {0}'.format(self))
        self._started = True
        self._thread.start()

    def stop(self):
        """
        Clear the operate flag and wait for the polling thread to exit; a
        no-op (besides logging) when it is not started.
        """
        if self._started is not True:
            self._log.debug('Already stopped {0}'.format(self))
            return

        self._log.info('Stopping {0}'.format(self))
        self._started = False
        self._operate.set_value(False)
        self._thread.join()
        self._log.info('{0} stopped'.format(self))

    def _run_loop(self):
        """
        Thread target: refresh 'met' from running() every `interval` seconds
        until the operate flag is cleared.
        """
        # the flag is a wrapper object, hence the explicit comparison
        while self._operate == True:
            is_running = self.running()
            self.set_met(is_running)
            sleep(self.interval)

        self._log.info('Done watching process.')

    def __repr__(self):
        template = ('{0}(component={1}, parent={2}, interval={3}, '
                    'started={4}, met={5})')
        return template.format(self.__class__.__name__,
                               self._comp_name,
                               self._parent,
                               self.interval,
                               self.started,
                               self._met)

    def __eq__(self, other):
        same_type = type(self) == type(other)
        same_interval = self.interval == getattr(other, 'interval', None)
        return bool(same_type and same_interval)

    def __ne__(self, other):
        other_type = type(self) != type(other)
        other_interval = self.interval != getattr(other, 'interval', None)
        return bool(other_type or other_interval)
class PredicateHoliday(SimplePredicate):
    """
    Predicate that is met when today's date (YYYYMMDD) is one of the child
    node names found under a Zookeeper path.
    """
    def __init__(self, comp_name, zkclient, path, operational=False,
                 parent=None, interval=10):
        """
        :type comp_name: str
        :type zkclient: kazoo.client.KazooClient
        :type path: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, operational=operational,
                                 parent=parent)
        self.zkclient = zkclient
        self.interval = interval
        self.path = path
        logger_name = 'sent.{0}.holiday'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        self._log.info('Registered {0}'.format(self))
        self._operate = ThreadSafeObject(True)
        poller = Thread(target=self._run_loop, name=str(self))
        poller.daemon = True
        self._thread = poller
        self._started = False
        self._holidays = list()

    @property
    def date_string(self):
        """
        :rtype: str
            Example: 20140101
        """
        today = datetime.date.today()
        return today.strftime('%Y%m%d')

    def start(self):
        """
        Read the holiday node (setting the watch) and launch the polling
        thread; a no-op (besides logging) when already started.
        """
        if self._started is not False:
            self._log.debug('Already started {0}'.format(self))
            return

        self._log.debug('Starting {0}'.format(self))
        self._started = True
        self._watch_node()
        self._thread.start()
        self._block_until_started()

    def stop(self):
        """
        Clear the operate flag and wait for the polling thread to exit; a
        no-op (besides logging) when it is not started.
        """
        if self._started is not True:
            self._log.debug('Already stopped {0}'.format(self))
            return

        self._log.info('Stopping {0}'.format(self))
        self._started = False
        self._operate.set_value(False)
        self._thread.join()
        self._log.info('{0} stopped'.format(self))

    def _run_loop(self):
        """
        Thread target: re-check for a holiday every `interval` seconds until
        the operate flag is cleared.
        """
        # the flag is a wrapper object, hence the explicit comparison
        while self._operate == True:
            self._process_met()
            sleep(self.interval)

        self._log.info('Done checking for holiday.')

    def _process_met(self):
        """
        Set 'met' to whether today's date string is a known holiday.
        """
        is_holiday = self.date_string in self._holidays
        self.set_met(is_holiday)

    @connected
    def _watch_node(self, event=None):
        """
        Reload the holiday list from the children of `path`, re-registering
        this method as the watch. Without a path nothing is read.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        if self.path is None:
            self._log.warning(
                'No zookeeper path given. This predicate will nevr be met.')
            return

        if not self.zkclient.exists(self.path, watch=self._watch_node):
            self._log.info(
                'No gut node was found. Watcher is set at {0}'.format(
                    self.path))
            return

        self._holidays = self.zkclient.get_children(self.path,
                                                    watch=self._watch_node)
        self._log.info('Got holidays {0}'.format(self._holidays))
        self._process_met()

    def __repr__(self):
        template = ('{0}(component={1}, parent={2}, started={3}, '
                    'operational={4}, met={5})')
        return template.format(self.__class__.__name__,
                               self._comp_name,
                               self._parent,
                               self.started,
                               self._operational,
                               self._met)

    def __eq__(self, other):
        return type(self) == type(other)

    def __ne__(self, other):
        return type(self) != type(other)
class TimeWindow(SimplePredicate):
    """
    Predicate for comparing the current time to begin/end times.
    It sets the 'met' value based on begin < current_time < end and, when
    weekdays are given, on today's weekday being one of them.
    """
    def __init__(self, comp_name, begin=None, end=None, weekdays=None,
                 operational=False, parent=None, interval=5):
        """
        :type comp_name: str
        :type begin: str or None
        :type end: str or None
        :type weekdays: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, operational=operational,
                                 parent=parent)
        self.begin = self.get_datetime_object(begin)
        self.end = self.get_datetime_object(end)
        self.day_range = self.parse_range(weekdays)
        self.interval = interval
        self._log = logging.getLogger('sent.{0}.pred.timewin'.format(comp_name))
        self._log.info('Registered {0}'.format(self))
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

    def weekday(self):
        """
        :rtype: int
            0=Monday, 1=Tuesday, etc.
        """
        return datetime.date.today().weekday()

    def start(self):
        """
        Start the polling thread unless it was already started.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Signal the polling thread to finish and wait for it to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """
        Thread target: re-evaluate the window every `interval` seconds until
        the operate flag is cleared by stop().
        """
        # `== True` is intentional: _operate is a ThreadSafeObject wrapper,
        # not a plain bool.
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done comparing times.')

    def _process_met(self):
        """
        Evaluate every configured condition (begin, end, weekdays) and set
        'met' to True only if all of them hold. When nothing is configured
        'met' is set to False.
        """
        results = []
        if self.begin is not None:
            compare_to_begin = self._get_comparison(self.begin)
            results.append(self.begin < compare_to_begin)
        if self.end is not None:
            compare_to_end = self._get_comparison(self.end)
            results.append(compare_to_end < self.end)

        if self.day_range is not None:
            results.append(self.weekday() in self.day_range)

        if not results:
            results.append(False)

        self.set_met(all(results))  # every comparison returned True

    def _get_comparison(self, obj):
        """
        Return "now" in the same type as `obj` so the two can be compared.

        :type obj: datetime.datetime or datetime.time
        :rtype: datetime.datetime or datetime.time or None
        """
        if isinstance(obj, datetime.datetime):
            return datetime.datetime.now()
        elif isinstance(obj, datetime.time):
            return datetime.datetime.now().time()

    @staticmethod
    def get_datetime_object(data):
        """
        Create datetime object from string value.
        A string with a date part ("YYYY-MM-DD HH:MM[:SS]") yields a
        datetime.datetime; a time-only string ("HH:MM[:SS]") yields a
        datetime.time. Anything that cannot be parsed is logged and yields
        None.

        :type data: str or None
        :rtype: datetime.datetime or datetime.time or None
        """
        if data is None:
            return None

        log = logging.getLogger('PredicateTime')
        dt_object = None
        try:
            # Parsed inside the try so that non-string data (TypeError from
            # the regex search) is logged instead of escaping to the caller.
            dt_dict = TimeWindow.create_datetime_dict(data)

            # All of year, month and day are not None
            if all(dt_dict.get(i) is not None
                   for i in ('year', 'month', 'day')):
                dt_object = datetime.datetime(year=dt_dict['year'],
                                              month=dt_dict['month'],
                                              day=dt_dict['day'],
                                              hour=dt_dict['hour'],
                                              minute=dt_dict['minute'],
                                              microsecond=0)
                if dt_dict.get('second') is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])
            # both hour and minute are not None
            elif all(dt_dict.get(i) is not None for i in ('hour', 'minute')):
                dt_object = datetime.time(hour=dt_dict['hour'],
                                          minute=dt_dict['minute'],
                                          microsecond=0)
                if dt_dict.get('second') is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])
            else:
                log.error(
                    'data "{0}" did not match regex. This will result in the '
                    'paramter returning as None. The predicate will never be '
                    'met for this parameter. '.format(data))

        except (ValueError, TypeError) as ex:
            log.error('Problem with parsing data "{0}": {1}'.format(data, ex))

        # Returned after the try block rather than from a `finally` clause,
        # which would silently discard any in-flight exception.
        return dt_object

    @staticmethod
    def create_datetime_dict(datetime_string):
        """
        Split a "[YYYY-MM-DD ]H[H]:MM[:SS]" string into its integer parts.
        Parts absent from the string map to None; a string that does not
        match at all yields an empty dict.

        :type datetime_string: str
        :rtype: dict
        """
        datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?"
        )

        regex_dict = dict()
        match = re.search(datetime_regex, datetime_string)
        if match:
            # convert all matched values to integers, leave the rest as None
            for key in ('year', 'month', 'day', 'hour', 'minute', 'second'):
                value = match.group(key)
                regex_dict[key] = int(value) if value is not None else None

        logging.getLogger('PredicateTime').debug(
            'datetime_dict returning {0}'.format(regex_dict))

        return regex_dict

    @staticmethod
    def parse_range(astr):
        """
        https://www.darklaunch.com/2012/11/05/python-parse-range-and-parse-group-range
        Return a range list given a string such as "0-2,4".
        As this is for weekdays, only return 0-6.
        An unparsable string is logged and yields an empty list.

        :type astr: str or None
        :rtype: list or None
        """
        if astr is None:
            return None

        try:
            result = set()
            for part in astr.split(','):
                x = part.split('-')
                result.update(range(int(x[0]), int(x[-1]) + 1))
            # only accept 0-6
            return [i for i in sorted(result) if 0 <= i <= 6]
        except ValueError:
            logging.warning('Error parsing day range. Returning [].')
            return []

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, begin="{3}", '
                'end="{4}", days={5}, started={6}, operational={7}, met={8})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.begin,
                        self.end,
                        self.day_range,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        return all([
            type(self) == type(other),
            self.begin == getattr(other, 'begin', None),
            self.end == getattr(other, 'end', None),
            self.day_range == getattr(other, 'day_range', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.begin != getattr(other, 'begin', None),
            self.end != getattr(other, 'end', None),
            self.day_range != getattr(other, 'day_range', None),
            self.interval != getattr(other, 'interval', None)
        ])
class APIPredicate(SimplePredicate):
    """
    Predicate that polls a url for a specific code.
    """
    def __init__(self, comp_name, url, verb='GET', expected_code=200,
                 interval=5.0, operational=False, parent=None):
        """
        :type comp_name: str
        :type url: str
        :type verb: str
        :type expected_code: int
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(self, comp_name, operational=operational,
                                 parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.api'.format(comp_name))
        # keep the per-request connection-pool chatter out of the log
        logging.getLogger(
            'requests.packages.urllib3.connectionpool').setLevel(
                logging.WARNING)
        self.url = url
        self.verb = verb
        self.expected_code = expected_code
        self.interval = interval
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._log.info('Registered {0}'.format(self))
        self._started = False

    def start(self):
        """
        Start the polling thread unless it was already started.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Signal the polling thread to finish and wait for it to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run(self):
        """
        Query the given url, and report whether we get the expected code.
        Any failure to complete the request sets 'met' to False.
        """
        try:
            r = requests.request(self.verb, self.url, timeout=2)
            self.set_met(r.status_code == self.expected_code)
        except requests.ConnectionError:
            self._log.debug('URL {0} is not available.'.format(self.url))
            self.set_met(False)
        except requests.Timeout:
            self._log.debug('Timed out to URL {0}.'.format(self.url))
            self.set_met(False)
        except requests.RequestException as ex:
            # Everything else requests can raise (invalid URL, too many
            # redirects, ...). Uncaught, it would end the polling thread
            # and freeze 'met' at its last value.
            self._log.error('Request to URL {0} failed: {1}'.format(
                self.url, ex))
            self.set_met(False)

    def _run_loop(self):
        """
        Thread target: query the url every `interval` seconds until the
        operate flag is cleared by stop().
        """
        # `== True` is intentional: _operate is a ThreadSafeObject wrapper,
        # not a plain bool.
        while self._operate == True:
            self._run()
            sleep(self.interval)
        self._log.info('Done querying {0}'.format(self.url))

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, url="{3}", verb={4}, '
                'interval={5} started={6}, operational={7}, met={8})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.url,
                        self.verb,
                        self.interval,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        return all([
            type(self) == type(other),
            self.url == getattr(other, 'url', None),
            self.verb == getattr(other, 'verb', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.url != getattr(other, 'url', None),
            self.verb != getattr(other, 'verb', None),
            self.interval != getattr(other, 'interval', None)
        ])
class APIPredicate(SimplePredicate):
    """
    Predicate that polls a url for a specific code.
    """
    def __init__(self, comp_name, url, verb='GET', expected_code=200,
                 interval=5.0, operational=False, parent=None):
        """
        :type comp_name: str
        :type url: str
        :type verb: str
        :type expected_code: int
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(self, comp_name, operational=operational,
                                 parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.api'.format(comp_name))
        # keep the per-request connection-pool chatter out of the log
        logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(
            logging.WARNING)
        self.url = url
        self.verb = verb
        self.expected_code = expected_code
        self.interval = interval
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._log.info('Registered {0}'.format(self))
        self._started = False

    def start(self):
        """
        Start the polling thread unless it was already started.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Signal the polling thread to finish and wait for it to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run(self):
        """
        Query the given url, and report whether we get the expected code.
        Any failure to complete the request sets 'met' to False.
        """
        try:
            r = requests.request(self.verb, self.url, timeout=2)
            self.set_met(r.status_code == self.expected_code)
        except requests.ConnectionError:
            self._log.debug('URL {0} is not available.'.format(self.url))
            self.set_met(False)
        except requests.Timeout:
            self._log.debug('Timed out to URL {0}.'.format(self.url))
            self.set_met(False)
        except requests.RequestException as ex:
            # Everything else requests can raise (invalid URL, too many
            # redirects, ...). Uncaught, it would end the polling thread
            # and freeze 'met' at its last value.
            self._log.error('Request to URL {0} failed: {1}'.format(
                self.url, ex))
            self.set_met(False)

    def _run_loop(self):
        """
        Thread target: query the url every `interval` seconds until the
        operate flag is cleared by stop().
        """
        # `== True` is intentional: _operate is a ThreadSafeObject wrapper,
        # not a plain bool.
        while self._operate == True:
            self._run()
            sleep(self.interval)
        self._log.info('Done querying {0}'.format(self.url))

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, url="{3}", verb={4}, '
                'interval={5} started={6}, operational={7}, met={8})'.format(
                    self.__class__.__name__,
                    self._comp_name,
                    self._parent,
                    self.url,
                    self.verb,
                    self.interval,
                    self.started,
                    self._operational,
                    self._met))

    def __eq__(self, other):
        return all([
            type(self) == type(other),
            self.url == getattr(other, 'url', None),
            self.verb == getattr(other, 'verb', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.url != getattr(other, 'url', None),
            self.verb != getattr(other, 'verb', None),
            self.interval != getattr(other, 'interval', None)
        ])
class ChildProcess(object):
    """
    Wraps a threading.Thread, providing a Queue for communication between
    the SentinelDaemon and the ChildProcess.
    """
    def __init__(self, config, system, settings):
        """
        :type config: xml.etree.ElementTree.Element
        :type system: zoom.common.types.PlatformType
        :type settings: dict
        """
        self._log = logging.getLogger("sent.child")
        self._action_queue = UniqueQueue()
        self._cancel_flag = ThreadSafeObject(False)
        self.name = verify_attribute(config, "id")
        self._application_type = verify_attribute(config, "type")
        self._config = config
        self._system = system  # Linux or Windows
        self._settings = settings
        # the worker thread is created AND started here
        self._process = self._create_process()

    def add_work(self, work, immediate=False):
        """
        Queue a task for the worker.

        :type work: zoom.agent.task.task.Task
        :type immediate: bool
        :rtype: bool
        """
        return self._action_queue.append_unique(work,
                                                sender=str(self),
                                                first=immediate)

    def cancel_current_task(self):
        """
        Set the cancel flag that is used in the process client, and drop
        every queued task except "register" and "unregister".
        """
        # this seems like a hack. There must be a better way of cancelling
        # while still allowing the agent to report up/down status
        DONT_REMOVE = ("register", "unregister")
        self._log.info("Setting Cancel Flag and clearing queue.")
        self._cancel_flag.set_value(True)
        # iterate over a snapshot: the queue is mutated inside the loop
        for i in list(self._action_queue):
            if i.name in DONT_REMOVE:
                continue
            try:
                self._action_queue.remove(i)
            except ValueError:
                # the task was taken off the queue after the snapshot was
                # made (the work manager removes tasks it has handled)
                self._log.debug(
                    "Task no longer in the queue: {0}".format(i))
                continue
            self._log.info("Removing task {0}".format(i))

    def stop(self):
        """
        Stops the Process/Thread by cancelling queued work and queueing a
        blocking "terminate" task at the front of the queue. Errors are
        logged, never raised.
        """
        try:
            self._log.info("Terminating {0} child process".format(self.name))
            self.cancel_current_task()
            self.add_work(Task("terminate", block=True), immediate=True)
        except Exception as e:
            self._log.warning("Exception with stopping {0} child process: {1}".format(self.name, e))

    def join(self):
        """
        Block until underlying process completes.
        """
        self._process.join()
        self._log.info("{0} stopped.".format(self))

    def _create_process(self):
        """
        Build the worker object for the configured application type and run
        it in a started daemon thread.

        :rtype: threading.Thread
        :raises ValueError: if the configured type is neither an application
            nor a job
        """
        self._log.debug("Starting worker process for %s" % self.name)
        if self._application_type == ApplicationType.APPLICATION:
            worker_cls = Application
        elif self._application_type == ApplicationType.JOB:
            worker_cls = Job
        else:
            # previously fell through and failed with an UnboundLocalError
            raise ValueError(
                "Unknown application type '{0}' for {1}"
                .format(self._application_type, self.name))

        s = worker_cls(
            self._config,
            self._settings,
            self._action_queue,
            self._system,
            self._application_type,
            self._cancel_flag,
        )
        t = Thread(target=s.run, name=self.name)
        t.daemon = True
        t.start()
        return t

    def __str__(self):
        return "ChildProcess(name={0}, type={1})".format(self.name, self._application_type)
class Application(object):
    """
    Service object to represent an deployed service.

    Owns a Zookeeper client, a process client, the configured actions and a
    work manager that executes tasks taken from the shared action queue.
    """
    def __init__(self, config, settings, queue, system, application_type,
                 cancel_flag):
        """
        :type config: dict (xml)
        :type settings: dict
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        """
        self.config = config
        self._settings = settings
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))
        # informational attributes
        self._host = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None
        self._actions = dict()  # created in _reset_watches on zk connect
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type
        self._restart_on_crash = \
            verify_attribute(self.config, 'restart_on_crash',
                             none_allowed=True)
        self._post_stop_sleep = verify_attribute(self.config,
                                                 'post_stop_sleep',
                                                 none_allowed=True,
                                                 cast=int,
                                                 default=5)

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        self._mode = ApplicationMode(
            ApplicationMode.MANUAL,
            callback=self._update_agent_node_with_app_details)
        self._state = ThreadSafeObject(
            ApplicationState.OK,
            callback=self._update_agent_node_with_app_details)
        self._start_stop_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        self._user_set_in_react = False
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        restartmax = verify_attribute(config, 'restartmax',
                                      none_allowed=True, cast=int, default=3)
        self._rl = RestartLogic(
            self.name,
            restartmax,
            count_callback=self._update_agent_node_with_app_details)

        self._read_only = False

        self._paths = self._init_paths(self.config, settings,
                                       application_type)

        # clients
        self.zkclient = KazooClient(
            hosts=get_zk_conn_string(),
            timeout=60.0,
            handler=SequentialThreadingHandler(),
            logger=logging.getLogger('kazoo.app.{0}'.format(self.name)))

        self.zkclient.add_listener(self._zk_listener)
        self._proc_client = self._init_proc_client(self.config,
                                                   application_type,
                                                   cancel_flag)

        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue)

    def app_details(self):
        """
        Snapshot of the application's details, as written to the state tree.

        :rtype: dict
        """
        return {'name': self.name,
                'host': self._host,
                'platform': self._system,
                'mode': self._mode.value,
                'state': self._state.value,
                'start_stop_time': self._start_stop_time,
                'login_user': self._login_user,
                'read_only': self._read_only,
                'restart_count': self._rl.count}

    def run(self):
        """
        - Start the zookeeper client
        - Start all actions and read the global mode.
        - Idle in the main loop until `terminate` clears the running flag,
          then uninitialize.

        Exceptions are logged (with traceback) and never propagate.
        """
        try:
            self.zkclient.start()
            # make all action objects start processing predicates
            self._log.info('Starting to process Actions.')
            for action in self._actions.values():  # start actions
                action.start()

            started = all([i.started for i in self._actions.values()])
            if not started:
                self._log.critical('All actions are not started!')
            else:
                self._log.info('All actions started.')

            self._check_mode()  # get global mode AFTER starting actions

            while self._running:
                sleep(5)

            self.uninitialize()
        except Exception as ex:
            # exc_info adds the traceback, which the message alone lacked
            self._log.critical('There was an exception in the main loop. '
                               'In a bad state. ({0})'.format(ex),
                               exc_info=True)

    @catch_exception(NodeExistsError)
    @connected
    def register(self, **kwargs):
        """
        Add entry to the state tree

        Only creates the node when it does not exist yet and the action named
        by ``action_name`` (default 'register') is ready.
        """
        action_name = kwargs.get('action_name', 'register')

        if not self.zkclient.exists(self._paths['zk_state_path']):
            if self._action_is_ready(action_name):
                self._log.info('Registering %s in state tree.' % self.name)
                self.zkclient.create(self._paths['zk_state_path'],
                                     ephemeral=True,
                                     makepath=True)

                # resolve any pager duty alarms
                self._create_alert_node(AlertActionType.RESOLVE,
                                        AlertReason.RESOLVED)
                # reset restart counters, etc
                self._proc_client.reset_counters()

                self._state.set_value(ApplicationState.STARTED)
            else:
                self._log.info('Action {0} is not ready. Not registering.'
                               .format(action_name))
        else:
            self._log.info('Already registered (node exists).')
        return 0

    @catch_exception(NoNodeError)
    @connected
    def unregister(self, **kwargs):
        """Remove entry from state tree"""
        action_name = kwargs.get('action_name', 'unregister')

        if self._action_is_ready(action_name):
            self._log.info('Un-registering %s from state tree.' % self.name)
            self.zkclient.delete(self._paths['zk_state_path'])

        return 0

    @catch_exception(RuntimeError)
    def uninitialize(self):
        """
        Gracefully stop this Zookeeper session, then free any resources
        held by the client.
        """
        self._log.info('Stopping Zookeeper client')
        self._work_manager.stop()
        for action in self._actions.values():  # stop actions
            action.stop()
        del self._predicates[:]  # make sure we delete old predicates
        self.zkclient.stop()
        self.zkclient.close()
        return 0

    @time_this
    def start(self, **kwargs):
        """
        Start actual process
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :return: 0 when the start is skipped, otherwise the process client's
            start result
        """
        # Restart from UI: ran_stop=True, stay_down=False
        # Stop from UI: ran_stop=True, stay_down=True
        # Crash: ran_stop=False, stay_down=False
        if self._proc_client.restart_logic.ran_stop \
                and self._proc_client.restart_logic.stay_down \
                and self._apptype == ApplicationType.APPLICATION:

            self._log.info('Not starting. App was stopped with Zoom.')
            # set to OK just in case we're staggered
            self._state.set_value(ApplicationState.OK)
            return 0
        elif self._proc_client.restart_logic.crashed and \
                not self._restart_on_crash:
            self._log.info('Not starting. The application has crashed.')
            self._state.set_value(ApplicationState.NOTIFY)
            return 0
        else:
            self._log.debug('Start allowed.')

        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        pd_enabled = kwargs.get('pd_enabled', True)

        self._start_stop_time = self._get_current_time()
        # set login user if not set in react
        if not self._user_set_in_react:
            self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STARTING)

        result = self._proc_client.start()

        if self._run_check_mode:  # Reset to global mode if restart with dep
            self._check_mode()
            self._run_check_mode = False

        if result == 0 or result == ApplicationStatus.CANCELLED:
            self._state.set_value(ApplicationState.STARTED)
        else:
            self._state.set_value(ApplicationState.ERROR)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER,
                                        AlertReason.FAILEDTOSTART)
            else:
                self._log.debug('PD is disabled, not sending alert.')

        return result

    @time_this
    def stop(self, **kwargs):
        """
        Stop actual process
        :param kwargs: Passed from:
            zoom.www.handlers.control_agent_handler.ControlAgentHandler,
            zoom.agent.action.action.Action
        :return: the process client's stop result
        """
        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        self._start_stop_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STOPPING)

        result = self._proc_client.stop(**kwargs)

        if result != ApplicationStatus.CANCELLED:
            # give everything time to catch up, not sure why anymore...
            self._log.info('Sleeping for the configured {0}s after stop.'
                           .format(self._post_stop_sleep))
            sleep(self._post_stop_sleep)

        # reset this value back to False
        self._user_set_in_react = False

        # a cancelled stop and a clean (0) stop both end in STOPPED
        if result != ApplicationStatus.CANCELLED and result != 0:
            self._state.set_value(ApplicationState.ERROR)
        else:
            self._state.set_value(ApplicationState.STOPPED)

        return result

    def status(self):
        """
        Log out the status of each configured action.
        :rtype: str
        """
        banner = '#' * 40 + ' STATUS ' + '#' * 40
        pieces = ['\n', banner, '\n{0}'.format(self), '\n']
        pieces.extend('\n{0}'.format(i.status)
                      for i in self._actions.values())
        pieces.extend(['\n', banner, '\n'])
        out = ''.join(pieces)
        self._log.info(out)
        return out

    def restart(self, **kwargs):
        """
        Clear the queue, then queue stop, unregister and start.
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        # if not self._action_is_ready('restart', allow_undefined=True):
        #     self._log.info('Restart action not ready.')
        #     return

        self._log.info('Running Restart. Queuing stop, unregister, start.')
        self._action_queue.clear()
        self._action_queue.append_unique(Task('stop', kwargs=kwargs))
        self._action_queue.append_unique(Task('unregister'))
        self._action_queue.append_unique(Task('start', kwargs=kwargs))
        return 0

    def dep_restart(self, **kwargs):
        """Queue a 'start_if_ready' task and flag a mode re-check on start."""
        self._run_check_mode = True  # only used in self.start()
        self._action_queue.append(Task('start_if_ready', kwargs=kwargs))
        return 0

    def start_if_ready(self, **kwargs):
        """
        Start when the start action is ready or is not defined at all;
        otherwise queue a 'react' task.
        """
        if self._action_is_ready('start'):
            self.start(**kwargs)
        # if start action doesn't exist, a.k.a. read only
        elif self._actions.get('start', None) is None:
            self.start(**kwargs)
        else:
            self._action_queue.append(Task('react', kwargs=kwargs))
        return 0

    @time_this
    @connected
    def ignore(self, **kwargs):
        """
        Switch the application mode to manual.
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._mode.set_value(ApplicationMode.MANUAL)
        self._log.info('Mode is now "{0}"'.format(self._mode))
        return 0

    @time_this
    @connected
    def react(self, **kwargs):
        """
        Switch the application mode to auto.
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._mode.set_value(ApplicationMode.AUTO)
        self._log.info('Mode is now "{0}"'.format(self._mode))

        # when react is called through "restart with dependencies" command
        self._user_set_in_react = True
        self._login_user = kwargs.get('login_user', 'Zoom')
        return 0

    @time_this
    @connected
    def notify(self, **kwargs):
        """
        Send notification based on arbitrary predicates
        :return: 1 if the action is not defined or not ready, else 0
        """
        action_name = kwargs.get('action_name', 'notify')
        pd_enabled = kwargs.get('pd_enabled', True)
        pd_reason = kwargs.get('pd_reason', None)
        if pd_reason is None:
            pd_reason = AlertReason.CRASHED

        if not self._action_is_ready(action_name):
            self._log.info('notify action not defined or not ready.')
            return 1

        self._state.set_value(ApplicationState.NOTIFY)
        if pd_enabled:
            self._create_alert_node(AlertActionType.TRIGGER, pd_reason)
        else:
            self._log.debug('PD is disabled, not sending alert.')
        return 0

    @time_this
    @connected
    def ensure_running(self, **kwargs):
        """
        Essentially a clone of `notify`, but tailored for process monitoring.
        :return: 1 if the state is already ERROR or the action is not
            defined / not ready, else 0
        """
        # Application failed to start. Already sent PD alert
        if self._state == ApplicationState.ERROR:
            return 1

        action_name = kwargs.get('action_name', 'ensure_running')
        pd_enabled = kwargs.get('pd_enabled', True)
        pd_reason = kwargs.get('pd_reason', None)
        if pd_reason is None:
            pd_reason = AlertReason.CRASHED

        if not self._action_is_ready(action_name):
            self._log.info('notify action not defined or not ready.')
            # return 1 like `notify` does (this path used to return None)
            return 1

        if not self._proc_client.restart_logic.ran_stop:
            # the application has crashed
            self._state.set_value(ApplicationState.NOTIFY)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER, pd_reason)
            else:
                self._log.debug('PD is disabled, not sending alert.')
        else:
            self._log.debug("Service shut down gracefully")

        return 0

    def terminate(self):
        """Terminate child thread/process"""
        self._running = False
        return 0

    def _action_is_ready(self, action_name, allow_undefined=False):
        """
        Check if a configured action's predicates are met
        :type action_name: str
        :type allow_undefined: bool
        :rtype: bool
        """
        action = self._actions.get(action_name, None)
        if allow_undefined and action is None:
            return True

        return action is not None and action.ready

    @catch_exception(NoNodeError)
    @connected
    def _update_agent_node_with_app_details(self, event=None):
        """
        Register app data with the agent in the state tree.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        if self._running and \
                not self.zkclient.exists(self._paths['zk_state_base']):
            self.zkclient.create(self._paths['zk_state_base'], makepath=True)

        data, stat = self.zkclient.get(self._paths['zk_state_base'])

        try:
            agent_apps = json.loads(data)
        except ValueError:
            agent_apps = dict()

        # check for config conflict
        other_host = agent_apps.get('host')
        if other_host is not None and self._host != other_host:
            self._log.error('There is a config conflict with {0}. Updates '
                            'will no longer be sent until it is resolved.'
                            .format(other_host))
            # run_callback=False: this method IS the state callback
            self._state.set_value(ApplicationState.CONFIG_ERROR,
                                  run_callback=False)

        # make sure data is the most recent
        if self.app_details() != agent_apps:
            self.zkclient.set(self._paths['zk_state_base'],
                              json.dumps(self.app_details()))
            self._log.debug('Registering app data {0}'
                            .format(self.app_details()))

        # set watch
        if self._state != ApplicationState.CONFIG_ERROR:
            self.zkclient.get(
                self._paths['zk_state_base'],
                watch=self._update_agent_node_with_app_details)
        else:
            self._log.error('Shutting down because of config error.')
            self.terminate()

    def _init_paths(self, config, settings, atype):
        """
        Build the Zookeeper paths used by this application.
        :rtype: dict
        """
        paths = dict()
        paths['zk_state_base'] = verify_attribute(
            config,
            'registrationpath',
            none_allowed=True,
            default=self._pathjoin(settings.get('zookeeper', {}).get('state'),
                                   atype, self.name)
        )

        paths['zk_state_path'] = \
            self._pathjoin(paths['zk_state_base'], self._host)
        paths['zk_config_path'] = \
            self._pathjoin(settings.get('zookeeper', {}).get('config'),
                           atype, self.name)
        paths['zk_agent_path'] = \
            self._pathjoin(settings.get('zookeeper', {}).get('agent_state'),
                           self._host)

        return paths

    def _init_proc_client(self, config, atype, cancel_flag):
        """Create the process client."""
        start_cmd = verify_attribute(config, 'start_cmd', none_allowed=True)
        stop_cmd = verify_attribute(config, 'stop_cmd', none_allowed=True)
        status_cmd = verify_attribute(config, 'status_cmd', none_allowed=True)
        script = verify_attribute(config, 'script', none_allowed=True)

        g_names = self._get_graphite_metric_names()

        return ProcessClient(name=self.name,
                             start_cmd=start_cmd,
                             stop_cmd=stop_cmd,
                             status_cmd=status_cmd,
                             script=script,
                             apptype=atype,
                             restart_logic=self._rl,
                             graphite_metric_names=g_names,
                             cancel_flag=cancel_flag)

    def _init_actions(self, settings):
        """
        Create the actions from the config and refresh the read-only flag.
        :rtype: dict
        """
        action_factory = ActionFactory(component=self,
                                       zkclient=self.zkclient,
                                       proc_client=self._proc_client,
                                       action_queue=self._action_queue,
                                       mode=self._mode,
                                       system=self._system,
                                       pred_list=self._predicates,
                                       app_state=self._state,
                                       settings=settings)

        actions = action_factory.create(self.config)
        self._determine_read_only(actions)
        return actions

    def _determine_read_only(self, actions):
        """Read-only when the start action is missing or disabled."""
        start_action = actions.get('start', None)

        if start_action is None:
            self._read_only = True
        elif start_action.disabled is True:
            self._read_only = True
        else:
            self._read_only = False

    def _init_work_manager(self, queue):
        """
        Map work names to callables and start a WorkManager on the queue.
        :rtype: zoom.agent.entities.work_manager.WorkManager
        """
        acceptable_work = dict()
        # actions have additional logic, so use those if available
        for k, v in self._actions.items():
            acceptable_work[k] = v.run

        # if action is not available, add public methods
        for attribute in [a for a in dir(self) if not a.startswith('_')]:
            obj = getattr(self, attribute)
            if hasattr(obj, '__call__'):
                if attribute not in acceptable_work:
                    acceptable_work[attribute] = obj
                else:
                    self._log.debug('Method {0} already assigned to action.'
                                    .format(attribute))

        manager = WorkManager(self.name, queue, acceptable_work)
        manager.start()
        return manager

    @connected
    def _check_mode(self, event=None):
        """
        Check global run mode for the agents.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        global_path = self._settings.get('zookeeper', {}).get('global_config')
        if global_path is None:
            self._log.warning('Received no global config path. Zoom will be '
                              'unable to change the global mode.')
            return

        modepath = self._pathjoin(global_path, 'mode')

        try:
            # re-registers itself as the watch on every read
            data, stat = self.zkclient.get(modepath, watch=self._check_mode)
            j = json.loads(data)
            self._log.info('Getting mode from Zookeeper from path: {0}'.
                           format(modepath))
            self._mode.set_value(str(j.get(u'mode', ApplicationMode.MANUAL)))
            self._log.info('Setting mode to "{0}"'.format(self._mode))
        except NoNodeError:
            self._log.info('ZK path {0} does not exist. Assuming mode "manual"'
                           .format(modepath))
        except Exception:
            self._log.exception('An uncaught exception has occurred.')

    def _pathjoin(self, *args):
        """
        Helper function to join paths. Uses os.path.join on Linux and plain
        '/' string joining on every other platform.
        :rtype: str
        """
        if self._system == PlatformType.LINUX:
            return os.path.join(*args)
        # Windows, and any unrecognised platform (which previously fell
        # through and returned None)
        return '/'.join(args)

    def _get_graphite_metric_names(self):
        """
        splits the state path at 'application' and returns the latter index
        :rtype: dict
        """
        names = {"result": None, "runtime": None, "updown": None}

        type_path = self._paths.get('zk_state_base')\
            .split(self._settings.get('zookeeper', {}).get('state') + '/', 1)[1]
        type_metric = type_path.replace('/', '.')

        graphite = self._settings.get('graphite')
        if graphite is not None:
            result_path = str(graphite.get('result'))
            runtime_path = str(graphite.get('runtime'))
            updown_path = str(graphite.get('updown'))

            names["result"] = result_path.format(type_metric)
            names["runtime"] = runtime_path.format(type_metric)
            names["updown"] = updown_path.format(type_metric)

        return names

    def _get_current_time(self):
        """Current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _get_alert_details(self, alert_action, reason):
        """
        Build the alert payload that `_create_alert_node` serializes.
        :rtype: dict
        """
        return {
            "action": alert_action,
            "service_key": self._pd_svc_key,
            "incident_key": self._pathjoin('sentinel', self.name, self._host),
            "description": ('Sentinel Error: name={0}, host={1}, issue="{2}".'
                            .format(self.name, self._host, reason)),
            "details": ('Sentinel Error: name={0}, host={1}, issue="{2}".\n'
                        'Review the application log and contact the appropriate'
                        ' development group.'
                        .format(self.name, self._host, reason))
        }

    @catch_exception(NoNodeError)
    @connected
    def _create_alert_node(self, alert_action, reason):
        """
        Create Node in ZooKeeper that will result in a PagerDuty alarm
        :type alert_action: zoom.common.types.AlertActionType
        """
        alert_details = self._get_alert_details(alert_action, reason)
        # path example: /foo/sentinel.bar.baz.HOSTFOO
        alert = self._settings.get('zookeeper', {}).get('alert')
        if alert is None:
            self._log.warning('Was given no alert path. This sentinel will be '
                              'unable to forward alerts to Zoom.')
            return

        alert_path = self._pathjoin(
            alert, re.sub('/', '.', alert_details['incident_key']))

        if self._env in self._settings.get('pagerduty', {}).get('enabled_environments', []):
            self._log.info('Creating alert "{0}" node for env: {1}'
                           .format(alert_action, self._env))

            if self.zkclient.exists(alert_path):
                self.zkclient.set(alert_path, value=json.dumps(alert_details))
            else:
                self.zkclient.create(alert_path,
                                     value=json.dumps(alert_details))
        else:
            self._log.info('Not creating alert "{0}" node for env: {1}'
                           .format(alert_action, self._env))
            self._log.info('Would have created path {0}'.format(alert_path))

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    def _reset_after_connection_loss(self):
        """
        Recreates all actions and predicates after connection loss.
        Recheck the mode and allowed instances.
        """
        if self._running:
            self._log.info('Application listener callback triggered')
            for action in self._actions.values():  # stop actions
                action.stop()
            self._actions.clear()
            self._predicates = []
            self._actions = self._init_actions(self._settings)
            for predicate in self._predicates:  # reset predicates
                predicate.reset()
            for action in self._actions.values():  # start actions
                action.start()
            self._check_mode()
            self._log.info('Application listener callback complete!')
        else:
            self._log.info('The daemon has called for termination. '
                           'Not trying to reset after connection loss.')

    def _zk_listener(self, state):
        """
        The callback function that runs when the connection state to
        Zookeeper changes.
        Either passes or immediately spawns a new thread that resets any
        watches, etc., so that it can listen to future connection state
        changes.
        """
        try:
            self._log.info('Zookeeper Connection went from {0} to {1}'
                           .format(self._prev_state, state))
            if self._prev_state is None and state == KazooState.CONNECTED:
                pass
            elif self._prev_state == KazooState.LOST and state == KazooState.CONNECTED:
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif self._prev_state == KazooState.CONNECTED and state == KazooState.SUSPENDED:
                pass
            elif self._prev_state == KazooState.CONNECTED and state == KazooState.LOST:
                pass
            elif self._prev_state == KazooState.SUSPENDED and state == KazooState.LOST:
                pass
            elif self._prev_state == KazooState.SUSPENDED and state == KazooState.CONNECTED:
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif state == KazooState.CONNECTED:
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            else:
                self._log.info('Zookeeper Connection in unknown state: {0}'
                               .format(state))
                # unknown transitions are not recorded as the previous state
                return
            self._prev_state = state

        except Exception as ex:
            self._log.exception('An uncaught exception has occurred in the '
                                'listener: {0}'.format(ex))

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return ("{0}(name={1}, runmode={2})"
                .format(self.__class__.__name__, self.name, self._mode))
class WorkManager(object):
    """
    Runs queued tasks on a background daemon thread.
    """
    def __init__(self, comp_name, queue, work_dict):
        """
        :type comp_name: str
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type work_dict: dict
        """
        self._operate = ThreadSafeObject(True)
        worker_args = (self._operate, queue, work_dict)
        self._thread = Thread(target=self._run,
                              name='work_manager',
                              args=worker_args)
        self._thread.daemon = True
        logger_name = 'sent.{0}.wm'.format(comp_name)
        self._log = logging.getLogger(logger_name)

    def start(self):
        """Start the worker thread."""
        self._log.info('starting work manager')
        self._thread.start()

    def stop(self):
        """Clear the operate flag and wait for the worker thread to exit."""
        self._log.info('Stopping work manager.')
        self._operate.set_value(False)
        self._thread.join()
        self._log.info('Stopped work manager.')

    def _run(self, operate, queue, work_dict):
        """
        Worker loop: handle the task at the head of the queue, or sleep one
        second when the queue is empty, until `operate` stops comparing
        equal to True.

        :type operate: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type work_dict: dict
        """
        while operate == True:
            if not queue:  # nothing to do yet
                time.sleep(1)
                continue

            snapshot = pprint.pformat(list(queue))
            self._log.info('Current Task Queue:\n{0}'.format(snapshot))

            # peek at the head; it stays queued until it has been handled
            current = queue[0]
            if current.func is None:
                work = work_dict.get(current.name, None)
            else:
                work = current.func

            if work is None:
                self._log.warning('Cannot do "{0}", it is not a valid '
                                  'action.'.format(current.name))
            else:
                self._log.info('Found work "{0}" in queue.'
                               .format(current.name))
                runner = ThreadWithReturn(target=work,
                                          name=current.name,
                                          args=current.args,
                                          kwargs=current.kwargs)
                runner.start()
                # only blocking tasks wait for (and record) the result
                if current.block:
                    current.result = runner.join()

            try:
                queue.remove(current)
            except ValueError:
                self._log.debug('Item no longer exists in the queue: {0}'
                                .format(current))

        self._log.info('Done listening for work.')
        return
class SentinelDaemon(object):
    """
    Reads the agent configuration from Zookeeper and manages one
    ChildProcess per configured component.
    """
    def __init__(self):
        """
        Read config and spawn child processes.
        """
        self._log = logging.getLogger('sent.daemon')
        self._log.info('Creating Sentinel')

        self.children = dict()
        self._settings = ThreadSafeObject(dict())
        self._system = self._get_system()
        self._hostname = platform.node().upper()  # must be uppercase
        self._prev_state = None
        self.listener_lock = Lock()
        # Must exist BEFORE the client starts: the listener spawns
        # _reset_after_connection_loss, which reads this attribute.
        self.task_client = None

        if self._system == PlatformType.LINUX:
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler(),
                                        logger=logging.getLogger('kazoo.daemon'))
        elif self._system == PlatformType.WINDOWS:
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler())

        self.zkclient.add_listener(self._zk_listener)
        # this will run self._reset_after_connection_loss
        self.zkclient.start()

        self.task_client = TaskClient(self.children,
                                      self.zkclient,
                                      self._settings)

        self._rest_server = tornado.httpserver.HTTPServer(
            RestServer(self.children, self._settings))

        signal.signal(signal.SIGINT, self._handle_sigint)
        signal.signal(signal.SIGTERM, self._handle_sigint)
        self._log.info('Created Sentinel')

    def __enter__(self):
        """
        Start listening on the REST port.
        :return: this daemon, so `with SentinelDaemon() as d` binds it
        """
        logging.info('Starting Sentinel')
        self._rest_server.listen('9000')
        logging.info('Started Sentinel')
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def stop(self):
        """Terminate all child processes and exit."""
        self._log.info('Stopping Sentinel')
        self._terminate_children()
        self._rest_server.stop()
        self._log.info('Stopped Sentinel. Exiting.')
        sys.exit(0)

    def _handle_sigint(self, sig, frame):
        """Signal handler for SIGINT and SIGTERM: stop the daemon."""
        self._log.info('Caught signal %s.' % sig)
        self.stop()

    @connected
    def _get_settings(self, event=None):
        """
        Populate self._settings dict.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        # re-registers itself as the watch on every read
        data, stat = self.zkclient.get(ZK_AGENT_CONFIG,
                                       watch=self._get_settings)
        self._settings.set_value(json.loads(data))
        self._log.info('Got settings:\n{0}'
                       .format(pprint.pformat(self._settings.value)))

    @catch_exception(NodeExistsException)
    @connected
    def _register(self, event=None):
        """
        Create this host's ephemeral node under the agent state path if it
        does not exist yet.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        agent_state_path = self._settings.get('ZK_AGENT_STATE_PATH')
        path = '/'.join([agent_state_path, self._hostname])
        if not self.zkclient.exists(path, watch=self._register):
            self.zkclient.create(path, value=json.dumps({}), ephemeral=True)

    @connected
    def _get_config_and_run(self, event=None):
        """
        Grab config from Zookeeper. Spawn ChildProcess instances.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        agent_config_path = self._settings.get('ZK_AGENT_CONFIG_PATH')
        config_path = '/'.join([agent_config_path, self._hostname])
        try:
            if not self.zkclient.exists(config_path,
                                        watch=self._get_config_and_run):
                self._log.warning('Node does not exist at: {0}. Creating.'
                                  .format(config_path))
                self.zkclient.create(
                    config_path,
                    value='<?xml version= "1.0"?><Application />')
                return

            data, stat = self.zkclient.get(config_path)
            config = ElementTree.fromstring(data.strip())

            # replace every running child with the newly read config
            self._terminate_children()
            self._spawn_children(config)

        except ParseError as e:
            self._log.error('Incomplete XML config found in path {0}: {1}'
                            .format(config_path, e))
        except ZookeeperError as e:
            self._log.error('ZK server returned a non-zero error code: {0}'
                            .format(e))
        except Exception as e:
            self._log.exception('There were some Exception: {0}'.format(e))

    def _spawn_children(self, config):
        """
        Populate the self.children dictionary
        :type config: xml.etree.ElementTree.Element
        """
        for component in config.iter('Component'):
            try:
                name = verify_attribute(component, 'id')
                self._log.info('Spawning %s' % name)
                self.children[name] = {
                    'config': component,
                    'process': ChildProcess(component,
                                            self._system,
                                            self._settings)
                }
            except ValueError as e:
                self._log.error('Error with ID in config: {0}'.format(e))
                continue

    def _terminate_children(self):
        """
        Stop all children in the self.children dictionary, and clear it.
        """
        self._log.info('Stopping children.')
        for child in self.children.values():
            process = child['process']
            self._log.info('Terminating child -- {0}'.format(process))
            process.stop()

        # sent stop to all, now wait for all to complete
        for child in self.children.values():
            child['process'].join()
        self.children.clear()

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    @connected
    def _reset_after_connection_loss(self):
        """
        Used for spawning child process and resetting watches ZK connection
        changes. This includes the first connection to Zookeeper (on startup).
        """
        self._log.info('Daemon listener callback triggered')
        self._get_settings()
        self._register()
        self._get_config_and_run()
        # still None while __init__ has not yet built the task client
        if self.task_client is not None:
            self.task_client.reset_watches()
        self._log.info('Daemon listener callback complete!')

    def _zk_listener(self, state):
        """
        The callback function that runs when the connection state to
        Zookeeper changes.
        Either passes or immediately spawns a new thread that resets any
        watches, etc., so that it can listen to future connection state
        changes.
        """
        try:
            self._log.info('Zookeeper Connection went from {0} to {1}'
                           .format(self._prev_state, state))
            if (self._prev_state == KazooState.LOST
                    and state == KazooState.CONNECTED):
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif (self._prev_state == KazooState.CONNECTED
                    and state == KazooState.SUSPENDED):
                pass
            elif (self._prev_state == KazooState.CONNECTED
                    and state == KazooState.LOST):
                pass
            elif (self._prev_state == KazooState.SUSPENDED
                    and state == KazooState.LOST):
                pass
            elif (self._prev_state == KazooState.SUSPENDED
                    and state == KazooState.CONNECTED):
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif state == KazooState.CONNECTED:
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            else:
                self._log.info('Zookeeper Connection in unknown state: {0}'
                               .format(state))
                # unknown transitions are not recorded as the previous state
                return
            self._prev_state = state

        except Exception as e:
            self._log.error('Listener excepted out with error: {0}'.format(e))

    def _get_system(self):
        """
        Classify the host from platform.platform().
        :rtype: zoom.common.types.PlatformType
        """
        system_str = platform.platform(terse=True)
        if 'Linux' in system_str:
            return PlatformType.LINUX
        elif 'Windows' in system_str:
            return PlatformType.WINDOWS
        else:
            return PlatformType.UNKNOWN
class PredicateHealth(SimplePredicate):
    """
    Predicate that repeatedly runs an external check command on a background
    thread and sets its 'met' value from the command's return code.
    """
    def __init__(self, comp_name, command, interval, system,
                 operational=False, parent=None):
        """
        :type comp_name: str
        :type command: str
        :type interval: int or float
        :type system: zoom.common.types.PlatformType
        :type operational: bool
        :type parent: str or None
        :raises OSError: if the check executable cannot be found
        """
        SimplePredicate.__init__(self, comp_name,
                                 operational=operational, parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.health'.format(comp_name))
        self.interval = interval
        self.rawcmd = command
        self._runcmd = str()
        self._system = system
        self._verify()

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._log.info('Registered {0}'.format(self))
        self._started = False

    def start(self):
        """Start the check thread unless it has already been started."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Signal the check loop to end and wait for the thread to finish."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _verify(self):
        """
        Build the command to run for the configured platform and make sure
        its executable exists, either at the given path or inside one of the
        directories listed in the PATH environment variable.

        :raises OSError: if the command is empty or its executable is missing
        """
        if self._system == PlatformType.LINUX:
            self._runcmd = shlex.split(self.rawcmd)
        elif self._system == PlatformType.WINDOWS:
            self._runcmd = self.rawcmd
        else:
            self._runcmd = ""

        tokens = shlex.split(self.rawcmd)
        # An empty command has no executable; report it like a missing one.
        exe = tokens[0] if tokens else ''
        exists = bool(exe) and os.path.exists(exe)
        if exe and not exists:
            # os.pathsep is ':' on POSIX and ';' on Windows. A hard-coded ':'
            # would cut Windows entries such as 'C:\\bin' in half.
            for directory in os.environ.get('PATH', '').split(os.pathsep):
                if os.path.exists(os.path.join(directory, exe)):
                    exists = True
                    break

        if not exists:
            err = ('Cannot register check "{0}". The path does not exist.'
                   .format(exe))
            self._log.error(err)
            raise OSError(err)

    def _run(self):
        """
        Run the check as a subprocess and set the 'met' value based on its
        return code. (Non-zero equals failure)

        Output on stderr is logged but does not decide the result on its own.
        """
        p = Popen(self._runcmd, stdout=PIPE, stderr=PIPE)
        _, err = p.communicate()

        if err:
            # Only log here. Setting met to False at this point and then to
            # True below (return code 0) would flap the predicate every run.
            self._log.error('There was some error with the check "{0}"\n{1}'
                            .format(self.rawcmd, err))

        if p.returncode != 0:
            self._log.error('Check "{0}" has failed.'.format(self.rawcmd))
            self.set_met(False)
        else:
            self._log.debug('Check "{0}" has succeeded.'.format(self.rawcmd))
            self.set_met(True)

    def _run_loop(self):
        """Run the check every `interval` seconds until told to stop."""
        # _operate is a ThreadSafeObject, hence the == comparison.
        while self._operate == True:
            self._run()
            sleep(self.interval)
        self._log.info('Done running {0}'.format(self))

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, cmd="{3}", interval={4} '
                'started={5}, operational={6}, met={7})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.rawcmd,
                        self.interval,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        """Equal when type, command and interval all match."""
        return all([
            type(self) == type(other),
            self.rawcmd == getattr(other, 'rawcmd', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return not self.__eq__(other)
class Application(object):
    """
    Service object to represent a deployed service.
    """
    def __init__(self, config, settings, conn, queue, system,
                 application_type):
        """
        :type config: dict (xml)
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type conn: multiprocessing.Connection
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        """
        self.config = config
        self._settings = settings
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))
        # informational attributes
        self._host = platform.node().upper()
        self._fqdn = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None
        self._actions = dict()  # created in _reset_watches on zk connect
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        self._mode = ApplicationMode(ApplicationMode.MANUAL)
        self._state = ThreadSafeObject(ApplicationState.OK)
        self._trigger_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        self._paths = self._init_paths(self.config, settings, application_type)

        # clients
        if self._system == PlatformType.LINUX:
            self.zkclient = KazooClient(
                hosts=ZK_CONN_STRING,
                handler=SequentialThreadingHandler(),
                logger=logging.getLogger('kazoo.app.{0}'.format(self.name)))
        else:
            # Every non-Linux platform gets the plain client, so that
            # self.zkclient is always defined before add_listener() below.
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler())

        self.zkclient.add_listener(self._zk_listener)
        self._proc_client = self._init_proc_client(self.config,
                                                   settings,
                                                   application_type)

        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue, conn)

    @property
    def app_details(self):
        """
        Snapshot of the data published to the state tree.
        :rtype: dict
        """
        return {'name': self.name,
                'host': self._host,
                'fqdn': self._fqdn,
                'platform': self._system,
                'mode': self._mode.value,
                'state': self._state.value,
                'trigger_time': self._trigger_time,
                'login_user': self._login_user}

    def run(self):
        """
        - Start the zookeeper client
        - Check for already running instances.
        - Start main loop, periodically checking whether the process has failed.
        """
        self.zkclient.start()
        # make all action objects start processing predicates
        self._log.info('Starting to process Actions.')
        # explicit loop: map() is lazy under Python 3 and would never call
        for action in self._actions.values():  # start actions
            action.start()
        self._check_mode()  # get global mode AFTER starting actions

        while self._running:
            sleep(5)

        self.uninitialize()

    @catch_exception(NodeExistsError)
    @connected
    def register(self, **kwargs):
        """
        Add entry to the state tree
        """
        if not self.zkclient.exists(self._paths['zk_state_path']):
            if self._action_is_ready('register'):
                self._log.info('Registering %s in state tree.' % self.name)
                self.zkclient.create(self._paths['zk_state_path'],
                                     ephemeral=True,
                                     makepath=True)

                # resolve any pager duty alarms
                self._create_alert_node(AlertActionType.RESOLVE,
                                        AlertReason.RESOLVED)
                # reset restart counters, etc
                self._proc_client.reset_counters()

                self._state.set_value(ApplicationState.OK)
                self._update_agent_node_with_app_details()

    @catch_exception(NoNodeError)
    @connected
    def unregister(self, **kwargs):
        """Remove entry from state tree"""
        if self._action_is_ready('unregister'):
            self._log.info('Un-registering %s from state tree.' % self.name)
            self.zkclient.delete(self._paths['zk_state_path'])

    @catch_exception(RuntimeError)
    def uninitialize(self):
        """
        Gracefully stop this Zookeeper session, then free any resources
        held by the client.
        """
        self._log.info('Stopping Zookeeper client')
        self._work_manager.stop()
        for action in self._actions.values():  # stop actions
            action.stop()
        self.zkclient.stop()
        self.zkclient.close()

    @time_this
    def start(self, **kwargs):
        """
        Start actual process
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :return: 0 when the start is skipped, otherwise the result of the
            process client's start()
        """
        # Same check as self.notify() but needed when start action is
        # called after process crashes and all predicates are met when on Auto
        if not self._proc_client.restart_logic.ran_stop \
                and self._apptype == ApplicationType.APPLICATION:
            self._log.info('Not starting. App was stopped with Zoom.')
            return 0
        else:
            self._log.debug('Start allowed.')

        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()
        pd_enabled = kwargs.get('pd_enabled', True)

        self._trigger_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STARTING)
        self._update_agent_node_with_app_details()

        result = self._proc_client.start()

        if self._run_check_mode:  # Reset to global mode if restart with dep
            self._check_mode()
            self._run_check_mode = False

        if result == 0:
            self._state.set_value(ApplicationState.OK)
        else:
            self._state.set_value(ApplicationState.ERROR)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER,
                                        AlertReason.FAILEDTOSTART)
            else:
                self._log.debug('PD is disabled, not sending alert.')

        self._update_agent_node_with_app_details()
        return result

    @time_this
    def stop(self, **kwargs):
        """
        Stop actual process
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :return: the result of the process client's stop()
        """
        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        self._trigger_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STOPPING)
        self._update_agent_node_with_app_details()

        result = self._proc_client.stop(**kwargs)

        if result != 0 and kwargs.get('argument', 'false') == 'false':
            self._state.set_value(ApplicationState.ERROR)
        else:
            self._state.set_value(ApplicationState.OK)

        sleep(5)  # give everything time to catch up
        self._update_agent_node_with_app_details()
        return result

    def restart(self, **kwargs):
        """
        Clear the action queue and queue stop, unregister and start tasks.
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        # if not self._action_is_ready('restart', allow_undefined=True):
        #     self._log.info('Restart action not ready.')
        #     return

        self._log.info('Running Restart. Queuing stop, unregister, start.')
        self._action_queue.clear()
        self._action_queue.append_unique(Task('stop', kwargs=kwargs))
        self._action_queue.append_unique(Task('unregister'))
        self._action_queue.append_unique(Task('start', kwargs=kwargs))

    def dep_restart(self, **kwargs):
        """Queue a 'start_if_ready' task and flag start() to re-check mode."""
        self._run_check_mode = True  # only used in self.start()
        self._action_queue.append(Task('start_if_ready', pipe=False))

    def start_if_ready(self):
        """Start if the 'start' action is ready, else queue a 'react' task."""
        if self._action_is_ready('start'):
            self.start()
        else:
            self._action_queue.append(Task('react', pipe=False))

    @time_this
    @connected
    def ignore(self, **kwargs):
        """
        Set the mode to manual and publish the change.
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._mode.set_value(ApplicationMode.MANUAL)
        self._log.info('Mode is now "{0}"'.format(self._mode))
        self._update_agent_node_with_app_details()
        return 0

    @time_this
    @connected
    def react(self, **kwargs):
        """
        Set the mode to auto and publish the change.
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._mode.set_value(ApplicationMode.AUTO)
        self._log.info('Mode is now "{0}"'.format(self._mode))
        self._update_agent_node_with_app_details()
        return 0

    @time_this
    @connected
    def notify(self, **kwargs):
        """
        Send notification to zookeeper that a dependency has gone down.
        """
        # Application failed to start. Already sent PD alert
        if self._state == ApplicationState.ERROR:
            return

        pd_enabled = kwargs.get('pd_enabled', True)

        if not self._action_is_ready('notify'):
            self._log.info('notify action not defined or not ready.')
            return

        if not self._proc_client.restart_logic.ran_stop:
            # the application has crashed
            self._state.set_value(ApplicationState.NOTIFY)
            self._update_agent_node_with_app_details()
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER,
                                        AlertReason.CRASHED)
            else:
                self._log.debug('PD is disabled, not sending alert.')
        else:
            self._log.debug("Service shut down gracefully")

    def terminate(self):
        """Terminate child thread/process"""
        self._running = False

    def _action_is_ready(self, action_name, allow_undefined=False):
        """
        Check if a configured action's predicates are met
        :type action_name: str
        :type allow_undefined: bool
        :rtype: bool
        """
        action = self._actions.get(action_name, None)
        if allow_undefined:
            if action is None:
                return True

        return action is not None and action.ready

    @connected
    def _update_agent_node_with_app_details(self, event=None):
        """
        Register app data with the agent in the state tree.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        if self._running and \
                not self.zkclient.exists(self._paths['zk_state_base']):
            self.zkclient.create(self._paths['zk_state_base'])

        data, stat = self.zkclient.get(self._paths['zk_state_base'])

        try:
            agent_apps = json.loads(data)
        except ValueError:
            agent_apps = dict()

        # check for config conflict
        other_host = agent_apps.get('host')
        if other_host is not None and self._host != other_host:
            self._log.error('There is a config conflict with {0}. Updates '
                            'will no longer be sent until it is resolved.'
                            .format(other_host))
            self._state.set_value(ApplicationState.CONFIG_ERROR)

        # make sure data is the most recent
        if self.app_details != agent_apps:
            self.zkclient.set(self._paths['zk_state_base'],
                              json.dumps(self.app_details))
            self._log.debug('Registering app data {0}'
                            .format(self.app_details))

        # set watch
        if self._state != ApplicationState.CONFIG_ERROR:
            self.zkclient.get(
                self._paths['zk_state_base'],
                watch=self._update_agent_node_with_app_details)
        else:
            self._log.error('Shutting down because of config error.')
            self.terminate()

    def _init_paths(self, config, settings, atype):
        """
        Build the Zookeeper paths used by this application.
        :rtype: dict
        """
        paths = dict()
        registrationpath = verify_attribute(config, 'registrationpath',
                                            none_allowed=True)

        if registrationpath is not None:
            paths['zk_state_base'] = registrationpath
        else:
            paths['zk_state_base'] = \
                self._pathjoin(settings.get('ZK_STATE_PATH'), atype, self.name)

        paths['zk_state_path'] = \
            self._pathjoin(paths['zk_state_base'], self._host)
        paths['zk_config_path'] = \
            self._pathjoin(settings.get('ZK_CONFIG_PATH'), atype, self.name)
        paths['zk_agent_path'] = \
            self._pathjoin(settings.get('ZK_AGENT_STATE_PATH'), self._host)

        return paths

    def _init_proc_client(self, config, settings, atype):
        """Create the process client."""
        command = verify_attribute(config, 'command', none_allowed=True)
        script = verify_attribute(config, 'script', none_allowed=True)
        restartmax = verify_attribute(config, 'restartmax', none_allowed=True,
                                      cast=int)

        if restartmax is None:
            self._log.info('Restartmax not specified. Assuming 3.')
            restartmax = 3

        g_names = self._get_graphite_metric_names()

        return ProcessClient(name=self.name,
                             command=command,
                             script=script,
                             apptype=atype,
                             system=self._system,
                             restart_logic=RestartLogic(restartmax),
                             graphite_metric_names=g_names,
                             settings=settings)

    def _init_actions(self, settings):
        """
        Create the actions described by the config.
        :rtype: dict
        """
        action_factory = ActionFactory(component=self,
                                       zkclient=self.zkclient,
                                       proc_client=self._proc_client,
                                       action_queue=self._action_queue,
                                       mode=self._mode,
                                       system=self._system,
                                       pred_list=self._predicates,
                                       settings=settings)
        return action_factory.create(self.config)

    def _init_work_manager(self, queue, pipe):
        """
        Create and start the work manager.
        :rtype: zoom.agent.entities.work_manager.WorkManager
        """
        acceptable_work = dict()
        # actions have additional logic, so use those if available
        for name, action in self._actions.items():
            acceptable_work[name] = action.run

        # if action is not available, add the method from Application
        for w in self._settings.get('ALLOWED_WORK', []):
            if w not in acceptable_work:
                if hasattr(self, w):
                    acceptable_work[w] = getattr(self, w)
                else:
                    self._log.error('Class has no method {0}'.format(w))
            else:
                self._log.debug('Method {0} already assigned to action.'
                                .format(w))

        manager = WorkManager(self.name, queue, pipe, acceptable_work)
        manager.start()
        return manager

    @connected
    def _check_mode(self, event=None):
        """
        Check global run mode for the agents.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        modepath = self._pathjoin(self._settings.get('ZK_GLOBAL_PATH'), 'mode')
        try:
            data, stat = self.zkclient.get(modepath, watch=self._check_mode)
            j = json.loads(data)
            self._log.info('Getting mode from Zookeeper from path: {0}'.
                           format(modepath))
            self._mode.set_value(str(j.get(u'mode', ApplicationMode.MANUAL)))
            self._log.info('Setting mode to "{0}"'.format(self._mode))
            self._update_agent_node_with_app_details()
        except NoNodeError:
            self._log.info('ZK path {0} does not exist. Assuming mode "manual"'
                           .format(modepath))
        except Exception:
            self._log.exception('An uncaught exception has occurred.')

    def _pathjoin(self, *args):
        """
        Helper function to join paths. Uses os.path.join on Linux and plain
        '/' joining on every other platform.
        :rtype: str
        """
        if self._system == PlatformType.LINUX:
            return os.path.join(*args)
        # Never fall through and return None: the result is used as a path.
        return '/'.join(args)

    def _get_graphite_metric_names(self):
        """
        splits the state path at 'application' and returns the latter index
        :rtype: dict
        """
        type_path = self._paths.get('zk_state_base')\
            .split(self._settings.get('ZK_STATE_PATH') + '/', 1)[1]
        type_metric = type_path.replace('/', '.')
        result_path = self._settings.get('GRAPHITE_RESULT_METRIC')
        runtime_path = self._settings.get('GRAPHITE_RUNTIME_METRIC')

        return {
            "result": result_path.format(type_metric),
            "runtime": runtime_path.format(type_metric)
        }

    def _get_current_time(self):
        """
        :rtype: str
        """
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _get_alert_details(self, alert_action, reason):
        """
        Build the payload stored in the alert node.
        :rtype: dict
        """
        return {
            "action": alert_action,
            "service_key": self._pd_svc_key,
            "incident_key": self._pathjoin('sentinel', self.name, self._host),
            "description": ('Sentinel Error: Application {0} {1} on host {2}.'
                            .format(self.name, reason, self._host)),
            "details": ('Sentinel Error: Application {0} {1} on host {2}.\n'
                        'Review the application log and contact the appropriate'
                        ' development group.'
                        .format(self.name, reason, self._host))
        }

    @catch_exception(NoNodeError)
    @connected
    def _create_alert_node(self, alert_action, reason):
        """
        Create Node in ZooKeeper that will result in a PagerDuty alarm
        :type alert_action: zoom.common.types.AlertActionType
        """
        alert_details = self._get_alert_details(alert_action, reason)
        # path example: /foo/sentinel.bar.baz.HOSTFOO
        alert_path = self._pathjoin(
            self._settings.get('ZK_ALERT_PATH'),
            alert_details['incident_key'].replace('/', '.')
        )

        # A missing setting means no environment has alerting enabled.
        enabled_envs = self._settings.get('PAGERDUTY_ENABLED_ENVIRONMENTS', [])
        if self._env in enabled_envs:
            self._log.info('Creating alert "{0}" node for env: {1}'
                           .format(alert_action, self._env))

            if self.zkclient.exists(alert_path):
                self.zkclient.set(alert_path, value=json.dumps(alert_details))
            else:
                self.zkclient.create(alert_path,
                                     value=json.dumps(alert_details))
        else:
            self._log.info('Not creating alert "{0}" node for env: {1}'
                           .format(alert_action, self._env))
            self._log.info('Would have created path {0}'.format(alert_path))

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    def _reset_after_connection_loss(self):
        """
        Recreates all actions and predicates after connection loss.
        Recheck the mode and allowed instances.
        """
        if self._running:
            self._log.info('Application listener callback triggered')
            for action in self._actions.values():  # stop actions
                action.stop()
            self._actions.clear()
            self._predicates = []
            self._actions = self._init_actions(self._settings)
            for predicate in self._predicates:  # reset predicates
                predicate.reset()
            for action in self._actions.values():  # start actions
                action.start()
            self._check_mode()
            self._log.info('Application listener callback complete!')
        else:
            self._log.info('The daemon has called for termination. '
                           'Not trying to reset after connection loss.')

    def _zk_listener(self, state):
        """
        The callback function that runs when the connection state to
        Zookeeper changes.
        Either passes or immediately spawns a new thread that resets any
        watches, etc., so that it can listen to future connection state changes.
        """
        try:
            self._log.info('Zookeeper Connection went from {0} to {1}'
                           .format(self._prev_state, state))
            if self._prev_state is None and state == KazooState.CONNECTED:
                pass
            elif (self._prev_state == KazooState.LOST and
                  state == KazooState.CONNECTED):
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif (self._prev_state == KazooState.CONNECTED and
                  state == KazooState.SUSPENDED):
                pass
            elif (self._prev_state == KazooState.CONNECTED and
                  state == KazooState.LOST):
                pass
            elif (self._prev_state == KazooState.SUSPENDED and
                  state == KazooState.LOST):
                pass
            elif (self._prev_state == KazooState.SUSPENDED and
                  state == KazooState.CONNECTED):
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif state == KazooState.CONNECTED:
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            else:
                self._log.info('Zookeeper Connection in unknown state: {0}'
                               .format(state))
                # the previous state is deliberately left untouched here
                return
            self._prev_state = state

        except Exception:
            self._log.exception('An uncaught exception has occurred')

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return ("{0}(name={1}, runmode={2}, actions={3})"
                .format(self.__class__.__name__,
                        self.name,
                        self._mode,
                        self._actions.keys())
                )
class PredicateWeekend(SimplePredicate):
    """
    Predicate whose 'met' value is refreshed on a background thread and is
    true while today is one of Weekdays.SATURDAY / Weekdays.SUNDAY.
    """
    def __init__(self, comp_name, operational=False, parent=None, interval=10):
        """
        :type comp_name: str
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name,
                                 operational=operational, parent=parent)
        self.interval = interval
        self._log = logging.getLogger('sent.{0}.weekend'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False

    @property
    def weekday(self):
        """
        Today's weekday as returned by datetime.date.weekday().
        :rtype: int
            0=Monday, 1=Tuesday, etc.
        """
        today = datetime.date.today()
        return today.weekday()

    def start(self):
        """Start the background thread unless it was already started."""
        if self._started is not False:
            self._log.debug('Already started {0}'.format(self))
            return

        self._log.debug('Starting {0}'.format(self))
        self._started = True
        self._thread.start()
        self._block_until_started()

    def stop(self):
        """Ask the loop to end and wait for the background thread."""
        if self._started is not True:
            self._log.debug('Already stopped {0}'.format(self))
            return

        self._log.info('Stopping {0}'.format(self))
        self._started = False
        self._operate.set_value(False)
        self._thread.join()
        self._log.info('{0} stopped'.format(self))

    def _run_loop(self):
        """Re-evaluate the predicate every `interval` seconds."""
        # _operate is a ThreadSafeObject, hence the == comparison.
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done checking for weekend.')

    def _process_met(self):
        """Set 'met' according to whether today is a weekend day."""
        is_weekend = self.weekday in [Weekdays.SATURDAY, Weekdays.SUNDAY]
        self.set_met(is_weekend)

    def __repr__(self):
        template = ('{0}(component={1}, parent={2}, started={3}, '
                    'operational={4}, met={5})')
        return template.format(self.__class__.__name__,
                               self._comp_name,
                               self._parent,
                               self.started,
                               self._operational,
                               self._met)

    def __eq__(self, other):
        """All weekend predicates are interchangeable."""
        return type(self) == type(other)

    def __ne__(self, other):
        return type(self) != type(other)
class ZookeeperGoodUntilTime(SimplePredicate):
    """
    Predicate that reads optional 'start' and 'stop' times from the JSON data
    of a Zookeeper node and is met while the current time lies between them.
    """
    def __init__(self, comp_name, settings, zkclient, nodepath,
                 parent=None, interval=5):
        """
        :type comp_name: str
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type zkclient: kazoo.client.KazooClient
        :type nodepath: str
        :type parent: str or None
        :type interval: int or float
        """
        # NOTE(review): `settings` is passed positionally here, whereas the
        # other predicates in this file pass `operational=` by keyword.
        # Confirm it lands on the intended SimplePredicate parameter.
        SimplePredicate.__init__(self, comp_name, settings, parent=parent)
        self.node = nodepath
        self.zkclient = zkclient
        self.interval = interval
        self._start = None
        self._stop = None
        self._log = logging.getLogger('sent.{0}.pred.gut'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

        # Optional "YYYY-MM-DD " date, then "HH:MM", then optional ":SS".
        # Raw strings keep the backslash escapes intact for the regex engine.
        self._datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?"
        )

    @property
    def current_time(self):
        """
        :rtype: datetime.time
        """
        return datetime.datetime.now().time()

    @property
    def current_datetime(self):
        """
        :rtype: datetime.datetime
        """
        return datetime.datetime.now()

    def start(self):
        """Read the node once, then start the periodic comparison thread."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._watch_node()
            self._thread.start()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Signal the loop to end and wait for the thread to finish."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """Re-evaluate the predicate every `interval` seconds."""
        # _operate is a ThreadSafeObject, hence the == comparison.
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done comparing guts.')

    def _process_met(self):
        """
        Set 'met' to True only if every configured bound holds
        (start < now and now < stop). With no bounds at all, 'met' is False.
        """
        results = []
        if self._start is not None:
            compare_to_start = self._get_comparison(self._start)
            results.append(self._start < compare_to_start)

        if self._stop is not None:
            compare_to_stop = self._get_comparison(self._stop)
            results.append(compare_to_stop < self._stop)

        if not results:
            results.append(False)

        self.set_met(all(results))  # every comparison returned True

    def _create_dt_dict(self, datetime_string):
        """
        Split a date/time string into its integer components. Components
        missing from the string are None; no match yields an empty dict.
        :type datetime_string: str
        :rtype: dict
        """
        regex_dict = dict()
        match = re.search(self._datetime_regex, datetime_string)
        if match:
            regex_dict = dict(
                (key, int(value) if value is not None else None)
                for key, value in match.groupdict().items()
            )

        self._log.debug('dt_dict returning {0}'.format(regex_dict))
        return regex_dict

    def _get_comparison(self, obj):
        """
        Return "now" in the same type as `obj`, so the two can be compared.
        :rtype: datetime.datetime or datetime.time or None
        """
        if isinstance(obj, datetime.datetime):
            return self.current_datetime
        elif isinstance(obj, datetime.time):
            return self.current_time

    def _get_datetime_object(self, data):
        """
        Parse `data` into a datetime (when a date is present) or a time
        (when only hour and minute are present).
        :type data: str
        :rtype: datetime.datetime or datetime.time or None
        """
        dt_object = None
        dt_dict = self._create_dt_dict(data)
        try:
            # All of year, month and day are not None
            if all(dt_dict.get(i, None) is not None
                   for i in ('year', 'month', 'day')):
                dt_object = datetime.datetime(year=dt_dict['year'],
                                              month=dt_dict['month'],
                                              day=dt_dict['day'],
                                              hour=dt_dict['hour'],
                                              minute=dt_dict['minute'],
                                              microsecond=0)
            # both hour and minute are not None
            elif all(dt_dict.get(i, None) is not None
                     for i in ('hour', 'minute')):
                dt_object = datetime.time(hour=dt_dict['hour'],
                                          minute=dt_dict['minute'],
                                          microsecond=0)
            else:
                self._log.error('data "{0}" did not match regex'.format(data))

            if dt_object is not None and dt_dict.get('second') is not None:
                # datetime/time objects are immutable: replace() returns a
                # new object, so the result has to be assigned back.
                dt_object = dt_object.replace(second=dt_dict['second'])
        except (ValueError, TypeError) as ex:
            self._log.error('Problem with parsing data "{0}": {1}'
                            .format(data, ex))

        return dt_object

    def _parse_data(self, gut_data):
        """
        Update the start/stop bounds from the node's decoded JSON data.
        :type gut_data: dict
        """
        start_data = gut_data.get(u'start', None)
        self._log.debug('raw start from zk is "{0}"'.format(start_data))
        if start_data is not None:
            self._start = self._get_datetime_object(start_data)

        stop_data = gut_data.get(u'stop', None)
        self._log.debug('raw stop from zk is "{0}"'.format(stop_data))
        if stop_data is not None:
            self._stop = self._get_datetime_object(stop_data)

        if start_data is None and stop_data is None:
            self._log.error('Start and Stop time not specified!')

        self._log.info('The current time is: {0}. Start time is: {1}. '
                       'Stop time is: {2}'
                       .format(self.current_time, self._start, self._stop))

    @connected
    def _watch_node(self, event=None):
        """
        Read the node (setting watches that call this method again) and
        re-evaluate the predicate.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        try:
            exists = self.zkclient.exists(self.node, watch=self._watch_node)
            if exists:
                data, stat = self.zkclient.get(self.node,
                                               watch=self._watch_node)
                j = json.loads(data)
                self._parse_data(j)
            else:
                self._log.info('No gut node was found. Watcher is set at {0}'
                               .format(self.node))
        except ValueError as ex:
            self._log.error('Invalid GUT JSON object: {0}'.format(ex))
        finally:
            self._process_met()

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, start="{3}", stop="{4}", '
                'zkpath={5}, started={6}, met={7})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self._start,
                        self._stop,
                        self.node,
                        self.started,
                        self._met))

    def __eq__(self, other):
        """Equal when both the type and the watched node path match."""
        return all([
            type(self) == type(other),
            self.node == getattr(other, 'node', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.node != getattr(other, 'node', None)
        ])
class PredicateHoliday(SimplePredicate):
    """
    Predicate that is met when today's date (formatted YYYYMMDD) is among the
    child node names found under a Zookeeper path.
    """
    def __init__(self, comp_name, zkclient, path,
                 operational=False, parent=None, interval=10):
        """
        :type comp_name: str
        :type zkclient: kazoo.client.KazooClient
        :type path: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name,
                                 operational=operational, parent=parent)
        self.zkclient = zkclient
        self.interval = interval
        self.path = path
        self._log = logging.getLogger('sent.{0}.holiday'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False
        self._holidays = list()

    @property
    def date_string(self):
        """
        :rtype: str
            Example: 20140101
        """
        return datetime.date.today().strftime('%Y%m%d')

    def start(self):
        """Read the holiday node once, then start the periodic check."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._watch_node()
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Signal the loop to end and wait for the thread to finish."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """Re-evaluate the predicate every `interval` seconds."""
        # _operate is a ThreadSafeObject, hence the == comparison.
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done checking for holiday.')

    def _process_met(self):
        """Set 'met' according to whether today is a listed holiday."""
        self.set_met(self.date_string in self._holidays)

    @connected
    def _watch_node(self, event=None):
        """
        Refresh the holiday list from the children of `path` (setting watches
        that call this method again) and re-evaluate the predicate.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        if self.path is None:
            self._log.warning('No zookeeper path given. This predicate will'
                              ' never be met.')
            return

        exists = self.zkclient.exists(self.path, watch=self._watch_node)
        if exists:
            self._holidays = self.zkclient.get_children(self.path,
                                                        watch=self._watch_node)
            self._log.info('Got holidays {0}'.format(self._holidays))
            self._process_met()
        else:
            self._log.info('No holiday node was found. Watcher is set at {0}'
                           .format(self.path))

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, started={3}, '
                'operational={4}, met={5})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        """Holiday predicates compare equal by type alone."""
        return type(self) == type(other)

    def __ne__(self, other):
        return type(self) != type(other)
class PredicateWeekend(SimplePredicate):
    """
    Predicate whose "met" value is refreshed on a background thread and is
    true while today is one of Weekdays.SATURDAY / Weekdays.SUNDAY.
    """

    def __init__(self, comp_name, operational=False, parent=None, interval=10):
        """
        :type comp_name: str
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent
        )
        self.interval = interval
        self._log = logging.getLogger("sent.{0}.weekend".format(comp_name))
        self._log.info("Registered {0}".format(self))

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False

    @property
    def weekday(self):
        """
        Today's weekday as returned by datetime.date.weekday().
        :rtype: int
            0=Monday, 1=Tuesday, etc.
        """
        today = datetime.date.today()
        return today.weekday()

    def start(self):
        """Start the background thread unless it was already started."""
        if self._started is not False:
            self._log.debug("Already started {0}".format(self))
            return

        self._log.debug("Starting {0}".format(self))
        self._started = True
        self._thread.start()
        self._block_until_started()

    def stop(self):
        """Ask the loop to end and wait for the background thread."""
        if self._started is not True:
            self._log.debug("Already stopped {0}".format(self))
            return

        self._log.info("Stopping {0}".format(self))
        self._started = False
        self._operate.set_value(False)
        self._thread.join()
        self._log.info("{0} stopped".format(self))

    def _run_loop(self):
        """Re-evaluate the predicate every `interval` seconds."""
        # _operate is a ThreadSafeObject, hence the == comparison.
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info("Done checking for weekend.")

    def _process_met(self):
        """Set "met" according to whether today is a weekend day."""
        is_weekend = self.weekday in [Weekdays.SATURDAY, Weekdays.SUNDAY]
        self.set_met(is_weekend)

    def __repr__(self):
        template = (
            "{0}(component={1}, parent={2}, started={3}, "
            "operational={4}, met={5})"
        )
        return template.format(
            self.__class__.__name__,
            self._comp_name,
            self._parent,
            self.started,
            self._operational,
            self._met,
        )

    def __eq__(self, other):
        """All weekend predicates are interchangeable."""
        return type(self) == type(other)

    def __ne__(self, other):
        return type(self) != type(other)
class Application(object):
    """
    Service object to represent a deployed service.

    Owns the Zookeeper client, the process client and the configured
    actions for one application, and exposes the public operations
    (start, stop, restart, register, ...) that the work manager can run.
    """
    def __init__(self, config, settings, queue, system, application_type,
                 cancel_flag):
        """
        :type config: dict (xml)
        :type settings: dict
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        :type cancel_flag:
            zoom.agent.entities.thread_safe_object.ThreadSafeObject
        """
        self.config = config
        self._settings = settings
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))

        # informational attributes
        self._host = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None
        self._actions = dict()  # populated by _init_actions further down
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type
        self._restart_on_crash = \
            verify_attribute(self.config, 'restart_on_crash',
                             none_allowed=True)
        self._post_stop_sleep = verify_attribute(self.config,
                                                 'post_stop_sleep',
                                                 none_allowed=True,
                                                 cast=int,
                                                 default=5)

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        self._mode = ApplicationMode(
            ApplicationMode.MANUAL,
            callback=self._update_agent_node_with_app_details)
        self._state = ThreadSafeObject(
            ApplicationState.OK,
            callback=self._update_agent_node_with_app_details)
        self._start_stop_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        self._user_set_in_react = False
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        restartmax = verify_attribute(config, 'restartmax',
                                      none_allowed=True, cast=int, default=3)
        self._rl = RestartLogic(
            self.name,
            restartmax,
            count_callback=self._update_agent_node_with_app_details)

        self._read_only = False

        # _paths must exist before _init_proc_client, which derives the
        # graphite metric names from it.
        self._paths = self._init_paths(self.config, settings,
                                       application_type)

        # clients
        self.zkclient = KazooClient(hosts=get_zk_conn_string(),
                                    timeout=60.0,
                                    handler=SequentialThreadingHandler(),
                                    logger=logging.getLogger(
                                        'kazoo.app.{0}'.format(self.name)))
        self.zkclient.add_listener(self._zk_listener)
        self._proc_client = self._init_proc_client(self.config,
                                                   application_type,
                                                   cancel_flag)

        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue)

    def app_details(self):
        """
        Snapshot of this application's state as a JSON-serialisable dict.

        :rtype: dict
        """
        return {
            'name': self.name,
            'host': self._host,
            'platform': self._system,
            'mode': self._mode.value,
            'state': self._state.value,
            'start_stop_time': self._start_stop_time,
            'login_user': self._login_user,
            'read_only': self._read_only,
            'restart_count': self._rl.count
        }

    def run(self):
        """
        - Start the zookeeper client
        - Start every configured action
        - Read the global mode, then idle until ``terminate`` is called
        - Tear down via ``uninitialize``

        Any exception raised along the way is logged as critical and
        swallowed.
        """
        try:
            self.zkclient.start()
            # make all action objects start processing predicates
            self._log.info('Starting to process Actions.')
            # Explicit loop: map() is lazy on Python 3 and would start nothing.
            for action in self._actions.values():
                action.start()

            started = all([i.started for i in self._actions.values()])
            if not started:
                self._log.critical('All actions are not started!')
            else:
                self._log.info('All actions started.')

            self._check_mode()  # get global mode AFTER starting actions

            while self._running:
                sleep(5)

            self.uninitialize()
        except Exception as ex:
            self._log.critical('There was an exception in the main loop. '
                               'In a bad state. ({0})'.format(ex))

    @catch_exception(NodeExistsError)
    @connected
    def register(self, **kwargs):
        """
        Add entry to the state tree

        Creates the ephemeral state node when it does not exist yet and the
        action named by ``action_name`` (default ``'register'``) is ready.
        On success it also resolves outstanding alerts, resets the restart
        counters and moves the state to STARTED.
        """
        action_name = kwargs.get('action_name', 'register')

        if not self.zkclient.exists(self._paths['zk_state_path']):
            if self._action_is_ready(action_name):
                self._log.info('Registering %s in state tree.' % self.name)
                self.zkclient.create(self._paths['zk_state_path'],
                                     ephemeral=True,
                                     makepath=True)

                # resolve any pager duty alarms
                self._create_alert_node(AlertActionType.RESOLVE,
                                        AlertReason.RESOLVED)
                # reset restart counters, etc
                self._proc_client.reset_counters()

                self._state.set_value(ApplicationState.STARTED)
            else:
                self._log.info(
                    'Action {0} is not ready. Not registering.'.format(
                        action_name))
        else:
            self._log.info('Already registered (node exists).')

        return 0

    @catch_exception(NoNodeError)
    @connected
    def unregister(self, **kwargs):
        """
        Remove entry from state tree

        The node is only deleted when the action named by ``action_name``
        (default ``'unregister'``) is ready.
        """
        action_name = kwargs.get('action_name', 'unregister')

        if self._action_is_ready(action_name):
            self._log.info('Un-registering %s from state tree.' % self.name)
            self.zkclient.delete(self._paths['zk_state_path'])

        return 0

    @catch_exception(RuntimeError)
    def uninitialize(self):
        """
        Gracefully stop this Zookeeper session, then free any resources
        held by the client.

        Also stops the work manager and every action, and empties the
        shared predicate list in place.
        """
        self._log.info('Stopping Zookeeper client')
        self._work_manager.stop()
        for action in self._actions.values():  # stop actions
            action.stop()
        del self._predicates[:]  # make sure we delete old predicates
        self.zkclient.stop()
        self.zkclient.close()
        return 0

    @time_this
    def start(self, **kwargs):
        """
        Start actual process

        :param kwargs: passed from zoom.handlers.control_agent_handlers
            Keys read here: ``reset`` (default True), ``pause`` (default
            False), ``pd_enabled`` (default True), ``login_user``
            (default 'Zoom').
        :return: 0 when the start was refused, otherwise the result of the
            process client's ``start()``
        """
        # Restart from UI: ran_stop=True, stay_down=False
        # Stop from UI: ran_stop=True, stay_down=True
        # Crash: ran_stop=False, stay_down=False
        if self._proc_client.restart_logic.ran_stop \
                and self._proc_client.restart_logic.stay_down \
                and self._apptype == ApplicationType.APPLICATION:
            self._log.info('Not starting. App was stopped with Zoom.')
            # set to OK just in case we're staggered
            self._state.set_value(ApplicationState.OK)
            return 0
        elif self._proc_client.restart_logic.crashed and \
                not self._restart_on_crash:
            self._log.info('Not starting. The application has crashed.')
            self._state.set_value(ApplicationState.NOTIFY)
            return 0
        else:
            self._log.debug('Start allowed.')

        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()
        pd_enabled = kwargs.get('pd_enabled', True)

        self._start_stop_time = self._get_current_time()

        # set login user if not set in react
        if not self._user_set_in_react:
            self._login_user = kwargs.get('login_user', 'Zoom')

        self._state.set_value(ApplicationState.STARTING)

        result = self._proc_client.start()

        if self._run_check_mode:  # Reset to global mode if restart with dep
            self._check_mode()
            self._run_check_mode = False

        if result == 0 or result == ApplicationStatus.CANCELLED:
            self._state.set_value(ApplicationState.STARTED)
        else:
            self._state.set_value(ApplicationState.ERROR)
            # Only a failed start raises a PagerDuty alert.
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER,
                                        AlertReason.FAILEDTOSTART)
            else:
                self._log.debug('PD is disabled, not sending alert.')

        return result

    @time_this
    def stop(self, **kwargs):
        """
        Stop actual process

        :param kwargs: Passed from:
            zoom.www.handlers.control_agent_handler.ControlAgentHandler,
            zoom.agent.action.action.Action
            Keys read here: ``reset`` (default True), ``pause`` (default
            False), ``login_user`` (default 'Zoom'); all kwargs are also
            forwarded to the process client's ``stop()``.
        :return: the result of the process client's ``stop()``
        """
        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        self._start_stop_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')

        self._state.set_value(ApplicationState.STOPPING)

        result = self._proc_client.stop(**kwargs)

        if result != ApplicationStatus.CANCELLED:
            # give everything time to catch up, not sure why anymore...
            self._log.info(
                'Sleeping for the configured {0}s after stop.'.format(
                    self._post_stop_sleep))
            sleep(self._post_stop_sleep)

        # reset this value back to False
        self._user_set_in_react = False

        if result == ApplicationStatus.CANCELLED:
            self._state.set_value(ApplicationState.STOPPED)
        elif result != 0:
            self._state.set_value(ApplicationState.ERROR)
        else:
            self._state.set_value(ApplicationState.STOPPED)

        return result

    def status(self):
        """
        Log out the status of each configured action.

        :rtype: str
        """
        out = '\n'
        out += '#' * 40 + ' STATUS ' + '#' * 40
        out += '\n{0}'.format(self)
        out += '\n'
        for i in self._actions.values():
            out += '\n{0}'.format(i.status)
        out += '\n'
        out += '#' * 40 + ' STATUS ' + '#' * 40
        out += '\n'

        self._log.info(out)
        return out

    def restart(self, **kwargs):
        """
        Clear the action queue and queue stop, unregister and start tasks.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._log.info('Running Restart. Queuing stop, unregister, start.')
        self._action_queue.clear()
        self._action_queue.append_unique(Task('stop', kwargs=kwargs))
        self._action_queue.append_unique(Task('unregister'))
        self._action_queue.append_unique(Task('start', kwargs=kwargs))
        return 0

    def dep_restart(self, **kwargs):
        """
        Queue a ``start_if_ready`` task and flag that the next ``start``
        must re-read the global mode.
        """
        self._run_check_mode = True  # only used in self.start()
        self._action_queue.append(Task('start_if_ready', kwargs=kwargs))
        return 0

    def start_if_ready(self, **kwargs):
        """
        Run the start action if it is ready, start directly when no start
        action is configured, otherwise queue a ``react`` task.
        """
        start_action = self._actions.get('start', None)

        if start_action is not None and start_action.ready:
            start_action.run(**kwargs)
        # if start action doesn't exist, a.k.a. read only
        elif start_action is None:
            self.start(**kwargs)
        else:
            self._action_queue.append(Task('react', kwargs=kwargs))
        return 0

    @time_this
    @connected
    def ignore(self, **kwargs):
        """
        Switch this application to manual mode.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._mode.set_value(ApplicationMode.MANUAL)
        self._log.info('Mode is now "{0}"'.format(self._mode))
        return 0

    @time_this
    @connected
    def react(self, **kwargs):
        """
        Switch this application to auto mode and record the login user.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._mode.set_value(ApplicationMode.AUTO)
        self._log.info('Mode is now "{0}"'.format(self._mode))

        # when react is called through "restart with dependencies" command
        self._user_set_in_react = True
        self._login_user = kwargs.get('login_user', 'Zoom')
        return 0

    @time_this
    @connected
    def notify(self, **kwargs):
        """
        Send notification based on arbitrary predicates

        Keys read from kwargs: ``action_name`` (default 'notify'),
        ``pd_enabled`` (default True), ``pd_reason`` (default
        AlertReason.CRASHED).

        :return: 1 when the action is not ready, otherwise 0
        """
        action_name = kwargs.get('action_name', 'notify')
        pd_enabled = kwargs.get('pd_enabled', True)
        pd_reason = kwargs.get('pd_reason', None)
        if pd_reason is None:
            pd_reason = AlertReason.CRASHED

        if not self._action_is_ready(action_name):
            self._log.info('notify action not defined or not ready.')
            return 1

        self._state.set_value(ApplicationState.NOTIFY)
        if pd_enabled:
            self._create_alert_node(AlertActionType.TRIGGER, pd_reason)
        else:
            self._log.debug('PD is disabled, not sending alert.')
        return 0

    @time_this
    @connected
    def ensure_running(self, **kwargs):
        """
        Essentially a clone of `notify`, but tailored for process monitoring.

        An alert is only raised when the process client reports that the
        application was not stopped deliberately (``ran_stop`` is false).
        """
        # Application failed to start. Already sent PD alert
        if self._state == ApplicationState.ERROR:
            return 1

        action_name = kwargs.get('action_name', 'ensure_running')
        pd_enabled = kwargs.get('pd_enabled', True)
        pd_reason = kwargs.get('pd_reason', None)
        if pd_reason is None:
            pd_reason = AlertReason.CRASHED

        if not self._action_is_ready(action_name):
            self._log.info('ensure_running action not defined or not ready.')
            # NOTE(review): returns None here whereas notify() returns 1 in
            # the same situation; left untouched because callers are not
            # visible from this file.
            return

        if not self._proc_client.restart_logic.ran_stop:
            # the application has crashed
            self._state.set_value(ApplicationState.NOTIFY)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER, pd_reason)
            else:
                self._log.debug('PD is disabled, not sending alert.')
        else:
            self._log.debug("Service shut down gracefully")

        return 0

    def terminate(self):
        """Terminate child thread/process"""
        # run() polls this flag and calls uninitialize() once it is False.
        self._running = False
        return 0

    def _action_is_ready(self, action_name, allow_undefined=False):
        """
        Check if a configured action's predicates are met

        :type action_name: str
        :type allow_undefined: bool
        :param allow_undefined: treat a missing action as ready
        :rtype: bool
        """
        action = self._actions.get(action_name, None)

        if allow_undefined:
            if action is None:
                return True

        return action is not None and action.ready

    @catch_exception(NoNodeError)
    @connected
    def _update_agent_node_with_app_details(self, event=None):
        """
        Register app data with the agent in the state tree.

        Used as the change callback of the mode, state and restart counter
        and as the Zookeeper watch on the state base node.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        if self._running and \
                not self.zkclient.exists(self._paths['zk_state_base']):
            self.zkclient.create(self._paths['zk_state_base'], makepath=True)

        data, stat = self.zkclient.get(self._paths['zk_state_base'])

        try:
            agent_apps = json.loads(data)
        except ValueError:
            # Node holds no valid JSON yet.
            agent_apps = dict()

        # check for config conflict
        other_host = agent_apps.get('host')
        if other_host is not None and self._host != other_host:
            self._log.error(
                'There is a config conflict with {0}. Updates '
                'will no longer be sent until it is resolved.'.format(
                    other_host))
            # run_callback=False: the callback is this very method.
            self._state.set_value(ApplicationState.CONFIG_ERROR,
                                  run_callback=False)

        # make sure data is the most recent
        if self.app_details() != agent_apps:
            self.zkclient.set(self._paths['zk_state_base'],
                              json.dumps(self.app_details()))
            self._log.debug('Registering app data {0}'.format(
                self.app_details()))

        # set watch
        if self._state != ApplicationState.CONFIG_ERROR:
            self.zkclient.get(
                self._paths['zk_state_base'],
                watch=self._update_agent_node_with_app_details)
        else:
            self._log.error('Shutting down because of config error.')
            self.terminate()

    def _init_paths(self, config, settings, atype):
        """
        Build the Zookeeper paths used by this application.

        :rtype: dict
        """
        zk_settings = settings.get('zookeeper', {})
        paths = dict()
        paths['zk_state_base'] = verify_attribute(
            config,
            'registrationpath',
            none_allowed=True,
            default=self._pathjoin(zk_settings.get('state'),
                                   atype, self.name))

        paths['zk_state_path'] = \
            self._pathjoin(paths['zk_state_base'], self._host)
        paths['zk_config_path'] = \
            self._pathjoin(zk_settings.get('config'), atype, self.name)
        paths['zk_agent_path'] = \
            self._pathjoin(zk_settings.get('agent_state'), self._host)

        return paths

    def _init_proc_client(self, config, atype, cancel_flag):
        """Create the process client."""
        start_cmd = verify_attribute(config, 'start_cmd', none_allowed=True)
        stop_cmd = verify_attribute(config, 'stop_cmd', none_allowed=True)
        status_cmd = verify_attribute(config, 'status_cmd', none_allowed=True)
        script = verify_attribute(config, 'script', none_allowed=True)

        g_names = self._get_graphite_metric_names()

        return ProcessClient(name=self.name,
                             start_cmd=start_cmd,
                             stop_cmd=stop_cmd,
                             status_cmd=status_cmd,
                             script=script,
                             apptype=atype,
                             restart_logic=self._rl,
                             graphite_metric_names=g_names,
                             cancel_flag=cancel_flag)

    def _init_actions(self, settings):
        """
        Create the actions from the config and refresh the read-only flag.

        :rtype: dict
        """
        action_factory = ActionFactory(component=self,
                                       zkclient=self.zkclient,
                                       proc_client=self._proc_client,
                                       action_queue=self._action_queue,
                                       mode=self._mode,
                                       system=self._system,
                                       pred_list=self._predicates,
                                       app_state=self._state,
                                       settings=settings)

        actions = action_factory.create(self.config)
        self._determine_read_only(actions)
        return actions

    def _determine_read_only(self, actions):
        """
        Set ``_read_only`` from the start/restart actions.

        The application is read-only when neither action is configured, or
        when every configured one of the two is disabled.

        :type actions: dict
        """
        start_action = actions.get('start', None)
        restart_action = actions.get('restart', None)

        if not start_action and not restart_action:
            self._log.warning(
                'Sentinel config contains neither start nor restart predicates, assuming readonly'
            )
            self._read_only = True
            return

        # One or both actions exist: read-only only if all of them are
        # disabled.
        configured = [a for a in (start_action, restart_action) if a]
        self._read_only = all(a.disabled for a in configured)

    def _init_work_manager(self, queue):
        """
        Map task names to callables and start the work manager.

        :rtype: zoom.agent.entities.work_manager.WorkManager
        """
        acceptable_work = dict()

        # actions have additional logic, so use those if available
        for k, v in self._actions.items():
            acceptable_work[k] = v.run

        # if action is not available, add public methods
        for attribute in [a for a in dir(self) if not a.startswith('_')]:
            obj = getattr(self, attribute)
            if hasattr(obj, '__call__'):
                if attribute not in acceptable_work:
                    acceptable_work[attribute] = obj
                else:
                    self._log.debug(
                        'Method {0} already assigned to action.'.format(
                            attribute))

        manager = WorkManager(self.name, queue, acceptable_work)
        manager.start()
        return manager

    @connected
    def _check_mode(self, event=None):
        """
        Check global run mode for the agents.

        Also re-arms itself as the watch on the global mode node.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        global_path = self._settings.get('zookeeper', {}).get('global_config')
        if global_path is None:
            self._log.warning('Received no global config path. Zoom will be '
                              'unable to change the global mode.')
            return

        modepath = self._pathjoin(global_path, 'mode')

        try:
            data, stat = self.zkclient.get(modepath, watch=self._check_mode)
            j = json.loads(data)
            self._log.info(
                'Getting mode from Zookeeper from path: {0}'.format(modepath))
            self._mode.set_value(str(j.get(u'mode', ApplicationMode.MANUAL)))
            self._log.info('Setting mode to "{0}"'.format(self._mode))
        except NoNodeError:
            self._log.info(
                'ZK path {0} does not exist. Assuming mode "manual"'.format(
                    modepath))
        except Exception:
            self._log.exception('An uncaught exception has occurred.')

    def _pathjoin(self, *args):
        """
        Helper function to join paths. Uses string joining if it is a
        Windows box.

        Returns None implicitly for any other platform value.

        :rtype: str
        """
        if self._system == PlatformType.LINUX:
            return os.path.join(*args)
        elif self._system == PlatformType.WINDOWS:
            return '/'.join(args)

    def _get_graphite_metric_names(self):
        """
        Take the part of the state base path that follows the configured
        zookeeper state prefix, turn '/' into '.', and substitute it into
        the graphite templates from the settings.

        :rtype: dict
        """
        names = {"result": None, "runtime": None, "updown": None}

        state_prefix = self._settings.get('zookeeper', {}).get('state') + '/'
        type_path = self._paths.get('zk_state_base').split(state_prefix, 1)[1]
        type_metric = type_path.replace('/', '.')

        graphite = self._settings.get('graphite')
        if graphite is not None:
            result_path = str(graphite.get('result'))
            runtime_path = str(graphite.get('runtime'))
            updown_path = str(graphite.get('updown'))

            names["result"] = result_path.format(type_metric)
            names["runtime"] = runtime_path.format(type_metric)
            names["updown"] = updown_path.format(type_metric)

        return names

    def _get_current_time(self):
        """
        Current local time formatted as ``YYYY-MM-DD HH:MM:SS``.

        :rtype: str
        """
        # Local, aliased import: elsewhere in this file ``datetime`` is used
        # as the module (datetime.datetime / datetime.date), so a bare
        # ``datetime.now()`` would raise AttributeError.
        import datetime as _datetime
        return _datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _get_alert_details(self, alert_action, reason):
        """
        Build the alert payload stored in the alert node.

        :rtype: dict
        """
        return {
            "action": alert_action,
            "service_key": self._pd_svc_key,
            "incident_key": self._pathjoin('sentinel', self.name, self._host),
            "description": ('Sentinel Error: name={0}, host={1}, issue="{2}".'
                            .format(self.name, self._host, reason)),
            "details": ('Sentinel Error: name={0}, host={1}, issue="{2}".\n'
                        'Review the application log and contact the appropriate'
                        ' development group.'.format(self.name, self._host,
                                                     reason))
        }

    @catch_exception(NoNodeError)
    @connected
    def _create_alert_node(self, alert_action, reason):
        """
        Create Node in ZooKeeper that will result in a PagerDuty alarm

        The node is only written when the current environment is listed in
        the pagerduty ``enabled_environments`` setting; otherwise the path
        that would have been used is just logged.

        :type alert_action: zoom.common.types.AlertActionType
        """
        alert_details = self._get_alert_details(alert_action, reason)
        # path example: /foo/sentinel.bar.baz.HOSTFOO
        alert = self._settings.get('zookeeper', {}).get('alert')
        if alert is None:
            self._log.warning('Was given no alert path. This sentinel will be '
                              'unable to forward alerts to Zoom.')
            return

        alert_path = self._pathjoin(
            alert, re.sub('/', '.', alert_details['incident_key']))

        enabled_envs = self._settings.get('pagerduty', {}).get(
            'enabled_environments', [])
        if self._env in enabled_envs:
            self._log.info('Creating alert "{0}" node for env: {1}'.format(
                alert_action, self._env))

            if self.zkclient.exists(alert_path):
                self.zkclient.set(alert_path, value=json.dumps(alert_details))
            else:
                self.zkclient.create(alert_path,
                                     value=json.dumps(alert_details))
        else:
            self._log.info('Not creating alert "{0}" node for env: {1}'.format(
                alert_action, self._env))
            self._log.info('Would have created path {0}'.format(alert_path))

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    def _reset_after_connection_loss(self):
        """
        Recreates all actions and predicates after connection loss.
        Recheck the mode and allowed instances.
        """
        if self._running:
            self._log.info('Application listener callback triggered')
            for action in self._actions.values():  # stop actions
                action.stop()
            self._actions.clear()
            # Fresh list: _init_actions hands it to the new action factory.
            self._predicates = []
            self._actions = self._init_actions(self._settings)
            for predicate in self._predicates:  # reset predicates
                predicate.reset()
            for action in self._actions.values():  # start actions
                action.start()
            self._check_mode()
            self._log.info('Application listener callback complete!')
        else:
            self._log.info('The daemon has called for termination. '
                           'Not trying to reset after connection loss.')

    def _zk_listener(self, state):
        """
        The callback function that runs when the connection state to
        Zookeeper changes.
        Either passes or immediately spawns a new thread that resets any
        watches, etc., so that it can listen to future connection state
        changes.
        """
        try:
            self._log.info('Zookeeper Connection went from {0} to {1}'.format(
                self._prev_state, state))
            if self._prev_state is None and state == KazooState.CONNECTED:
                pass
            elif self._prev_state == KazooState.LOST \
                    and state == KazooState.CONNECTED:
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif self._prev_state == KazooState.CONNECTED \
                    and state == KazooState.SUSPENDED:
                pass
            elif self._prev_state == KazooState.CONNECTED \
                    and state == KazooState.LOST:
                pass
            elif self._prev_state == KazooState.SUSPENDED \
                    and state == KazooState.LOST:
                pass
            elif self._prev_state == KazooState.SUSPENDED \
                    and state == KazooState.CONNECTED:
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif state == KazooState.CONNECTED:
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            else:
                self._log.info(
                    'Zookeeper Connection in unknown state: {0}'.format(state))
                # Unknown transitions do not update _prev_state.
                return
            self._prev_state = state

        except Exception as ex:
            self._log.exception('An uncaught exception has occurred in the '
                                'listener: {0}'.format(ex))

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return ("{0}(name={1}, runmode={2})".format(self.__class__.__name__,
                                                    self.name, self._mode))