def __init__(self, root, host_ip=None):
    """Set up the application environment rooted at *root*.

    Resolves the well-known subdirectory paths, determines the host IP
    address (looked up from the local hostname when not supplied),
    attaches the watchdog manager and creates the runtime directories.

    :param root:
        Root directory of the environment.
    :param host_ip:
        Optional explicit host IP address.
    """
    self.root = root

    # Derive every well-known subdirectory from the root.
    for attr, leaf in (
            ('apps_dir', self.APPS_DIR),
            ('watchdog_dir', self.WATCHDOG_DIR),
            ('running_dir', self.RUNNING_DIR),
            ('cache_dir', self.CACHE_DIR),
            ('cleanup_dir', self.CLEANUP_DIR),
            ('app_events_dir', self.APP_EVENTS_DIR),
            ('metrics_dir', self.METRICS_DIR),
            ('archives_dir', self.ARCHIVES_DIR),
            ('init_dir', self.INIT_DIR),
            ('pending_cleanup_dir', self.PENDING_CLEANUP_DIR)):
        setattr(self, attr, os.path.join(self.root, leaf))

    # Use the caller-provided IP when given, otherwise resolve it from
    # the local hostname.
    if host_ip is None:
        host_ip = socket.gethostbyname(socket.gethostname())
    self.host_ip = host_ip

    self.watchdogs = watchdog.Watchdog(self.watchdog_dir)

    # Make sure the runtime directories exist.
    # NOTE(review): init_dir and pending_cleanup_dir are not created
    # here — confirm they are provisioned elsewhere.
    for required_dir in (self.apps_dir, self.watchdog_dir,
                         self.running_dir, self.cache_dir,
                         self.cleanup_dir, self.app_events_dir,
                         self.metrics_dir, self.archives_dir):
        fs.mkdir_safe(required_dir)
def __init__(self, root):
    """Resolve all well-known directories below *root*.

    Only computes paths and wires up the watchdog manager; no
    directories are created here.

    :param root:
        Root directory of the environment.
    """
    self.root = root

    # Top-level layout, one attribute per well-known subdirectory.
    for attr, leaf in (
            ('apps_dir', self.APPS_DIR),
            ('bin_dir', self.BIN_DIR),
            ('watchdog_dir', self.WATCHDOG_DIR),
            ('running_dir', self.RUNNING_DIR),
            ('cache_dir', self.CACHE_DIR),
            ('cleaning_dir', self.CLEANING_DIR),
            ('cleanup_dir', self.CLEANUP_DIR),
            ('cleanup_apps_dir', self.CLEANUP_APPS_DIR),
            ('configs_dir', self.CONFIG_DIR),
            ('app_events_dir', self.APP_EVENTS_DIR),
            ('archives_dir', self.ARCHIVES_DIR),
            ('images_dir', self.IMAGES_DIR),
            ('init_dir', self.INIT_DIR),
            ('init1_dir', self.INIT1_DIR),
            ('tombstones_dir', self.TOMBSTONES_DIR)):
        setattr(self, attr, os.path.join(self.root, leaf))

    # Tombstone subdirectories, one per supervised scope.
    self.cleanup_tombstone_dir = os.path.join(
        self.tombstones_dir, self.CLEANUP_DIR)
    self.running_tombstone_dir = os.path.join(
        self.tombstones_dir, self.RUNNING_DIR)
    self.init_tombstone_dir = os.path.join(
        self.tombstones_dir, self.INIT_DIR)

    self.watchdogs = watchdog.Watchdog(self.watchdog_dir)
def __init__(self, root):
    """Resolve the node layout under *root* and create runtime dirs.

    :param root:
        Root directory of the environment.
    """
    self.root = root

    # Derive every well-known subdirectory from the root.
    layout = (
        ('apps_dir', self.APPS_DIR),
        ('watchdog_dir', self.WATCHDOG_DIR),
        ('running_dir', self.RUNNING_DIR),
        ('cache_dir', self.CACHE_DIR),
        ('cleanup_dir', self.CLEANUP_DIR),
        ('configs_dir', self.CONFIG_DIR),
        ('app_events_dir', self.APP_EVENTS_DIR),
        ('metrics_dir', self.METRICS_DIR),
        ('archives_dir', self.ARCHIVES_DIR),
        ('images_dir', self.IMAGES_DIR),
        ('init_dir', self.INIT_DIR),
        ('pending_cleanup_dir', self.PENDING_CLEANUP_DIR),
    )
    for attr, leaf in layout:
        setattr(self, attr, os.path.join(self.root, leaf))

    self.watchdogs = watchdog.Watchdog(self.watchdog_dir)

    # Make sure the runtime directories exist.
    # NOTE(review): images_dir and pending_cleanup_dir are not created
    # here — confirm they are provisioned elsewhere.
    for required_dir in (self.apps_dir, self.watchdog_dir,
                         self.running_dir, self.cache_dir,
                         self.cleanup_dir, self.configs_dir,
                         self.app_events_dir, self.metrics_dir,
                         self.archives_dir, self.init_dir):
        fs.mkdir_safe(required_dir)
def setUp(self):
    """Create a scratch watchdog directory populated with sample entries."""
    self.root = tempfile.mkdtemp()
    self.watchdog = watchdog.Watchdog(self.root)

    # Seed the directory with files of various names and mtimes.
    entries = (('.tmp', 0), ('foo', 10), ('bar_30s', 15), ('baz#lala', 40))
    for entry_name, age in entries:
        entry_path = os.path.join(self.root, entry_name)
        with open(entry_path, 'w') as entry_file:
            entry_file.write(entry_name)
        # Backdate both atime and mtime to the requested age.
        os.utime(entry_path, (age, age))

    # A stray subdirectory alongside the regular entries.
    os.mkdir(os.path.join(self.root, 'food'))
def run(self, watchdogs_dir, *impl_args, **impl_kwargs):
    """Run the service.

    The run procedure will first initialize the service's implementation,
    then setup the service's watchdog, and start the service resource
    resynchronization procedure.

    This procedure is in 4 phases to handle both fresh starts and
    restarts:

    * Call the implementation's :function:`initialize` function which
      allows the implementation to query and import the backend
      resource's state.
    * Setup the service request watcher.
    * Import all existing requests (passing them to the
      :function:`on_created` implementation's handler).
    * Call the implementation's :function:`synchronize` function which
      expunges anything allocated against the backend resource that
      doesn't have a matching request anymore.

    The implementation is expected to implement two handlers:

    * :function:`on_created` that handles new resource requests or
      updates to an existing resource request (implementation is
      expected to be idempotent).
    * :function:`on_deleted` that handles deletion of resource requests.
      It should properly handle the case where the backend resource is
      already gone.

    :param ``str`` watchdogs_dir:
        Path to the watchdogs directory.
    :param ``tuple`` impl_args:
        Arguments passed to the implementation's constructor.
    :param ``dict`` impl_kwargs:
        Keyword arguments passed to the implementation's constructor.
    """
    # Load the implementation (lazily, so subclasses can override it).
    if self._service_class is None:
        self._service_class = self._load_impl()
    impl = self._service_class(*impl_args, **impl_kwargs)

    # Setup the watchdog: the lease times out after the implementation's
    # heartbeat interval.
    watchdogs = watchdog.Watchdog(os.path.realpath(watchdogs_dir))
    watchdog_lease = watchdogs.create(
        name='svc-{svc_name}'.format(svc_name=self.name),
        timeout='{hb:d}s'.format(hb=impl.WATCHDOG_HEARTBEAT_SEC),
        content='Service %r failed' % self.name
    )

    self._run(impl, watchdog_lease)

    # FIX: corrected 'Shuting' typo in the shutdown log message.
    _LOGGER.info('Shutting down %r service', self.name)

    # Remove the service heartbeat
    watchdog_lease.remove()
def _watcher(root_dir, rules_dir, containers_dir, watchdogs_dir):
    """Treadmill Firewall rule watcher.

    Watches the rules directory and applies/removes iptables rules as
    rule files are created and deleted, maintaining a reference count of
    passthrough source IPs.  Heartbeats a watchdog lease on every loop
    iteration.  This function does not return under normal operation.

    :param root_dir: base directory; the three other arguments are
        joined onto it.
    """
    rules_dir = os.path.join(root_dir, rules_dir)
    containers_dir = os.path.join(root_dir, containers_dir)
    watchdogs_dir = os.path.join(root_dir, watchdogs_dir)

    # Setup the watchdog (timeout is twice the heartbeat interval so one
    # missed heartbeat does not trip it).
    watchdogs = watchdog.Watchdog(watchdogs_dir)
    wd = watchdogs.create(
        'svc-{svc_name}'.format(svc_name='firewall_watcher'),
        '{hb:d}s'.format(hb=_FW_WATCHER_HEARTBEAT * 2),
        'Service firewall watcher failed'
    )

    rulemgr = rulefile.RuleMgr(rules_dir, containers_dir)
    # Reference count of passthrough rules per source IP; the ipset
    # entry is only removed when the count drops to zero.
    passthrough = {}

    def on_created(path):
        """Invoked when a network rule is created."""
        rule_file = os.path.basename(path)
        _LOGGER.info('adding %r', rule_file)
        # The rule is the filename
        chain_rule = rulemgr.get_rule(rule_file)
        if chain_rule is not None:
            chain, rule = chain_rule
            iptables.add_rule(rule, chain=chain)
            if isinstance(rule, fw.PassThroughRule):
                # Bump the per-IP refcount for this passthrough.
                passthrough[rule.src_ip] = (
                    passthrough.setdefault(rule.src_ip, 0) + 1
                )
                _LOGGER.info('Adding passthrough %r', rule.src_ip)
                iptables.add_ip_set(iptables.SET_PASSTHROUGHS, rule.src_ip)
                # Drop any existing conntrack state for this IP so the
                # new rule takes effect immediately.
                iptables.flush_pt_conntrack_table(rule.src_ip)
        else:
            _LOGGER.warning('Ignoring unparseable rule %r', rule_file)

    def on_deleted(path):
        """Invoked when a network rule is deleted."""
        # Edge case, if the directory where the rules are kept gets removed,
        # abort
        if path == rulemgr.path:
            _LOGGER.critical('Network rules directory was removed: %r',
                             path)
            utils.sys_exit(1)

        # The rule is the filename
        rule_file = os.path.basename(path)
        _LOGGER.info('Removing %r', rule_file)
        chain_rule = rulemgr.get_rule(rule_file)
        if chain_rule is not None:
            chain, rule = chain_rule
            iptables.delete_rule(rule, chain=chain)
            if isinstance(rule, fw.PassThroughRule):
                if passthrough[rule.src_ip] == 1:
                    # Remove the IPs from the passthrough set
                    passthrough.pop(rule.src_ip)
                    _LOGGER.info('Removing passthrough %r', rule.src_ip)
                    iptables.rm_ip_set(iptables.SET_PASSTHROUGHS,
                                       rule.src_ip)
                    iptables.flush_pt_conntrack_table(rule.src_ip)
                else:
                    # Other rules still reference this IP; just decrement.
                    passthrough[rule.src_ip] -= 1
        else:
            _LOGGER.warning('Ignoring unparseable file %r', rule_file)

    _LOGGER.info('Monitoring fw rules changes in %r', rulemgr.path)
    watch = dirwatch.DirWatcher(rulemgr.path)
    watch.on_created = on_created
    watch.on_deleted = on_deleted

    # Minimal initialization of the all chains and sets
    _init_rules()

    # now that we are watching, prime the rules
    current_rules = rulemgr.get_rules()

    # Bulk apply rules
    _configure_rules(current_rules)
    # Seed the passthrough refcounts and ipset entries from the
    # pre-existing rules.
    for _chain, rule in current_rules:
        if isinstance(rule, fw.PassThroughRule):
            passthrough[rule.src_ip] = (
                passthrough.setdefault(rule.src_ip, 0) + 1
            )
            # Add the IPs to the passthrough set
            _LOGGER.info('Adding passthrough %r', rule.src_ip)
            iptables.add_ip_set(iptables.SET_PASSTHROUGHS, rule.src_ip)

    _LOGGER.info('Current rules: %r', current_rules)
    while True:
        if watch.wait_for_events(timeout=_FW_WATCHER_HEARTBEAT):
            # Process no more than 5 events between heartbeats
            watch.process_events(max_events=5)

        rulemgr.garbage_collect()
        wd.heartbeat()

    # NOTE(review): unreachable — the while-True loop above never breaks.
    _LOGGER.info('service shutdown.')
    wd.remove()
def run(self, watchdogs_dir, *impl_args, **impl_kwargs):
    """Run the service.

    Loads the implementation, arms a watchdog lease, replays existing
    requests as created events, synchronizes the backend, and then
    polls for I/O, directory-watch and implementation events until the
    service is marked dead.

    :param ``str`` watchdogs_dir:
        Path to the watchdogs directory.
    :param ``tuple`` impl_args:
        Arguments passed to the implementation's constructor.
    :param ``dict`` impl_kwargs:
        Keyword arguments passed to the implementation's constructor.
    """
    # Load the implementation
    if self._service_class is None:
        self._service_class = self._load_impl()
    impl = self._service_class(*impl_args, **impl_kwargs)

    # Setup the watchdog
    watchdogs = watchdog.Watchdog(os.path.realpath(watchdogs_dir))
    watchdog_lease = watchdogs.create(
        name='svc-{svc_name}'.format(svc_name=self.name),
        timeout='{hb:d}s'.format(hb=impl.WATCHDOG_HEARTBEAT_SEC),
        content='Service %r failed' % self.name
    )

    # Create the status socket
    ss = self._create_status_socket()

    # Run initialization
    impl.initialize(self._dir)

    watcher = dirwatch.DirWatcher(self._rsrc_dir)
    # Call all the callbacks with the implementation instance
    watcher.on_created = functools.partial(self._on_created, impl)
    watcher.on_deleted = functools.partial(self._on_deleted, impl)
    # NOTE: A modified request is treated as a brand new request
    watcher.on_modified = functools.partial(self._on_created, impl)
    self._io_eventfd = eventfd.eventfd(0, eventfd.EFD_CLOEXEC)

    # Before starting, check the request directory
    svcs = self._check_requests()
    # and "fake" a created event on all the existing requests
    for existing_svcs in svcs:
        self._on_created(impl, existing_svcs)

    # Before starting, make sure backend state and service state are
    # synchronized.
    impl.synchronize()

    # Report service status
    status_info = {}
    status_info.update(impl.report_status())

    # Setup the poll object
    loop_poll = select.poll()
    loop_callbacks = {}

    # Handlers that are always registered: queued I/O events, inotify
    # events from the request watcher, and status-socket queries.
    base_event_handlers = [
        (
            self._io_eventfd,
            select.POLLIN,
            functools.partial(
                self._handle_queued_io_events,
                watcher=watcher,
                impl=impl,
            )
        ),
        (
            watcher.inotify,
            select.POLLIN,
            functools.partial(
                self._handle_io_events,
                watcher=watcher,
                impl=impl,
            )
        ),
        (
            ss,
            select.POLLIN,
            functools.partial(
                self._publish_status,
                status_socket=ss,
                status_info=status_info,
            )
        ),
    ]

    # Initial collection of implementation' event handlers
    impl_event_handlers = impl.event_handlers()

    self._update_poll_registration(
        loop_poll,
        loop_callbacks,
        base_event_handlers + impl_event_handlers,
    )

    # Poll at half the heartbeat interval so a single slow iteration
    # cannot miss the watchdog deadline.
    loop_timeout = impl.WATCHDOG_HEARTBEAT_SEC // 2
    while not self._is_dead:
        # Check for events
        updated = self._run_events(
            loop_poll,
            loop_timeout,
            loop_callbacks,
        )

        if updated:
            # Report service status
            status_info.clear()
            status_info.update(impl.report_status())

            # Update poll registration if needed
            impl_event_handlers = impl.event_handlers()
            self._update_poll_registration(
                loop_poll, loop_callbacks,
                base_event_handlers + impl_event_handlers,
            )

        # Clean up stale requests
        self._check_requests()

        # Heartbeat
        watchdog_lease.heartbeat()

    # FIX: corrected 'Shuting' typo in the shutdown log message.
    _LOGGER.info('Shutting down %r service', self.name)

    # Remove the service heartbeat
    watchdog_lease.remove()
def version_monitor(approot, command):
    """Runs node version monitor.

    Publishes the checksum of the installed code under the server's
    version node in Zookeeper, then waits; when that node is deleted an
    upgrade is requested, the upgrade command is run and the process
    exits so the supervisor restarts it on the new code.

    :param approot:
        Treadmill application root directory.
    :param command:
        Upgrade command to run when a new version is requested.
    """
    cli_cmd = list(command)
    _LOGGER.info('Initializing code monitor: %r', cli_cmd)

    watchdogs = watchdog.Watchdog(
        os.path.join(
            approot,
            appenv.AppEnvironment.WATCHDOG_DIR,
        ))

    # Die if the Zookeeper connection is ever lost.
    context.GLOBAL.zk.conn.add_listener(zkutils.exit_on_lost)

    # Wait until the cell masters have created the version node.
    while not context.GLOBAL.zk.conn.exists(z.VERSION):
        # FIX: Logger.warn is a deprecated alias — use warning().
        _LOGGER.warning('%r node not created yet. Cell masters running?',
                        z.VERSION)
        time.sleep(30)

    hostname = sysinfo.hostname()
    version_path = z.path.version(hostname)

    codepath = os.path.realpath(utils.rootdir())
    digest = versionmgr.checksum_dir(codepath).hexdigest()
    _LOGGER.info('codepath: %s, digest: %s', codepath, digest)

    info = {
        'codepath': codepath,
        'since': int(time.time()),
        'digest': digest,
    }

    zkutils.put(context.GLOBAL.zk.conn, version_path, info)

    @context.GLOBAL.zk.conn.DataWatch(version_path)
    @exc.exit_on_unhandled
    def _watch_version(_data, _stat, event):
        """Force exit if server node is deleted."""
        # If the node is deleted, we exit to pick up new version code.
        if event is not None and event.type == 'DELETED':
            # The version info not present, restart services and register
            # new checksum.
            _LOGGER.info('Upgrade requested, running: %s', cli_cmd)

            if cli_cmd:
                try:
                    subproc.check_call(cli_cmd)
                    # Record successful upgrade.
                except subprocess.CalledProcessError:
                    _LOGGER.exception('Upgrade failed.')
                    # Immediately trigger a watchdog timeout
                    watchdogs.create(
                        name='version_monitor',
                        timeout='0s',
                        content='Upgrade to '
                                '{code!r}({digest}) failed'.format(
                                    code=codepath,
                                    digest=digest),
                    ).heartbeat()

            # NOTE(review): reached on both success and failure of the
            # upgrade command — confirm exiting after a failed upgrade
            # (with the 0s watchdog armed) is the intended flow.
            del info['digest']
            _LOGGER.info('Upgrade complete.')
            utils.sys_exit(0)

        return True

    # Sleep forever; the DataWatch callback above does all the work.
    # FIX: removed an unreachable 'service shutdown.' log statement that
    # followed this non-terminating loop.
    while True:
        time.sleep(100000)