def _check_livecd(_log):
    """Detect whether we booted from live-cd and keep the marker file in sync.

    Writes the live-cd marker when running from live-cd; otherwise removes
    any stale marker.  Returns the boolean live-cd status.
    """
    livecd = _is_livecd()
    if not livecd:
        # Installed system: make sure a stale marker does not linger around.
        run_command(['/bin/rm', '-f', constants.LIVECD_MARKER_FILE], retval=runcommand.FAIL)
    else:
        helpers.write_datetime_marker_file(constants.LIVECD_MARKER_FILE)
    return livecd
def _update_admin_timestamp(self):
    """Touch the Web UI admin-active timestamp when Ajax polling is live.

    Crontab uses the timestamp for graph redraw optimizations (draw graphs
    frequently while an admin session, i.e. Ajax polling, is active).
    """
    # XXX: use actual session info; does not work now because of client-side
    # javascript timeout
    polling_active = self.master._ajax_helper.have_status_change_waiters()
    if polling_active:
        helpers.write_datetime_marker_file(constants.WEBUI_ADMIN_ACTIVE_TIMESTAMP)
def _update_admin_timestamp(self):
    """Record admin activity (Ajax polling) so cron can optimize graph redraws."""
    # XXX: use actual session info; does not work now because of client-side
    # javascript timeout
    if not self.master._ajax_helper.have_status_change_waiters():
        return
    helpers.write_datetime_marker_file(constants.WEBUI_ADMIN_ACTIVE_TIMESTAMP)
def _check_memory(_log):
    """Check that memory size is over minimum if we are booting from live-cd.

    Note: the minimum size of 256M to even boot Ubuntu is already checked by
    initramfs scripts when the system boots to live-cd mode but not when the
    installed system is booted.

    Raises:
        Exception: if system memory is below constants.MEM_MINSIZE.
    """
    # Clear a possibly stale lowmem marker first; it is rewritten below if
    # still relevant.
    run_command(['/bin/rm', '-f', constants.LOWMEM_MARKER_FILE], retval=runcommand.FAIL)

    # Read the memory size once so the comparison and the error message are
    # guaranteed to agree (the original called _memsize() twice, which could
    # report a different value in the message than the one actually checked).
    memsize = _memsize()
    if memsize < constants.MEM_MINSIZE:
        raise Exception('System memory size is too small (%sMb < %sMb).' % (str(memsize/1024), str(constants.MEM_MINSIZE/1024)))

    if _is_livecd_lowmem():
        helpers.write_datetime_marker_file(constants.LOWMEM_MARKER_FILE)
def _write_marker_file(msg):
    """Write the update-result timestamp marker file; raise on failure.

    XXX: msg is not used for anything here ??
    """
    result = helpers.write_datetime_marker_file(constants.UPDATE_RESULT_MARKER_FILE)
    if result is None:
        raise Exception('failed to write update timestamp marker')
def run_minutely():
    """Minutely cron entry point: watchdog check, then graph and SNMP updates.

    Ordering in this function is deliberate; see comments below.
    """
    # We want to do the cron watchdog check immediately for two reasons
    # First, we don't want to do any RDF operations before we're happy
    # with the watchdog check, as we may get stuck otherwise.  Second,
    # we don't want to do any log operations either, as opening the log
    # device when the syslog daemon is missing causes trouble.  Hence,
    # we run watchdog checks, and try to log any interesting things
    # after a possible watchdog action has been initiated.
    #
    # Also, sqlite-based RDF database may cause some trouble here.  Watchdog
    # check is not dependent on RDF, but graph update is and may block for a
    # long time if a lock is held by someone.  Further, sqlite does not ensure
    # fairness for those competing for database access, so we may starve or
    # become a bit hungry here.
    #
    # However, cron happily starts multiple instances of this script in parallel
    # (once every minute), so even if we deadlock to RDF database, the watchdog
    # here will successfully reboot.

    # XXX: _check_ui_health() is currently not fixed to work with it, but
    # this would ideally happen after the _check_ui_health() call
    _setup_logging()

    if helpers.is_live_cd():
        _log.info('live cd, skipping cron action')
        return

    _check_ui_health()

    # XXX: graphs update may take time in which case we do not want to start
    # new update again while the old is still running
    if os.path.exists(constants.CRON_MINUTELY_RUNNING_MARKERFILE):
        _log.warning('detected cron run_minutely still running, stopping before actual operations')
        return

    try:
        # Marker file guards against overlapping runs; removed in finally below.
        helpers.write_datetime_marker_file(constants.CRON_MINUTELY_RUNNING_MARKERFILE)

        # Graph and SNMP updates are independent best-effort operations:
        # a failure in one must not prevent the other.
        try:
            _update_graphs(quick=False)
        except:
            _log.exception('graph update failed')

        try:
            _update_snmp()
        except:
            _log.exception('snmp update failed')
    finally:
        if os.path.exists(constants.CRON_MINUTELY_RUNNING_MARKERFILE):
            os.unlink(constants.CRON_MINUTELY_RUNNING_MARKERFILE)
def _firstboot_config(_log):
    """Perform first-boot configuration plus per-boot reconfiguration.

    The first branch runs only once (guarded by CONFIGURED_MARKER): gnome
    config, installation UUID creation, and RRD file creation.  The rest
    (gdm, firefox, UUID double-check, desktop icons) runs on every boot.
    """
    if not os.path.exists(constants.CONFIGURED_MARKER):
        from codebay.l2tpserver import gnomeconfig

        is_livecd = _is_livecd()

        # Gnome config is fragile: only done in first boot after install
        # XXX: if update needs to change gnome config, this must be refactored/rewritten in the future
        gnomeconfig.gnomeconfig_firstboot(is_livecd)

        _create_installation_uuid(_log, firstboot=True)

        # No RRD when lowmem.
        if not _is_livecd_lowmem():
            from codebay.l2tpserver import graphs
            g = graphs.Graphs()
            g.create_rrd_files()

        # Marker written last so a failed first boot retries the whole branch.
        helpers.write_datetime_marker_file(constants.CONFIGURED_MARKER)

    try:
        from codebay.l2tpserver import gdmconfig
        # Overwrite gdm config in every boot to ensure that it is done after update
        gdmconfig.run(autologin=True, autologin_user=constants.ADMIN_USER_NAME)
    except:
        _log.exception('failed to (re)configure gdm, ignoring.')

    try:
        from codebay.l2tpserver import firefoxconfig
        # Overwrite firefox config in every boot to ensure it is done after update
        firefoxconfig.set_firefox_configuration('/home/%s' % constants.ADMIN_USER_NAME)
    except:
        _log.exception('failed to (re)configure firefox, ignoring.')

    # Note: doublecheck on every boot.
    _create_installation_uuid(_log, firstboot=False)

    # (Re)create desktop icon which also enables autostart
    #
    # Two .desktop files are required:
    #   - ~/Desktop/vpnease.desktop is shown on the user desktop,
    #     but has no autostart
    #   - ~/.config/autostart/vpnease-autostart.desktop causes
    #     autostart but is not shown on the desktop
    _recreate_desktop_and_autostart()
def _do_timesync(self, utc_time):
    """Sync system time towards utc_time (capped), notifying on large residual jumps.

    utc_time is presumably a naive UTC datetime from the management server
    — TODO confirm against caller.  Best-effort: all failures are logged
    and swallowed.
    """
    # XXX: if time step is too large, we need to take some measures here to prevent twisted problems
    # (Twisted does not handle time jumps correctly; if time goes backwards, timers become frozen).
    # One relatively benign fix here would be to reschedule all critical timers, such as the watchdog.
    # A more brute force fix would be to restart the web UI.
    try:
        full_sync = timesync.update_system_time(
            utc_time,
            cap_backwards=constants.WEBUI_TIMESYNC_CAP_BACKWARDS,
            cap_forwards=constants.WEBUI_TIMESYNC_CAP_FORWARDS,
        )
        if full_sync:
            helpers.write_datetime_marker_file(constants.WEBUI_LAST_TIMESYNC_FILE)
        else:
            # probably capped, don't write marker because it would enable RRD with bogus system time
            _log.info("timesync not full (probably capped), not writing webui timesync file")

        # notify if time difference is still too large (jump was capped too heavily)
        now = datetime.datetime.utcnow()
        timediff = utc_time - now
        # absolute value of the difference
        if timediff < datetime.timedelta(0, 0, 0):
            timediff = -timediff
        if (timediff > constants.WEBUI_TIMESYNC_NOTIFY_LIMIT) and (not self._timesync_notify_shown):
            try:
                from codebay.l2tpserver import gnomehelpers
                gnomehelpers.show_notification(
                    constants.WEBUI_TIMESYNC_NOTIFY_TITLE,
                    constants.WEBUI_TIMESYNC_NOTIFY_TEXT,
                    timeout=constants.WEBUI_TIMESYNC_NOTIFY_TIMEOUT,
                    critical=False,
                )
                # NB: it is important to do this after show_notification(); if, for instance,
                # boot is in progress, the initial notification will fail and a notification
                # will only be shown on reidentify.  Not ideal, but at least the notification
                # will not be completely hidden.
                self._timesync_notify_shown = True
            except:
                _log.exception("_do_timesync(): failed to show notify of time jump")
    except:
        _log.exception("time sync failed")
def run_watchdog(self): """Run the Web UI master watchdog (and other periodic checks) and take action if necessary. Web UI is the top-level watchdog element in the product. This function checks that the system is up and running in an acceptable manner, and if not, take action (reboot typically). In particular, this function checks that if the runner should be active and it is not (process has exited or is stuck), corrective action is taken. In addition to this actual watchdog checking, this function also does other periodic checks and respective actions. In Live CD the watchdog never runs, so no watch warnings or actions will be shown either. """ # increase rounds self._watchdog_rounds += 1 _log.debug('run_watchdog(), round %d' % self._watchdog_rounds) # update watchdog-last-update file try: helpers.write_datetime_marker_file( constants.WEBUI_WATCHDOG_LAST_UPDATED_FILE) except: _log.exception('cannot update webui watchdog file') # skip tests on livecd if self.master.is_live_cd(): _log.debug('watchdog on live cd, skipping') return # not live cd, normal watchdog try: self._watchdog_checks() except: # this should not happen, at least persistently _log.exception('watchdog execution failed, ignoring') # periodic reboot check try: self._periodic_reboot_check() except: _log.exception('periodic reboot check failed, ignoring') # update admin active file, if ajax polling is active (needed by graph # redraw optimizations in crontab) try: self._update_admin_timestamp() except: _log.exception('admin active file update failed, ignoring') # database export check try: self._database_export_check() except: _log.exception('database export check failed, ignoring') # info log even in success case _log.info('webui watchdog completed, strikes %d, round %d' % (self._watchdog_strikes, self._watchdog_rounds))
def _reboot():
    """Reboot system after web UI health check failure.

    Only reboot if the boot timestamp is old enough to ensure that (a) we
    don't panic when the system is just starting up, and (b) if there is
    a reboot loop, slow it down to something manageable.

    Resets the failure count even when reboot is not done so that later
    errors would not cause immediate reboot after.
    """
    try:
        if os.path.exists(constants.BOOT_TIMESTAMP_FILE):
            f = open(constants.BOOT_TIMESTAMP_FILE, 'r')
            boottime = datatypes.parse_datetime_from_iso8601_subset(f.readline().strip())
            _log.info('_reboot: last boot timestamp: %s' % boottime)
            currenttime = datetime.datetime.utcnow()
            _log.info('_reboot: current time %s' % currenttime)
            runtime = currenttime - boottime
            _log.info('_reboot: running time: %s (limit: %s)' % (runtime, constants.CRON_BOOTTIME_FAILURE_WAIT))
            # Negative runtime means the clock jumped backwards; in that case
            # the "too soon" suppression does not apply and we reboot.
            if runtime < constants.CRON_BOOTTIME_FAILURE_WAIT and runtime > constants.ZERO_TIMEDELTA:
                _log.info('_reboot: too soon to reboot despite UI health check failures')
                _reset_failure_count()
                return
    except:
        _log.exception('_reboot: failed to check boot timestamp, rebooting anyway')

    _log.warning('_reboot: rebooting due to UI health check failures')
    _reset_failure_count()

    # Launch a notify in the background: may or may not work
    try:
        run_command(['%s watchdognotify 1>/dev/null 2>/dev/null &' % constants.CMD_L2TPGW_CRON], shell=True)
    except:
        _log.exception('failed to launch notify in the background')

    # Simulate normal update behaviour and force fsck on next
    # boot
    try:
        if os.path.exists(constants.UPDATE_FORCE_MARKER_FILE):
            os.unlink(constants.UPDATE_FORCE_MARKER_FILE)
        if os.path.exists(constants.UPDATE_SKIP_MARKER_FILE):
            os.unlink(constants.UPDATE_SKIP_MARKER_FILE)
        if os.path.exists(constants.FASTBOOT_MARKER_FILE):
            os.unlink(constants.FASTBOOT_MARKER_FILE)
        helpers.write_datetime_marker_file(constants.FORCE_FSCK_MARKER_FILE)
    except:
        _log.exception('marker file cleanup failed, but we reboot anyway')

    # Give some time for notify to show
    try:
        _log.info('sleeping %s seconds before rebooting' % constants.CRON_WATCHDOG_REBOOT_DELAY)
        time.sleep(constants.CRON_WATCHDOG_REBOOT_DELAY)
    except:
        _log.exception('failed to sleep in cron watchdog action, ignoring')

    # Flush syslog to disk
    try:
        _log.info('flushing syslog to disk')
        pid = int(open(constants.SYSLOG_PIDFILE, 'rb').read().strip())  # do not care of fd leak here
        os.kill(pid, signal.SIGHUP)
        time.sleep(1)
    except:
        _log.exception('failed to flush syslog, ignoring')

    # Sync disk and force reboot: this avoids possible start/stop
    # system script blocking
    try:
        run_command(constants.CMD_SYNC)
        run_command(constants.CMD_SYNC)
        run_command(constants.CMD_SYNC)
    except:
        _log.exception('failed to do filesystem sync, ignoring')

    run_command([constants.CMD_REBOOT, '-f', '-d'])
def run_update(self):
    """Run the full product update via aptitude/dpkg.

    Returns an integer exit code consumed by the caller:
      0  = update not run, repository version matches current version
      2  = update done with success
      3  = failed to import helper modules
      4-9, 32 = various failure modes (see `result` assignments below)
    """
    # Ensure no questions asked
    os.environ['DEBIAN_FRONTEND'] = 'noninteractive'

    # XXX: is this ok?
    # os.environ['DEBIAN_PRIORITY'] = 'critical'
    # os.environ['DEBCONF_NOWARNINGS'] = 'yes'

    # result is None on success; otherwise [exit_code, message].
    result = None
    old_version = None
    old_version_cached = None
    new_version = None
    new_version_cached = None

    # Tolerate logger errors
    try:
        from codebay.common import logger
        _log = logger.get('l2tpserver.update.update')
    except:
        _log = None

    try:
        from codebay.l2tpserver import constants
        from codebay.l2tpserver import runcommand
        from codebay.l2tpserver import versioninfo
        from codebay.l2tpserver import helpers
        run_command = runcommand.run_command
    except:
        print 'failed to import helpers, cannot update'
        return 3

    try:
        def _run_aptitude(options, cwd=None):
            """Run aptitude with automatic -q and -y options."""
            # XXX: add some option to make aptitude return with error instead of asking to ignore trust violations => may not be possible
            #   - if cannot be done, missing repository keys will result in update timeout (aptitude will block)
            cmd = [constants.CMD_APTITUDE, '-q', '-y'] + options
            if _log is not None:
                _log.info('running: %s' % ' '.join(cmd))
            retval, stdout, stderr = run_command(cmd, cwd=cwd)
            if retval != 0:
                if _log is not None:
                    _log.warning('aptitude return value: %d' % retval)
                    _log.warning('stdout:\n%s' % stdout)
                    _log.warning('stderr:\n%s' % stderr)
            return retval, stdout, stderr

        def _dpkg_configure_fixup():
            """Attempt 'dpkg --configure -a' fixup."""
            # XXX: the previous worry here is that something may block, and an update
            # might actually fix the situation anyway.
            try:
                # XXX: more logging, XXX: path use
                [rv, out, err] = run_command(['dpkg', '--force-depends', '--force-confmiss', '--force-confdef', '--force-confold', '--configure', '-a'], retval=runcommand.FAIL)
            except:
                if _log is not None:
                    _log.exception('dpkg --configure -a failed, this is bad')

        def _cleanup_old_package_lists():
            """Cleanup old package lists."""
            try:
                if _log is not None:
                    _log.info('cleaning up apt sources lists')

                # Remove all files in partial dir
                partial_dir = '/var/lib/apt/lists/partial'
                for f in os.listdir(partial_dir):
                    fpath = os.path.join(partial_dir, f)
                    if os.path.isfile(fpath):
                        run_command(['/bin/rm', '-f', fpath])

                # Remove all files except lock
                lists_dir = '/var/lib/apt/lists'
                for f in os.listdir(lists_dir):
                    fpath = os.path.join(lists_dir, f)
                    m = lockfile_name_re.match(f)
                    if (os.path.isfile(fpath)) and (m is None):
                        run_command(['/bin/rm', '-f', fpath])
            except:
                if _log is not None:
                    _log.exception('failed to remove package list files')

        def _aptitude_force_install():
            """Attempt aptitude install which may clear up some (but not nearly all) error situations."""
            try:
                # XXX: more logging here
                # XXX: add -q (quiet) option?
                run_command([constants.CMD_APTITUDE, '-f', '-y', 'install'], retval=runcommand.FAIL)
            except:
                if _log is not None:
                    _log.exception('apt-get cleanup failed, ignoring')

        # Figure out version
        v_name_re = re.compile('vpnease_.*\.deb')
        lockfile_name_re = re.compile('^lock$')
        old_version, old_version_cached = versioninfo.get_version_info()
        if _log is not None:
            _log.info('starting update, current_version: %s (cached: %s)' % (str(old_version), str(old_version_cached)))

        # Remove old repository keys and import new ones
        if _log is not None:
            _log.info('removing old repository keys')
        [rv, out, err] = run_command([constants.CMD_RM, '-f', '/etc/apt/trustdb.gpg', '/etc/apt/trusted.gpg'])

        # XXX: more logging here
        [rv, out, err] = run_command([constants.CMD_APT_KEY, 'list'])  # this recreates the gpg key store files

        if _log is not None:
            _log.info('importing repository keys')

        # XXX: more logging here
        [rv, out, err] = run_command([constants.CMD_APT_KEY, 'add', constants.UPDATE_REPOSITORY_KEYS_FILE])
        if rv != 0:
            result = [9, 'failed to import repository keys']
            raise UpdateError()

        # Cleanup old package list files, ignore errors
        _cleanup_old_package_lists()

        # Aptitude -f -y install
        _dpkg_configure_fixup()
        _aptitude_force_install()

        # In some practical situtions, e.g. in #734, the above command may fail
        # really badly (caused by broken package scripts), leading to a "dpkg
        # interrupted state".  We re-run dpkg --configure -a to recover.
        _dpkg_configure_fixup()

        # Aptitude update
        if _log is not None:
            _log.info('updating package list')
        [rv, out, err] = _run_aptitude(['update'])
        if rv != 0:
            result = [4, 'package list update failed']
            raise UpdateError()

        # Cleanup possibly confusing files in /tmp, should not be needed
        for f in os.listdir('/tmp'):
            m = v_name_re.match(f)
            if m is not None:
                [rv, out, err] = run_command([constants.CMD_RM, '-f', os.path.join('/tmp', f)])

        # XXX: sanity
        _dpkg_configure_fixup()

        # Test repository access
        if _log is not None:
            _log.info('testing repository access')
        [rv, out, err] = _run_aptitude(['download', 'vpnease'], cwd='/tmp')
        if rv != 0:
            result = [5, 'server not available']
            raise UpdateError()

        # If testing for version info fails, it is ignored and it may generate an error
        # later when we check if the product version changed or not.
        for f in os.listdir('/tmp'):
            m = v_name_re.match(f)
            if m is not None:
                cmp_version = versioninfo.get_package_version_info(os.path.join('/tmp', f))
                if cmp_version is not None and cmp_version == old_version:
                    result = [0, 'update not run, version in repository matches current version']
                    raise UpdateError()
                break

        # Aptitude clean
        if _log is not None:
            _log.info('cleaning up old package files')
        _run_aptitude(['clean'])

        # Aptitude upgrade
        # XXX: should this be dist-upgrade?  dist-upgrade seems to be more forceful...
        _dpkg_configure_fixup()
        # XXX: sanity
        [rv1, out, err] = _run_aptitude(['upgrade'])

        # Aptitude install vpnease
        _dpkg_configure_fixup()
        # XXX: sanity
        [rv2, out, err] = _run_aptitude(['install', 'vpnease'])

        # Ensure vpnease package is still installed
        # Reinstall dependencies of vpnease package (sanity)
        _dpkg_configure_fixup()
        # XXX: sanity
        try:
            # Parse the Depends: line of 'dpkg -s vpnease' into bare package
            # names (dropping version constraints) and reinstall them all.
            deps = []
            dep_re = re.compile('^Depends: (.*)$')

            # XXX: better logging here
            # XXX: check that package is installed correctly
            [rv, out, err] = run_command(['dpkg', '-s', 'vpnease'])
            for l in out.split('\n'):
                m = dep_re.match(l)
                if m is None:
                    continue
                for d in m.groups()[0].split(','):
                    ds = d.strip().split(' ')
                    deps.append(ds[0])

            _run_aptitude(['reinstall'] + deps)
        except:
            if _log is not None:
                _log.info('failed to reinstall vpnease depends')

        # Aptitude reinstall vpnease
        try:
            _run_aptitude(['reinstall', 'vpnease'])
        except:
            if _log is not None:
                _log.info('failed to reinstall vpnease depends')

        # Aptitude clean
        if _log is not None:
            _log.info('cleaning up downloaded package files')
        _run_aptitude(['clean'])

        # Get new version, determine whether update succeeded based on version & return values
        new_version, new_version_cached = versioninfo.get_version_info()
        if rv1 != 0 or rv2 != 0:
            result = [6, 'upgrade/install command failed with error value: %s/%s' % (str(rv1), str(rv2))]
            raise UpdateError()
        elif new_version_cached:
            result = [7, 'failed to read version information after update']
            raise UpdateError()
        elif old_version == new_version:
            result = [8, 'product version unchanged after update, update failed']
            raise UpdateError()
    except UpdateError:
        # result was already set before the raise; fall through to handling.
        pass
    except:
        result = [32, 'update failed with an exception']

    # At this point, update was successful if result is None

    # Cleanup..
    [rv, out, err] = run_command([constants.CMD_RM, '-f', '/tmp/vpnease_*.deb'])

    # Note: because timesync may be done later in update (after this script is run)
    # this timestamp may not be very accurate.
    def _write_marker_file(msg):
        # XXX: msg is not used for anything here ??
        rv = helpers.write_datetime_marker_file(constants.UPDATE_RESULT_MARKER_FILE)
        if rv is None:
            raise Exception('failed to write update timestamp marker')

    # Handle errors
    if result is not None:
        if result[0] == 32:
            if _log is not None:
                _log.exception(result[1])
        else:
            if _log is not None:
                _log.warning(result[1])
        try:
            _write_marker_file('%s %s\nfailed\n%s' % (old_version, new_version, result[1]))
        except:
            if _log is not None:
                _log.exception('failed to write marker fail after failed update, ignoring')
        return result[0]

    # Update last update result marker file
    try:
        if _log is not None:
            _log.info('update success: %s -> %s' % (old_version, new_version))
        _write_marker_file('%s %s\nsuccess\nUpdate success' % (old_version, new_version))
    except:
        if _log is not None:
            _log.exception('failed to write marker file after successful update, ignoring')

    # Update last successful update timestamp, used by Web UI
    try:
        rv = helpers.write_datetime_marker_file(constants.LAST_SUCCESSFUL_UPDATE_MARKER_FILE)
        if rv is None:
            raise Exception('failed to write last successful update timestamp')
    except:
        if _log is not None:
            _log.exception('failed to write last successful update timestamp, ignoring')

    # Update done with success
    return 2
def _periodic_reboot_check(self):
    """Check for periodic reboot and take action if necessary.

    Tries to be clever and avoid reboot if connections are up.

    Uptime estimation is annoying: if time is changed on this reboot,
    the estimate may be grossly wrong.  To ensure that we don't reboot
    on the first boot (when time is synchronized) uncontrollably, this
    function also checks that enough watchdog rounds have been run to
    warrant a reboot.  The underlying assumption is that web UI has been
    running continuously, which is currently OK because we don't restart
    it ever (cron watchdog will just reboot if UI is down).

    Staggering of reboot is added by randomizing the "minute" of the
    reboot in the range [0,45] (not [0,60] for leeway).  The "minute" is
    randomized when watchdog is created, so it stays the same every time
    for one reboot.  Note that the stagger is effectively only applied
    to the first reboot attempt; next attempts (e.g. next day at
    designated time) will not have a stagger.  If more staggering
    behavior is desired, see XXX below.
    """
    uptime = self.master.get_uptime()
    reboot_required = False
    now = datetime.datetime.utcnow()

    _log.debug('_periodic_reboot_check: uptime=%s' % uptime)

    # Check whether UI configuration requires a reboot (time & day match)
    try:
        reboot_limit = uihelpers.compute_periodic_reboot_time()
        reboot_limit += self._periodic_reboot_stagger_delta
        _log.debug('_periodic_reboot_check: reboot limit after stagger: %s' % reboot_limit)

        lm = licensemanager.LicenseMonitor()
        count, limit, limit_leeway = lm.count_normal_users()

        # time to periodic reboot (negative = past due)
        diff = reboot_limit - now
        _log.debug('_periodic_reboot_check: periodic reboot diff (limit-now, time to reboot): %s' % str(diff))

        if diff <= datetime.timedelta(0, 0, 0):
            overdue = -diff
            _log.debug('_periodic_reboot_check: periodic reboot is %s overdue' % overdue)
            if count > 0:
                # there are clients (without license restrictions!), give 24h leeway
                if overdue < datetime.timedelta(1, 0, 0):  # XXX: hardcoded
                    _log.info('_periodic_reboot_check: want to do a periodic reboot, but there are active clients (%d), skipping' % count)
                else:
                    _log.warning('_periodic_reboot_check: want to a periodic reboot, active clients (%d), but leeway over, rebooting anyway' % count)
                    reboot_required = True
            else:
                _log.warning('_periodic_reboot_check: want to do a periodic reboot, and no active clients, ok')
                reboot_required = True
    except:
        _log.exception('_periodic_reboot_check: failed when checking for periodic reboot policy')

    # If not within periodic reboot time window (e.g. 02:00-03:00 local time),
    # skip periodic reboot.
    if reboot_required:
        # XXX: better stagger check could be applied here (checked every day)
        if not uihelpers.check_periodic_reboot_time_window(now):
            _log.warning('_periodic_reboot_check: want to do a periodic reboot, but not within periodic reboot time window')
            reboot_required = False

    # If more than a maximum number of days, reboot, despite configuration
    if uptime > constants.PERIODIC_REBOOT_MAX_UPTIME:
        _log.warning('_periodic_reboot_check: uptime is too large (%s), requires reboot' % uptime)
        reboot_required = True
    elif uptime < 0.0:
        # negative uptime: ignore it for now; if the diff is great, we'll get a periodic reboot anyway later
        _log.warning('_periodic_reboot_check: uptime is negative (%s), ignoring' % uptime)

    # Sanity check: if we want to reboot, check that enough watchdog rounds
    # have elapsed (roughly 24h).
    if reboot_required:
        rounds = self.get_watchdog_rounds()
        if rounds < constants.PERIODIC_REBOOT_MINIMUM_WATCHDOG_ROUNDS:
            _log.warning('_periodic_reboot_check: want to do periodic reboot, but watchdog rounds too low (%d < %d)' % (rounds, constants.PERIODIC_REBOOT_MINIMUM_WATCHDOG_ROUNDS))
            reboot_required = False

    # Take action if necessary
    if reboot_required:
        if self._periodic_reboot_started:
            _log.info('_periodic_reboot_check: reboot required but periodic reboot already in progress, no action needed')
        else:
            try:
                _log.warning('_periodic_reboot_check: periodic reboot started')
                self._periodic_reboot_started = True
                self._periodic_reboot_show_warning()
                helpers.increment_global_status_counter(ns.periodicReboots)
                helpers.db_flush()
            except:
                _log.exception('failed to increment counter')

            try:
                helpers.write_datetime_marker_file(constants.LAST_AUTOMATIC_REBOOT_MARKER_FILE)
            except:
                _log.exception('failed to write last automatic reboot marker file')

            uihelpers.ui_reboot(constants.WEBUI_PRODUCT_PERIODIC_REBOOT_MESSAGE, skip_update=False, force_update=False, force_fsck=True, delay=120.0)  # XXX: constants
def run(self):
    """Run the update protocol and report the outcome via exceptions.

    This method never returns normally: the final outcome is always
    signaled by raising one of UpdateDone / UpdateNotDoneError /
    UpdateFailedError / UpdateNotRequired / UpdateUnknownError, which the
    caller interprets as the result.
    """
    # remove default database just in case
    from codebay.l2tpserver import db
    db.remove_database()

    # log informative stuff into log first
    try:
        self.log_update_info()
    except:
        _log.exception('log_update_info() failed, ignoring')

    # prepare; may raise exceptions which propagate directly to caller
    try:
        self.prepare()
    except:
        _log.exception('prepare() failed')
        raise

    # run twisted, exits when protocol run complete
    try:
        self.run_twisted()
    except:
        _log.exception('run_twisted() failed')
        raise

    # do timesync last, to avoid disrupting timers
    if self.server_utctime_received_at is None:
        _log.info('management server connection time not available, ignoring timesync.')
    else:
        if self.do_timesync:
            try:
                # adjust server supplied time with local difference
                # this is not very nice, but we want to do timesync last
                now = datetime.datetime.utcnow()
                diff = now - self.server_utctime_received_at
                if diff > datetime.timedelta(0, 60 * 60, 0):  # 1 hour
                    # Something's badly wrong, system time apparently jumped.
                    # This can happen e.g. if system time is updated by distro
                    # scripts when runner restarts distro networking.
                    #
                    # If this happens, we just zero the diff: this causes inaccuracy
                    # in time sync (<1 min) but is better than jumping arbitrarily.
                    _log.warning('time jump when attempting to sync time, diff is %s; zeroing' % diff)
                    diff = datetime.timedelta(0, 0, 0)

                dt = self.server_utctime + diff
                _log.debug('doing timesync: server time before adjustment: %s, server time after adjustment: %s, received-at: %s, time-now:%s, diff: %s' % (self.server_utctime, dt, self.server_utctime_received_at, now, diff))

                # update time, don't cap (allow arbitrary time jumps)
                timesync.update_system_time(dt, cap_backwards=None, cap_forwards=None)
                helpers.write_datetime_marker_file(constants.UPDATE_TIMESYNC_TIMESTAMP_FILE)
            except:
                _log.exception('timesync with management server failed, ignoring.')

    # return value handling
    #
    # Overall the deferred chain started by runner and management connection
    # ends up in success or failure.  We may or may not get a process exit
    # code, depending on what is executed.  Finally, if a timeout occurs,
    # this error is flagged specially in self.update_failed.
    if isinstance(self.update_exit_code, (int, long)):
        if self.update_exit_code == 0:
            # update run, did not try update => signaled as error (but supported case)
            raise UpdateNotDoneError('policy requires update, but update cannot be performed')
        else:
            if self.update_exit_code == 2:
                # update run, update succeeded => signaled as no exception, success case
                raise UpdateDone('policy requires update and update was successful')
            else:
                # update run, update failed or unknown error
                raise UpdateFailedError('update script failed with exit code: %s' % self.update_exit_code)
    else:
        # update was not executed [or exit code is corrupted (should not happen)]
        # typically we come here after a connection timeout when we try to update and/or timesync
        if self.update_failed:
            # from global errback
            # update not run, but failure (probably timeout)
            raise UpdateUnknownError('unknown error (errback)')
        else:
            # policy does not require an update, no action was taken, success
            raise UpdateNotRequired('update not required by policy')

    raise UpdateUnknownError('should not be here')
def _write_marker_file(msg):
    """Write the update-result timestamp marker file.

    XXX: msg is not used for anything here ??
    """
    if helpers.write_datetime_marker_file(constants.UPDATE_RESULT_MARKER_FILE) is None:
        raise Exception('failed to write update timestamp marker')
def run_update(self):
    """Run a full product update through aptitude/dpkg.

    Attempts to bring the 'vpnease' package (and its dependencies) up to
    date from the configured repository, with multiple defensive
    dpkg/aptitude fixup passes between steps because package scripts have
    been observed to fail badly in the field (see #734 comment below).

    Returns an integer exit code consumed by the caller:
      2  - update performed successfully
      3  - could not even import helper modules
      0  - update not run (repository version equals current version)
      4-9, 32 - various failure modes (see `result` assignments below)

    NOTE(review): error outcomes are funneled through the `result` list
    ([code, message]) plus the UpdateError control-flow exception; the
    order of the apt operations below is deliberate and should not be
    rearranged.
    """
    # Ensure no questions asked
    os.environ['DEBIAN_FRONTEND'] = 'noninteractive'

    # XXX: is this ok?
    # os.environ['DEBIAN_PRIORITY'] = 'critical'
    # os.environ['DEBCONF_NOWARNINGS'] = 'yes'

    # result is None on success, else [exit_code, message]
    result = None
    old_version = None
    old_version_cached = None
    new_version = None
    new_version_cached = None

    # Tolerate logger errors
    try:
        from codebay.common import logger
        _log = logger.get('l2tpserver.update.update')
    except:
        _log = None

    # If even the helper imports fail, there is nothing sane we can do.
    try:
        from codebay.l2tpserver import constants
        from codebay.l2tpserver import runcommand
        from codebay.l2tpserver import versioninfo
        from codebay.l2tpserver import helpers
        run_command = runcommand.run_command
    except:
        print 'failed to import helpers, cannot update'
        return 3

    try:
        def _run_aptitude(options, cwd=None):
            """Run aptitude with automatic -q and -y options."""
            # XXX: add some option to make aptitude return with error instead of asking to ignore trust violations => may not be possible
            # - if cannot be done, missing repository keys will result in update timeout (aptitude will block)
            cmd = [constants.CMD_APTITUDE, '-q', '-y'] + options
            if _log is not None:
                _log.info('running: %s' % ' '.join(cmd))
            retval, stdout, stderr = run_command(cmd, cwd=cwd)
            if retval != 0:
                # Log failure details but let the caller decide what to do.
                if _log is not None:
                    _log.warning('aptitude return value: %d' % retval)
                    _log.warning('stdout:\n%s' % stdout)
                    _log.warning('stderr:\n%s' % stderr)
            return retval, stdout, stderr

        def _dpkg_configure_fixup():
            """Attempt 'dpkg --configure -a' fixup."""
            # XXX: the previous worry here is that something may block, and an update
            # might actually fix the situation anyway.
            try:
                # XXX: more logging, XXX: path use
                [rv, out, err] = run_command(['dpkg',
                                              '--force-depends',
                                              '--force-confmiss',
                                              '--force-confdef',
                                              '--force-confold',
                                              '--configure', '-a'],
                                             retval=runcommand.FAIL)
            except:
                if _log is not None:
                    _log.exception('dpkg --configure -a failed, this is bad')

        def _cleanup_old_package_lists():
            """Cleanup old package lists."""
            # Best-effort: errors are logged and ignored.
            try:
                if _log is not None:
                    _log.info('cleaning up apt sources lists')

                # Remove all files in partial dir
                partial_dir = '/var/lib/apt/lists/partial'
                for f in os.listdir(partial_dir):
                    fpath = os.path.join(partial_dir, f)
                    if os.path.isfile(fpath):
                        run_command(['/bin/rm', '-f', fpath])

                # Remove all files except lock
                lists_dir = '/var/lib/apt/lists'
                for f in os.listdir(lists_dir):
                    fpath = os.path.join(lists_dir, f)
                    m = lockfile_name_re.match(f)
                    if (os.path.isfile(fpath)) and (m is None):
                        run_command(['/bin/rm', '-f', fpath])
            except:
                if _log is not None:
                    _log.exception('failed to remove package list files')

        def _aptitude_force_install():
            """Attempt aptitude install which may clear up some (but not nearly all) error situations."""
            try:
                # XXX: more logging here
                # XXX: add -q (quiet) option?
                run_command([constants.CMD_APTITUDE, '-f', '-y', 'install'], retval=runcommand.FAIL)
            except:
                if _log is not None:
                    _log.exception('apt-get cleanup failed, ignoring')

        # Figure out version
        v_name_re = re.compile('vpnease_.*\.deb')
        lockfile_name_re = re.compile('^lock$')

        old_version, old_version_cached = versioninfo.get_version_info()
        if _log is not None:
            _log.info('starting update, current_version: %s (cached: %s)' % (str(old_version), str(old_version_cached)))

        # Remove old repository keys and import new ones
        if _log is not None:
            _log.info('removing old repository keys')
        [rv, out, err] = run_command([constants.CMD_RM, '-f', '/etc/apt/trustdb.gpg', '/etc/apt/trusted.gpg'])

        # XXX: more logging here
        [rv, out, err] = run_command([constants.CMD_APT_KEY, 'list'])  # this recreates the gpg key store files

        if _log is not None:
            _log.info('importing repository keys')
        # XXX: more logging here
        [rv, out, err] = run_command([constants.CMD_APT_KEY, 'add', constants.UPDATE_REPOSITORY_KEYS_FILE])
        if rv != 0:
            result = [9, 'failed to import repository keys']
            raise UpdateError()

        # Cleanup old package list files, ignore errors
        _cleanup_old_package_lists()

        # Aptitude -f -y install
        _dpkg_configure_fixup()
        _aptitude_force_install()

        # In some practical situtions, e.g. in #734, the above command may fail
        # really badly (caused by broken package scripts), leading to a "dpkg
        # interrupted state".  We re-run dpkg --configure -a to recover.
        _dpkg_configure_fixup()

        # Aptitude update
        if _log is not None:
            _log.info('updating package list')
        [rv, out, err] = _run_aptitude(['update'])
        if rv != 0:
            result = [4, 'package list update failed']
            raise UpdateError()

        # Cleanup possibly confusing files in /tmp, should not be needed
        for f in os.listdir('/tmp'):
            m = v_name_re.match(f)
            if m is not None:
                [rv, out, err] = run_command([constants.CMD_RM, '-f', os.path.join('/tmp', f)])

        # XXX: sanity
        _dpkg_configure_fixup()

        # Test repository access
        if _log is not None:
            _log.info('testing repository access')
        [rv, out, err] = _run_aptitude(['download', 'vpnease'], cwd='/tmp')
        if rv != 0:
            result = [5, 'server not available']
            raise UpdateError()

        # If testing for version info fails, it is ignored and it may generate an error
        # later when we check if the product version changed or not.
        for f in os.listdir('/tmp'):
            m = v_name_re.match(f)
            if m is not None:
                # NOTE: only the first matching .deb is inspected (break below).
                cmp_version = versioninfo.get_package_version_info(os.path.join('/tmp', f))
                if cmp_version is not None and cmp_version == old_version:
                    # Repository version equals installed version => nothing to do.
                    result = [0, 'update not run, version in repository matches current version']
                    raise UpdateError()
                break

        # Aptitude clean
        if _log is not None:
            _log.info('cleaning up old package files')
        _run_aptitude(['clean'])

        # Aptitude upgrade
        # XXX: should this be dist-upgrade?  dist-upgrade seems to be more forceful...
        _dpkg_configure_fixup()  # XXX: sanity
        [rv1, out, err] = _run_aptitude(['upgrade'])

        # Aptitude install vpnease
        _dpkg_configure_fixup()  # XXX: sanity
        [rv2, out, err] = _run_aptitude(['install', 'vpnease'])

        # Ensure vpnease package is still installed
        # Reinstall dependencies of vpnease package (sanity)
        _dpkg_configure_fixup()  # XXX: sanity
        try:
            # Parse the 'Depends:' line of dpkg status output to get bare
            # dependency package names (version constraints stripped).
            deps = []
            dep_re = re.compile('^Depends: (.*)$')

            # XXX: better logging here
            # XXX: check that package is installed correctly
            [rv, out, err] = run_command(['dpkg', '-s', 'vpnease'])
            for l in out.split('\n'):
                m = dep_re.match(l)
                if m is None:
                    continue
                for d in m.groups()[0].split(','):
                    ds = d.strip().split(' ')
                    deps.append(ds[0])

            _run_aptitude(['reinstall'] + deps)
        except:
            if _log is not None:
                _log.info('failed to reinstall vpnease depends')

        # Aptitude reinstall vpnease
        try:
            _run_aptitude(['reinstall', 'vpnease'])
        except:
            if _log is not None:
                _log.info('failed to reinstall vpnease depends')

        # Aptitude clean
        if _log is not None:
            _log.info('cleaning up downloaded package files')
        _run_aptitude(['clean'])

        # Get new version, determine whether update succeeded based on version & return values
        new_version, new_version_cached = versioninfo.get_version_info()
        if rv1 != 0 or rv2 != 0:
            result = [6, 'upgrade/install command failed with error value: %s/%s' % (str(rv1), str(rv2))]
            raise UpdateError()
        elif new_version_cached:
            # Cached version info after an update means the real version
            # could not be read => treat as failure.
            result = [7, 'failed to read version information after update']
            raise UpdateError()
        elif old_version == new_version:
            result = [8, 'product version unchanged after update, update failed']
            raise UpdateError()
    except UpdateError:
        # Control-flow exception: `result` has already been set above.
        pass
    except:
        result = [32, 'update failed with an exception']

    # At this point, update was successful if result is None

    # Cleanup..
    [rv, out, err] = run_command([constants.CMD_RM, '-f', '/tmp/vpnease_*.deb'])

    # Note: because timesync may be done later in update (after this script is run)
    # this timestamp may not be very accurate.
    def _write_marker_file(msg):
        # XXX: msg is not used for anything here ??
        rv = helpers.write_datetime_marker_file(constants.UPDATE_RESULT_MARKER_FILE)
        if rv is None:
            raise Exception('failed to write update timestamp marker')

    # Handle errors
    if result is not None:
        if result[0] == 32:
            # Exception case: log with traceback.
            if _log is not None:
                _log.exception(result[1])
        else:
            if _log is not None:
                _log.warning(result[1])
        try:
            _write_marker_file('%s %s\nfailed\n%s' % (old_version, new_version, result[1]))
        except:
            if _log is not None:
                _log.exception('failed to write marker fail after failed update, ignoring')
        return result[0]

    # Update last update result marker file
    try:
        if _log is not None:
            _log.info('update success: %s -> %s' % (old_version, new_version))
        _write_marker_file('%s %s\nsuccess\nUpdate success' % (old_version, new_version))
    except:
        if _log is not None:
            _log.exception('failed to write marker file after successful update, ignoring')

    # Update last successful update timestamp, used by Web UI
    try:
        rv = helpers.write_datetime_marker_file(constants.LAST_SUCCESSFUL_UPDATE_MARKER_FILE)
        if rv is None:
            raise Exception('failed to write last successful update timestamp')
    except:
        if _log is not None:
            _log.exception('failed to write last successful update timestamp, ignoring')

    # Update done with success
    return 2
def _fsck_markers_check(_log):
    """Reset fsck-related marker files for this boot.

    Removes the forced-fsck marker and writes a fresh fastboot marker
    timestamp.  Return values of both operations are intentionally
    ignored (best-effort).
    """
    # Use absolute path for rm, consistent with the rest of this file
    # (e.g. _check_livecd); a bare 'rm' depends on $PATH, which may be
    # minimal when run from boot scripts or cron.
    [rv, out, err] = run_command(['/bin/rm', '-f', constants.FORCE_FSCK_MARKER_FILE])
    helpers.write_datetime_marker_file(constants.FASTBOOT_MARKER_FILE)
def _record_boot_timestamp(_log):
    """Record boot timestamp to a file.

    Raises an Exception if the timestamp file cannot be written.
    """
    if helpers.write_datetime_marker_file(constants.BOOT_TIMESTAMP_FILE) is None:
        raise Exception('boot timestamp write failed')
def run_watchdog(self):
    """Run the Web UI master watchdog (and other periodic checks) and take action if necessary.

    Web UI is the top-level watchdog element in the product.  This
    function checks that the system is up and running in an acceptable
    manner, and if not, takes action (reboot typically).  In particular,
    if the runner should be active and it is not (process has exited or
    is stuck), corrective action is taken.

    Besides the actual watchdog check, this function also performs other
    periodic checks and their respective actions.  On Live CD the
    watchdog never runs, so no watchdog warnings or actions will be
    shown either.
    """
    # increase rounds
    self._watchdog_rounds += 1
    _log.debug('run_watchdog(), round %d' % self._watchdog_rounds)

    # update watchdog-last-update file
    try:
        helpers.write_datetime_marker_file(constants.WEBUI_WATCHDOG_LAST_UPDATED_FILE)
    except:
        _log.exception('cannot update webui watchdog file')

    # skip tests on livecd
    if self.master.is_live_cd():
        _log.debug('watchdog on live cd, skipping')
        return

    # Not live cd: run each periodic check independently; a failure in
    # one check must never prevent the others from running.
    periodic_checks = (
        # actual watchdog checks (should not fail, at least persistently)
        (self._watchdog_checks, 'watchdog execution failed, ignoring'),
        # periodic reboot check
        (self._periodic_reboot_check, 'periodic reboot check failed, ignoring'),
        # update admin active file, if ajax polling is active (needed by
        # graph redraw optimizations in crontab)
        (self._update_admin_timestamp, 'admin active file update failed, ignoring'),
        # database export check
        (self._database_export_check, 'database export check failed, ignoring'),
    )
    for check, failure_message in periodic_checks:
        try:
            check()
        except:
            _log.exception(failure_message)

    # info log even in success case
    _log.info('webui watchdog completed, strikes %d, round %d' % (self._watchdog_strikes, self._watchdog_rounds))
def _periodic_reboot_check(self):
    """Check for periodic reboot and take action if necessary.

    Tries to be clever and avoid reboot if connections are up.

    Uptime estimation is annoying: if time is changed on this reboot,
    the estimate may be grossly wrong.  To ensure that we don't reboot
    on the first boot (when time is synchronized) uncontrollably, this
    function also checks that enough watchdog rounds have been run to
    warrant a reboot.  The underlying assumption is that web UI has been
    running continuously, which is currently OK because we don't restart
    it ever (cron watchdog will just reboot if UI is down).

    Staggering of reboot is added by randomizing the "minute" of the
    reboot in the range [0,45] (not [0,60] for leeway).  The "minute" is
    randomized when watchdog is created, so it stays the same every time
    for one reboot.  Note that the stagger is effectively only applied
    to the first reboot attempt; next attempts (e.g. next day at
    designated time) will not have a stagger.  If more staggering
    behavior is desired, see XXX below.

    Fix applied in review: the "leeway over, rebooting anyway" warning
    message read "want to a periodic reboot"; corrected to "want to do a
    periodic reboot" for consistency with the other messages here.
    """
    uptime = self.master.get_uptime()
    reboot_required = False
    now = datetime.datetime.utcnow()
    _log.debug('_periodic_reboot_check: uptime=%s' % uptime)

    # Check whether UI configuration requires a reboot (time & day match)
    try:
        reboot_limit = uihelpers.compute_periodic_reboot_time()
        reboot_limit += self._periodic_reboot_stagger_delta
        _log.debug('_periodic_reboot_check: reboot limit after stagger: %s' % reboot_limit)

        lm = licensemanager.LicenseMonitor()
        count, limit, limit_leeway = lm.count_normal_users()

        # time to periodic reboot (negative = past due)
        diff = reboot_limit - now
        _log.debug('_periodic_reboot_check: periodic reboot diff (limit-now, time to reboot): %s' % str(diff))

        if diff <= datetime.timedelta(0, 0, 0):
            overdue = -diff
            _log.debug('_periodic_reboot_check: periodic reboot is %s overdue' % overdue)

            if count > 0:
                # there are clients (without license restrictions!), give 24h leeway
                if overdue < datetime.timedelta(1, 0, 0):  # XXX: hardcoded
                    _log.info('_periodic_reboot_check: want to do a periodic reboot, but there are active clients (%d), skipping' % count)
                else:
                    _log.warning('_periodic_reboot_check: want to do a periodic reboot, active clients (%d), but leeway over, rebooting anyway' % count)
                    reboot_required = True
            else:
                _log.warning('_periodic_reboot_check: want to do a periodic reboot, and no active clients, ok')
                reboot_required = True
    except:
        _log.exception('_periodic_reboot_check: failed when checking for periodic reboot policy')

    # If not within periodic reboot time window (e.g. 02:00-03:00 local time),
    # skip periodic reboot.
    if reboot_required:
        # XXX: better stagger check could be applied here (checked every day)
        if not uihelpers.check_periodic_reboot_time_window(now):
            _log.warning('_periodic_reboot_check: want to do a periodic reboot, but not within periodic reboot time window')
            reboot_required = False

    # If more than a maximum number of days, reboot, despite configuration
    if uptime > constants.PERIODIC_REBOOT_MAX_UPTIME:
        _log.warning('_periodic_reboot_check: uptime is too large (%s), requires reboot' % uptime)
        reboot_required = True
    elif uptime < 0.0:
        # negative uptime: ignore it for now; if the diff is great, we'll get a periodic reboot anyway later
        _log.warning('_periodic_reboot_check: uptime is negative (%s), ignoring' % uptime)

    # Sanity check: if we want to reboot, check that enough watchdog rounds
    # have elapsed (roughly 24h).
    if reboot_required:
        rounds = self.get_watchdog_rounds()
        if rounds < constants.PERIODIC_REBOOT_MINIMUM_WATCHDOG_ROUNDS:
            _log.warning('_periodic_reboot_check: want to do periodic reboot, but watchdog rounds too low (%d < %d)' % (rounds, constants.PERIODIC_REBOOT_MINIMUM_WATCHDOG_ROUNDS))
            reboot_required = False

    # Take action if necessary
    if reboot_required:
        if self._periodic_reboot_started:
            _log.info('_periodic_reboot_check: reboot required but periodic reboot already in progress, no action needed')
        else:
            try:
                _log.warning('_periodic_reboot_check: periodic reboot started')
                self._periodic_reboot_started = True
                self._periodic_reboot_show_warning()
                helpers.increment_global_status_counter(ns.periodicReboots)
                helpers.db_flush()
            except:
                _log.exception('failed to increment counter')

            try:
                helpers.write_datetime_marker_file(constants.LAST_AUTOMATIC_REBOOT_MARKER_FILE)
            except:
                _log.exception('failed to write last automatic reboot marker file')

            uihelpers.ui_reboot(constants.WEBUI_PRODUCT_PERIODIC_REBOOT_MESSAGE, skip_update=False, force_update=False, force_fsck=True, delay=120.0)  # XXX: constants
def run(self):
    """Run the update protocol and report the outcome via exceptions.

    Sequence: remove the default database, log update info, prepare,
    run the twisted protocol, then (last, so as not to disrupt timers)
    optionally synchronize system time against the management server.

    Never returns normally: the outcome is always signaled by raising
    one of UpdateDone / UpdateNotRequired (success cases) or
    UpdateNotDoneError / UpdateFailedError / UpdateUnknownError
    (failure cases), decoded from self.update_exit_code and
    self.update_failed set by the protocol run.
    """
    # remove default database just in case
    from codebay.l2tpserver import db
    db.remove_database()

    # log informative stuff into log first
    try:
        self.log_update_info()
    except:
        _log.exception('log_update_info() failed, ignoring')

    # prepare; may raise exceptions which propagate directly to caller
    try:
        self.prepare()
    except:
        _log.exception('prepare() failed')
        raise

    # run twisted, exits when protocol run complete
    try:
        self.run_twisted()
    except:
        _log.exception('run_twisted() failed')
        raise

    # do timesync last, to avoid disrupting timers
    if self.server_utctime_received_at is None:
        _log.info('management server connection time not available, ignoring timesync.')
    else:
        if self.do_timesync:
            # Timesync failure is deliberately non-fatal (logged, ignored).
            try:
                # adjust server supplied time with local difference
                # this is not very nice, but we want to do timesync last
                now = datetime.datetime.utcnow()
                diff = now - self.server_utctime_received_at
                if diff > datetime.timedelta(0, 60*60, 0):  # 1 hour
                    # Something's badly wrong, system time apparently jumped.
                    # This can happen e.g. if system time is updated by distro
                    # scripts when runner restarts distro networking.
                    #
                    # If this happens, we just zero the diff: this causes inaccuracy
                    # in time sync (<1 min) but is better than jumping arbitrarily.
                    _log.warning('time jump when attempting to sync time, diff is %s; zeroing' % diff)
                    diff = datetime.timedelta(0, 0, 0)

                dt = self.server_utctime + diff
                _log.debug('doing timesync: server time before adjustment: %s, server time after adjustment: %s, received-at: %s, time-now:%s, diff: %s' % (self.server_utctime, dt, self.server_utctime_received_at, now, diff))

                # update time, don't cap (allow arbitrary time jumps)
                timesync.update_system_time(dt, cap_backwards=None, cap_forwards=None)
                helpers.write_datetime_marker_file(constants.UPDATE_TIMESYNC_TIMESTAMP_FILE)
            except:
                _log.exception('timesync with management server failed, ignoring.')

    # return value handling
    #
    # Overall the deferred chain started by runner and management connection
    # ends up in success or failure.  We may or may not get a process exit
    # code, depending on what is executed.  Finally, if a timeout occurs,
    # this error is flagged specially in self.update_failed.
    if isinstance(self.update_exit_code, (int, long)):
        if self.update_exit_code == 0:
            # update run, did not try update => signaled as error (but supported case)
            raise UpdateNotDoneError('policy requires update, but update cannot be performed')
        else:
            if self.update_exit_code == 2:
                # update run, update succeeded => signaled as no exception, success case
                raise UpdateDone('policy requires update and update was successful')
            else:
                # update run, update failed or unknown error
                raise UpdateFailedError('update script failed with exit code: %s' % self.update_exit_code)
    else:
        # update was not executed [or exit code is corrupted (should not happen)]
        # typically we come here after a connection timeout when we try to update and/or timesync
        if self.update_failed:  # from global errback
            # update not run, but failure (probably timeout)
            raise UpdateUnknownError('unknown error (errback)')
        else:
            # policy does not require an update, no action was taken, success
            raise UpdateNotRequired('update not required by policy')

    # Defensive: unreachable, since every branch above raises.
    raise UpdateUnknownError('should not be here')