def refresh(self):
    """Rebuild self.plugins from the SMF plugin service instances.

    Runs svcs(1) over everything under PLUGINBASEFMRI and creates a
    Plugin object for each enabled instance.  Misconfigured plugins are
    reported on stderr and skipped rather than aborting the refresh.

    Raises RuntimeError if the svcs command itself fails; the refresh
    lock is released first so callers are not left deadlocked.
    """
    self.plugins = []
    cmd = [smf.SVCSCMD, "-H", "-o", "state,FMRI", PLUGINBASEFMRI]
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    outdata, errdata = p.communicate()
    err = p.wait()
    if err != 0:
        # Release the lock before propagating the failure.
        self._refreshLock.release()
        raise RuntimeError('%s failed with exit code %d\n%s' %
                           (str(cmd), err, errdata))
    for line in outdata.rstrip().split('\n'):
        line = line.rstrip().split()
        state = line[0]
        fmri = line[1]

        # Note that the plugins, being dependent on the time-slider
        # service themselves, will typically be in an offline state when
        # enabled. They will transition to an "online" state once
        # time-slider itself comes "online" to satisfy its dependency.
        if state == "online" or state == "offline" or state == "degraded":
            util.debug("Found enabled plugin:\t%s" % (fmri), self.verbose)
            try:
                plugin = Plugin(fmri, self.verbose)
                self.plugins.append(plugin)
            except RuntimeError as message:
                sys.stderr.write("Ignoring misconfigured plugin: %s\n"
                                 % (fmri))
                sys.stderr.write("Reason:\n%s\n" % (message))
        else:
            # Bug fix: the original appended fmri to the format string
            # ("...%s" + fmri) instead of substituting it with "%".
            util.debug("Found disabled plugin:\t%s" % (fmri), self.verbose)
def _run_warning_cleanup(self, zpool):
    """Reclaim space on zpool down toward the warning threshold.

    Daily snapshots are purged first; hourly snapshots are only
    considered if the pool is still over the warning level afterwards.
    """
    threshold = self._warningLevel
    util.debug("Performing warning level cleanup on %s" % zpool.name,
               self.verbose)
    self._run_cleanup(zpool, "daily", threshold)
    if zpool.get_capacity() > threshold:
        self._run_cleanup(zpool, "hourly", threshold)
def refresh(self):
    """Rebuild self.plugins from the SMF plugin service instances.

    Runs svcs(1) over everything under PLUGINBASEFMRI and creates a
    Plugin object for each enabled instance.  Misconfigured plugins are
    reported on stderr and skipped rather than aborting the refresh.

    Raises RuntimeError if the svcs command itself fails; the refresh
    lock is released first so callers are not left deadlocked.
    """
    # Fix: converted Python-2-only "raise E, msg" / "except E, v" syntax
    # to the call/"as" form used by the other blocks in this file.
    self.plugins = []
    cmd = [smf.SVCSCMD, "-H", "-o", "state,FMRI", PLUGINBASEFMRI]
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    outdata, errdata = p.communicate()
    err = p.wait()
    if err != 0:
        # Release the lock before propagating the failure.
        self._refreshLock.release()
        raise RuntimeError('%s failed with exit code %d\n%s' %
                           (str(cmd), err, errdata))
    for line in outdata.rstrip().split('\n'):
        line = line.rstrip().split()
        state = line[0]
        fmri = line[1]

        # Note that the plugins, being dependent on the time-slider
        # service themselves, will typically be in an offline state when
        # enabled. They will transition to an "online" state once
        # time-slider itself comes "online" to satisfy its dependency.
        if state == "online" or state == "offline" or state == "degraded":
            util.debug("Found enabled plugin:\t%s" % (fmri), self.verbose)
            try:
                plugin = Plugin(fmri, self.verbose)
                self.plugins.append(plugin)
            except RuntimeError as message:
                sys.stderr.write("Ignoring misconfigured plugin: %s\n"
                                 % (fmri))
                sys.stderr.write("Reason:\n%s\n" % (message))
        else:
            util.debug("Found disabled plugin:\t%s" % (fmri), self.verbose)
def get(self, section, option):
    """Return the configured value for option under section.

    Returns the empty string when either the section or the option
    does not exist; lookups are traced via util.debug at level 1.
    """
    try:
        value = self.config.get(section, option)
    except (ConfigParser.NoOptionError, ConfigParser.NoSectionError):
        util.debug('CONFIG: NOTFOUND section %s, option %s\n'
                   % (section, option), 1)
        return ''
    util.debug('CONFIG: GET section %s, option %s with value %s\n'
               % (section, option, value), 1)
    return value
def _run_critical_cleanup(self, zpool):
    """Reclaim space on zpool down toward the critical threshold.

    Weekly snapshots go first; daily and then hourly snapshots are only
    purged while the pool remains above the critical level.
    """
    threshold = self._criticalLevel
    util.debug("Performing critical level cleanup on %s" % zpool.name,
               self.verbose)
    self._run_cleanup(zpool, "weekly", threshold)
    for sched in ("daily", "hourly"):
        # Re-check capacity before escalating to the next schedule.
        if zpool.get_capacity() > threshold:
            self._run_cleanup(zpool, sched, threshold)
def get(self, section, option):
    """Return the configured value for option under section.

    Returns the empty string when either the section or the option
    does not exist; lookups are traced via util.debug at level 1.
    """
    try:
        value = self.config.get(section, option)
    except (configparser.NoOptionError, configparser.NoSectionError):
        util.debug(
            'CONFIG: NOTFOUND section %s, option %s\n'
            % (section, option), 1)
        return ''
    util.debug(
        'CONFIG: GET section %s, option %s with value %s\n'
        % (section, option, value), 1)
    return value
def _run_emergency_cleanup(self, zpool):
    """Reclaim space on zpool down toward the emergency threshold.

    Escalates through monthly, weekly, daily, hourly and frequent
    snapshot schedules, re-checking pool capacity at each step, and as
    a last resort deletes snapshots from custom schedules as well.
    """
    threshold = self._emergencyLevel
    util.debug("Performing emergency level cleanup on %s" % zpool.name,
               self.verbose)
    self._run_cleanup(zpool, "monthly", threshold)
    for sched in ("weekly", "daily", "hourly", "frequent"):
        # Only keep escalating while the pool is still over the limit.
        if zpool.get_capacity() > threshold:
            self._run_cleanup(zpool, sched, threshold)
    # Finally, as a last resort, delete custom scheduled snapshots.
    for schedule, i, p, k in self._customSchedules:
        if zpool.get_capacity() < threshold:
            break
        self._run_cleanup(zpool, schedule, threshold)
def _needs_cleanup(self):
    """Return True when remedial cleanup should be run.

    True means some monitored zpool is over the warning level AND has
    at least one auto-snapshot set (so deleting snapshots could help).
    Checks are rate-limited to one per 15 minutes and skipped entirely
    when remedial cleanup is disabled or a cleanup is already running.

    Raises RuntimeError (after marking exitCode fatal) if a pool's
    capacity cannot be determined.
    """
    if self._remedialCleanup == False:
        # Sys admin has explicitly instructed for remedial cleanups
        # not to be performed.
        return False
    now = int(time.time())
    # Don't run checks any less than 15 minutes apart.
    if self._cleanupLock.acquire(False) == False:
        # Non-blocking acquire failed: a cleanup is already running.
        return False
    # FIXME - Make the cleanup interval equal to the minimum snapshot
    # interval if custom snapshot schedules are defined and enabled.
    elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
        # Checked recently; fall through to release the lock below.
        pass
    else:
        for zpool in self._zpools:
            try:
                if zpool.get_capacity() > self._warningLevel:
                    # Before getting into a panic, determine if the pool
                    # is one we actually take snapshots on, by checking
                    # for one of the "auto-snapshot:<schedule>" tags. Not
                    # super fast, but it only happens under exceptional
                    # circumstances of a zpool nearing its capacity.
                    for sched in self._allSchedules:
                        sets = zpool.list_auto_snapshot_sets(sched[0])
                        if len(sets) > 0:
                            util.debug("%s needs a cleanup" \
                                       % zpool.name, \
                                       self.verbose)
                            # Release before the early return.
                            self._cleanupLock.release()
                            return True
            except RuntimeError as message:
                sys.stderr.write("Error checking zpool capacity of: " + \
                                 zpool.name + "\n")
                self._cleanupLock.release()
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate up to thread's run() method.
                raise RuntimeError(message)
        self._lastCleanupCheck = int(time.time())
    self._cleanupLock.release()
    return False
def _check_snapshots(self):
    """ Check the schedules and see what the required snapshot is.
    Take one immediately on the first overdue snapshot required

    Returns the epoch timestamp of the next due snapshot, or None when
    no auto-snapshot schedule is online.
    """
    # Make sure a refresh() doesn't mess with the schedule while
    # we're reading through it.
    self._refreshLock.acquire()
    next, schedule = self._next_due()
    self._refreshLock.release()
    now = int(time.time())
    while next is not None and next <= now:
        # Overdue: snapshot immediately, fire plugins, then recompute.
        label = self._take_snapshots(schedule)
        self._plugin.execute_plugins(schedule, label)
        self._refreshLock.acquire()
        self._update_schedules()
        next, schedule = self._next_due()
        self._refreshLock.release()
    # Bug fix: next is None when no schedules are online (see run()),
    # and the original unconditionally called fromtimestamp(next),
    # raising TypeError in that case.
    if next is not None:
        dt = datetime.datetime.fromtimestamp(next)
        util.debug("Next snapshot is %s due at: %s" % \
                   (schedule, dt.isoformat()), \
                   self.verbose)
    return next
def __init__(self, instanceName, debug=False):
    """Wrap the plugin SMF instance named instanceName.

    Verifies up front that the configured trigger command exists and is
    world-executable, raising RuntimeError otherwise.  (Converted from
    Python-2-only "raise E, msg" syntax and octal literal 01 to the
    forms used elsewhere in this file.)
    """
    self.verbose = debug
    util.debug("Instantiating plugin for:\t%s" % (instanceName), self.verbose)
    self.smfInst = pluginsmf.PluginSMF(instanceName)
    self._proc = None

    # Note that the associated plugin service's start method checks
    # that the command is defined and executable. But SMF doesn't
    # bother to do this for offline services until all dependencies
    # (ie. time-slider) are brought online.
    # So we also check the permissions here.
    command = self.smfInst.get_trigger_command()
    try:
        statinfo = os.stat(command)
        other_x = (statinfo.st_mode & 0o1)  # world-execute permission bit
        if other_x == 0:
            raise RuntimeError('Plugin: %s:\nConfigured trigger command is not '
                               'executable:\n%s'
                               % (self.smfInst.instanceName, command))
    except OSError:
        raise RuntimeError('Plugin: %s:\nCan not access the configured '
                           'plugin/trigger_command:\n%s'
                           % (self.smfInst.instanceName, command))
def _perform_cleanup(self):
    """Run the escalating remedial cleanup over every monitored zpool.

    Escalates warning -> critical -> emergency (twice) per pool,
    recording a severity code (0-4) in self._poolstatus and every
    destroyed snapshot name in self._destroyedsnaps.  Skips entirely
    when another cleanup holds the cleanup lock.  Raises RuntimeError
    (fatal) when pool capacity cannot be determined.
    """
    if self._cleanupLock.acquire(False) == False:
        # Cleanup already running. Skip
        return
    self._destroyedsnaps = []
    for zpool in self._zpools:
        try:
            self._poolstatus[zpool.name] = 0
            capacity = zpool.get_capacity()
            if capacity > self._warningLevel:
                self._run_warning_cleanup(zpool)
                self._poolstatus[zpool.name] = 1
                capacity = zpool.get_capacity()
            if capacity > self._criticalLevel:
                self._run_critical_cleanup(zpool)
                self._poolstatus[zpool.name] = 2
                capacity = zpool.get_capacity()
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 3
                capacity = zpool.get_capacity()
            # A second emergency pass; status 4 marks a pool that is
            # still over the emergency level after the first pass.
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 4
        # This also catches exceptions thrown from _run_<level>_cleanup()
        # and _run_cleanup() in methods called by _perform_cleanup()
        except RuntimeError as message:
            sys.stderr.write("Remedial space cleanup failed because " + \
                             "of failure to determinecapacity of: " + \
                             zpool.name + "\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            self._cleanupLock.release()
            # Propogate up to thread's run() method.
            raise RuntimeError(message)

        # Bad - there's no more snapshots left and nothing
        # left to delete. We don't disable the service since
        # it will permit self recovery and snapshot
        # retention when space becomes available on
        # the pool (hopefully).
        util.debug("%s pool status after cleanup:" \
                   % zpool.name, \
                   self.verbose)
        util.debug(zpool.name, self.verbose)
    util.debug("Cleanup completed. %d snapshots were destroyed" \
               % len(self._destroyedsnaps), \
               self.verbose)
    # Avoid needless list iteration for non-debug mode
    if self.verbose == True and len(self._destroyedsnaps) > 0:
        for snap in self._destroyedsnaps:
            sys.stderr.write("\t%s\n" % snap)
    self._cleanupLock.release()
def is_running(self):
    """Return True if this plugin's child process is currently running.

    Polls the subprocess to refresh its returncode; a returncode of
    None means the child is still alive.
    """
    # Idiom fix: use "is None" identity checks instead of "== None".
    if self._proc is None:
        util.debug("Plugin child process is not started", self.verbose)
        return False
    # poll() updates returncode without blocking.
    self._proc.poll()
    if self._proc.returncode is None:
        util.debug("Plugin child process is still running", self.verbose)
        return True
    util.debug("Plugin child process has ended", self.verbose)
    return False
def execute(self, schedule, label):
    """Launch the plugin's trigger command for schedule/label.

    Skips silently when the schedule is not in the plugin's trigger
    list, when a previous invocation is still running, or when the
    plugin SMF instance is disabled or in maintenance.  Exports the
    AUTOSNAP_FMRI, AUTOSNAP_LABEL and PLUGIN_FMRI environment variables
    for the child.  Raises RuntimeError if the child cannot be started.
    """
    triggers = self.smfInst.get_trigger_list()
    try:
        triggers.index("all")
    except ValueError:
        try:
            triggers.index(schedule)
        except ValueError:
            return

    # Skip if already running
    if self.is_running() == True:
        util.debug("Plugin: %s is already running. Skipping execution" \
                   % (self.smfInst.instanceName), \
                   self.verbose)
        return

    # Skip if plugin FMRI has been disabled or placed into maintenance
    cmd = [smf.SVCSCMD, "-H", "-o", "state", self.smfInst.instanceName]
    outdata, errdata = util.run_command(cmd)
    state = outdata.strip()
    if state == "disabled" or state == "maintenance":
        util.debug("Plugin: %s is in %s state. Skipping execution" \
                   % (self.smfInst.instanceName, state), \
                   self.verbose)
        return

    cmd = self.smfInst.get_trigger_command()
    util.debug("Executing plugin command: %s" % str(cmd), self.verbose)
    svcFmri = "%s:%s" % (autosnapsmf.BASESVC, schedule)

    os.putenv("AUTOSNAP_FMRI", svcFmri)
    os.putenv("AUTOSNAP_LABEL", label)
    try:
        os.putenv("PLUGIN_FMRI", self.smfInst.instanceName)
        self._proc = subprocess.Popen(cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      close_fds=True,
                                      universal_newlines=True)
    except OSError as message:
        # Bug fix: reset _proc BEFORE raising.  The original placed
        # "self._proc = None" after the raise, making it unreachable.
        self._proc = None
        raise RuntimeError("%s subprocess error:\n %s" % \
                           (cmd, str(message)))
def execute(self, schedule, label):
    """Launch the plugin's trigger command for schedule/label.

    Skips silently when the schedule is not in the plugin's trigger
    list, when a previous invocation is still running, or when the
    plugin SMF instance is disabled or in maintenance.  Exports the
    AUTOSNAP_FMRI, AUTOSNAP_LABEL and PLUGIN_FMRI environment variables
    for the child.  Raises RuntimeError if the child cannot be started.
    (Converted from Python-2-only "except E, v" / "raise E, msg" syntax
    to the form used elsewhere in this file.)
    """
    triggers = self.smfInst.get_trigger_list()
    try:
        triggers.index("all")
    except ValueError:
        try:
            triggers.index(schedule)
        except ValueError:
            return

    # Skip if already running
    if self.is_running() == True:
        util.debug("Plugin: %s is already running. Skipping execution" \
                   % (self.smfInst.instanceName), \
                   self.verbose)
        return

    # Skip if plugin FMRI has been disabled or placed into maintenance
    cmd = [smf.SVCSCMD, "-H", "-o", "state", self.smfInst.instanceName]
    outdata, errdata = util.run_command(cmd)
    state = outdata.strip()
    if state == "disabled" or state == "maintenance":
        util.debug("Plugin: %s is in %s state. Skipping execution" \
                   % (self.smfInst.instanceName, state), \
                   self.verbose)
        return

    cmd = self.smfInst.get_trigger_command()
    util.debug("Executing plugin command: %s" % str(cmd), self.verbose)
    svcFmri = "%s:%s" % (autosnapsmf.BASESVC, schedule)

    os.putenv("AUTOSNAP_FMRI", svcFmri)
    os.putenv("AUTOSNAP_LABEL", label)
    try:
        os.putenv("PLUGIN_FMRI", self.smfInst.instanceName)
        self._proc = subprocess.Popen(cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      close_fds=True)
    except OSError as message:
        # Bug fix: reset _proc BEFORE raising.  The original placed
        # "self._proc = None" after the raise, making it unreachable.
        self._proc = None
        raise RuntimeError("%s subprocess error:\n %s" % \
                           (cmd, str(message)))
def main(argv):
    """Entry point for the rsync time-slider plugin trigger.

    Reads the snapshot label/FMRI environment exported by time-sliderd
    and marks each matching filesystem snapshot as "pending" rsync via
    a zfs user property.  Exits non-zero on missing environment.
    NOTE(review): the visible portion ends after the tagging loop; the
    plugin may continue beyond this chunk — confirm against the full file.
    """
    # Check that appropriate environment variables have been
    # provided by time-sliderd
    #
    # The label used for the snapshot set just taken, ie. the
    # component proceeding the "@" in the snapshot name
    snaplabel = os.getenv("AUTOSNAP_LABEL")
    # The SMF fmri of the auto-snapshot instance corresponding to
    # the snapshot set just taken.
    snapfmri = os.getenv("AUTOSNAP_FMRI")
    # The SMF fmri of the time-slider plugin instance associated with
    # this command.
    pluginfmri = os.getenv("PLUGIN_FMRI")

    if pluginfmri == None:
        sys.stderr.write("No time-slider plugin SMF instance FMRI defined. " \
                         "This plugin does not support command line "
                         "execution. Exiting\n")
        sys.exit(-1)
    syslog.openlog(pluginfmri, 0, syslog.LOG_DAEMON)

    # Plugin verbosity is an SMF property on the plugin instance.
    cmd = [smf.SVCPROPCMD, "-p", verboseprop, pluginfmri]
    outdata, errdata = util.run_command(cmd)
    if outdata.rstrip() == "true":
        verbose = True
    else:
        verbose = False

    if snaplabel == None:
        log_error(syslog.LOG_ERR,
                  "No snapshot label provided. Exiting")
        sys.exit(-1)
    if snapfmri == None:
        log_error(syslog.LOG_ERR,
                  "No auto-snapshot SMF instance FMRI provided. Exiting")
        sys.exit(-1)

    schedule = snapfmri.rsplit(':', 1)[1]
    plugininstance = pluginfmri.rsplit(':', 1)[1]

    # The user property/tag used when tagging and holding zfs datasets
    propname = "%s:%s" % (propbasename, plugininstance)

    # Identifying snapshots is a 3 stage process.
    #
    # First: identify all snapshots matching the AUTOSNAP_LABEL
    # value passed in by the time-slider daemon.
    #
    # Second: Filter out snapshots of volumes, since rsync can only
    # back up filesystems.
    #
    # Third: we need to filter the results and ensure that the
    # filesystem corresponding to each snapshot is actually
    # tagged with the property (com.sun:auto-snapshot<:schedule>)
    #
    # This is necessary to avoid confusion whereby a snapshot might
    # have been sent|received from one zpool to another on the same
    # system. The received snapshot will show up in the first pass
    # results but is not actually part of the auto-snapshot set
    # created by time-slider. It also avoids incorrectly placing
    # zfs holds on the imported snapshots.
    datasets = zfs.Datasets()
    candidates = datasets.list_snapshots(snaplabel)
    autosnapsets = datasets.list_auto_snapshot_sets(schedule)
    # Restrict to filesystems (not volumes) in the auto-snapshot set.
    autosnapfs = [name for [name, mount] in datasets.list_filesystems() \
                  if name in autosnapsets]
    snappeddatasets = []
    snapnames = [name for [name, ctime] in candidates \
                 if name.split('@', 1)[0] in autosnapfs]

    # Mark the snapshots with a user property. Doing this instead of
    # placing a physical hold on the snapshot allows time-slider to
    # expire the snapshots naturally or destroy them if a zpool fills
    # up and triggers a remedial cleanup.
    # It also prevents the possiblity of leaving snapshots lying around
    # indefinitely on the system if the plugin SMF instance becomes
    # disabled or having to release a pile of held snapshots.
    # We set org.opensolaris:time-slider-plugin:<instance> to "pending",
    # indicate
    snapshots = []
    for snap in snapnames:
        snapshot = zfs.Snapshot(snap)
        fs = zfs.Filesystem(snapshot.fsname)
        if fs.get_user_property(rsyncsmf.RSYNCFSTAG) == "true":
            if fs.is_mounted() == True:
                snapshot.set_user_property(propname, "pending")
                util.debug("Marking %s as pending rsync" % (snap), verbose)
            else:
                util.debug("Ignoring snapshot of unmounted fileystem: %s" \
                           % (snap), verbose)
def _configure_svc_props(self):
    """Load service configuration from SMF into instance attributes.

    Populates verbosity, the remedial cleanup flag, the three cleanup
    threshold levels (falling back to 80/90/95 on error), the
    keep-empty-snapshots flag, the snapshot label prefix/separator, and
    rebuilds self._zpools, skipping FAULTED pools.  Raises RuntimeError
    (fatal) only when the zpool list cannot be obtained.
    """
    try:
        self.verbose = self._smf.get_verbose()
    except RuntimeError as message:
        sys.stderr.write("Error determing whether debugging is enabled\n")
        self.verbose = False

    try:
        cleanup = self._smf.get_remedial_cleanup()
        warn = self._smf.get_cleanup_level("warning")
        util.debug("Warning level value is: %d%%" % warn, self.verbose)
        crit = self._smf.get_cleanup_level("critical")
        util.debug("Critical level value is: %d%%" % crit, self.verbose)
        emer = self._smf.get_cleanup_level("emergency")
        util.debug("Emergency level value is: %d%%" % emer, self.verbose)
    except RuntimeError as message:
        sys.stderr.write("Failed to determine cleanup threshhold levels\n")
        sys.stderr.write("Details:\n" + \
                         "--------BEGIN ERROR MESSAGE--------\n" + \
                         str(message) + \
                         "\n---------END ERROR MESSAGE---------\n")
        sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
        # Go with defaults
        # FIXME - this would be an appropriate case to mark svc as degraded
        self._remedialCleanup = True
        self._warningLevel = 80
        self._criticalLevel = 90
        self._emergencyLevel = 95
    else:
        self._remedialCleanup = cleanup
        self._warningLevel = warn
        self._criticalLevel = crit
        self._emergencyLevel = emer

    try:
        self._keepEmpties = self._smf.get_keep_empties()
    except RuntimeError as message:
        # Not fatal, just assume we delete them (default configuration)
        sys.stderr.write(
            "Can't determine whether to keep empty snapshots\n")
        sys.stderr.write("Details:\n" + \
                         "--------BEGIN ERROR MESSAGE--------\n" + \
                         str(message) + \
                         "\n---------END ERROR MESSAGE---------\n")
        sys.stderr.write("Assuming default value: False\n")
        self._keepEmpties = False

    # Previously, snapshot labels used the ":" character was used as a
    # separator character for datestamps. Windows filesystems such as
    # CIFS and FAT choke on this character so now we use a user definable
    # separator value, with a default value of "_"
    # We need to check for both the old and new format when looking for
    # snapshots.
    self._separator = self._smf.get_separator()
    self._prefix = "%s[:%s]" \
        % (autosnapsmf.SNAPLABELPREFIX, self._separator)

    # Rebuild pool list
    self._zpools = []
    try:
        for poolname in zfs.list_zpools():
            # Do not try to examine FAULTED pools
            zpool = zfs.ZPool(poolname)
            if zpool.health == "FAULTED":
                util.debug("Ignoring faulted Zpool: %s\n" \
                           % (zpool.name), \
                           self.verbose)
            else:
                self._zpools.append(zpool)
                util.debug(str(zpool), self.verbose)
    except RuntimeError as message:
        sys.stderr.write("Could not list Zpools\n")
        self.exitCode = smf.SMF_EXIT_ERR_FATAL
        # Propogate exception up to thread's run() method
        raise RuntimeError(message)
def execute_plugins(self, schedule, label):
    """Invoke every registered plugin for the given schedule/label."""
    util.debug("Executing plugins for \"%s\" with label: \"%s\""
               % (schedule, label), self.verbose)
    for registered in self.plugins:
        registered.execute(schedule, label)
def main(argv):
    """Entry point for the zfs send/receive backup plugin trigger.

    Reads the snapshot label/FMRI environment exported by time-slider,
    places holds on the newly created snapshots, then pipes a full or
    incremental "zfs send" stream into the configured receive command
    (both via pfexec).  On success records the label in a zfs user
    property and releases the hold on the previous snapshot.  Any
    failure logs to syslog, drops the plugin into maintenance and exits.
    NOTE(review): this block uses Python-2-only syntax (octal 01,
    "raise E, msg", "except E, v") unlike the py3-style blocks in this
    file — left intact here; confirm which interpreter is targeted.
    """
    # Check appropriate environment variables habe been supplied
    # by time-slider
    #
    # The label used for the snapshot set just taken, ie. the
    # component proceeding the "@" in the snapshot name
    snaplabel = os.getenv("AUTOSNAP_LABEL")
    # The SMF fmri of the auto-snapshot instance corresponding to
    # the snapshot set just taken.
    snapfmri = os.getenv("AUTOSNAP_FMRI")
    # The SMF fmri of the time-slider plugin instance associated with
    # this command.
    pluginfmri = os.getenv("PLUGIN_FMRI")

    if pluginfmri == None:
        sys.stderr.write("No time-slider plugin SMF instance FMRI defined. " \
                         "This plugin does not support command line "
                         "execution. Exiting\n")
        sys.exit(-1)
    syslog.openlog(pluginfmri, 0, syslog.LOG_DAEMON)

    # Plugin verbosity is an SMF property on the plugin instance.
    cmd = [smf.SVCPROPCMD, "-p", verboseprop, pluginfmri]
    outdata,errdata = util.run_command(cmd)
    if outdata.rstrip() == "true":
        verbose = True
    else:
        verbose = False

    if snaplabel == None:
        log_error(syslog.LOG_ERR,
                  "No snapshot label defined. Exiting")
        sys.exit(-1)
    if snapfmri == None:
        log_error(syslog.LOG_ERR,
                  "No auto-snapshot SMF instance FMRI defined. Exiting")
        sys.exit(-1)

    schedule = snapfmri.rsplit(':', 1)[1]
    plugininstance = pluginfmri.rsplit(':', 1)[1]

    # The user property/tag used when tagging and holding zfs datasets
    propname = "%s:%s" % (propbasename, plugininstance)

    # Identifying snapshots is a two stage process.
    #
    # First: identify all snapshots matching the AUTOSNAP_LABEL
    # value passed in by the time-slider daemon.
    #
    # Second: we need to filter the results and ensure that the
    # filesystem/voluem corresponding to each snapshot is actually
    # tagged with the property (com.sun:auto-snapshot<:schedule>)
    #
    # This is necessary to avoid confusion whereby a snapshot might
    # have been sent|received from one zpool to another on the same
    # system. The received snapshot will show up in the first pass
    # results but is not actually part of the auto-snapshot set
    # created by time-slider. It also avoids incorrectly placing
    # zfs holds on the imported snapshots.
    datasets = zfs.Datasets()
    candidates = datasets.list_snapshots(snaplabel)
    originsets = datasets.list_auto_snapshot_sets(schedule)
    snappeddatasets = []
    snapnames = [name for [name,ctime] in candidates \
                 if name.split('@',1)[0] in originsets]

    # Place a hold on the the newly created snapshots so
    # they can be backed up without fear of being destroyed
    # before the backup gets a chance to complete.
    for snap in snapnames:
        snapshot = zfs.Snapshot(snap)
        holds = snapshot.holds()
        try:
            # Only hold if we don't already hold it.
            holds.index(propname)
        except ValueError:
            util.debug("Placing hold on %s" % (snap), verbose)
            snapshot.hold(propname)
        datasetname = snapshot.fsname
        # Insert datasetnames in alphabetically sorted order because
        # zfs receive falls over if it receives a child before the
        # parent if the "-F" option is not used.
        insort(snappeddatasets, datasetname)

    # Find out the receive command property value
    cmd = [smf.SVCPROPCMD, "-c", "-p", "receive/command", pluginfmri]
    outdata,errdata = util.run_command(cmd)
    # Strip out '\' characters inserted by svcprop
    recvcmd = outdata.strip().replace('\\', '').split()

    # Check to see if the receive command is accessible and executable
    try:
        statinfo = os.stat(recvcmd[0])
        other_x = (statinfo.st_mode & 01)
        if other_x == 0:
            log_error(syslog.LOG_ERR,
                      "Plugin: %s: Configured receive/command is not " \
                      "executable: %s" \
                      % (pluginfmri, outdata))
            maintenance(pluginfmri)
            sys.exit(-1)
    except OSError:
        log_error(syslog.LOG_ERR,
                  "Plugin: %s: Can not access the configured " \
                  "receive/command: %s" \
                  % (pluginfmri, outdata))
        maintenance(pluginfmri)
        sys.exit(-1)

    for dataset in snappeddatasets:
        sendcmd = None
        prevsnapname = None
        ds = zfs.ReadableDataset(dataset)
        prevlabel = ds.get_user_property(propname)

        snapname = "%s@%s" % (ds.name, snaplabel)
        if (prevlabel == None or prevlabel == '-' or len(prevlabel) == 0):
            # No previous backup - send a full replication stream
            sendcmd = [zfs.ZFSCMD, "send", snapname]
            util.debug("No previous backup registered for %s" % ds.name, verbose)
        else:
            # A record of a previous backup exists.
            # Check that it exists to enable send of an incremental stream.
            prevsnapname = "%s@%s" % (ds.name, prevlabel)
            util.debug("Previously sent snapshot: %s" % prevsnapname, verbose)
            prevsnap = zfs.Snapshot(prevsnapname)
            if prevsnap.exists():
                sendcmd = [zfs.ZFSCMD, "send", "-i", prevsnapname, snapname]
            else:
                # This should not happen under normal operation since we
                # place a hold on the snapshot until it gets sent. So
                # getting here suggests that something else released the
                # hold on the snapshot, allowing it to get destroyed
                # prematurely.
                log_error(syslog.LOG_ERR,
                          "Previously sent snapshot no longer exists: %s" \
                          % prevsnapname)
                maintenance(pluginfmri)
                sys.exit(-1)

        # Invoke the send and receive commands via pfexec(1) since
        # we are not using the role's shell to take care of that
        # for us.
        sendcmd.insert(0, smf.PFCMD)
        recvcmd.insert(0, smf.PFCMD)
        try:
            sendP = subprocess.Popen(sendcmd,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     close_fds=True)
            recvP = subprocess.Popen(recvcmd,
                                     stdin=sendP.stdout,
                                     stderr=subprocess.PIPE,
                                     close_fds=True)
            recvout,recverr = recvP.communicate()
            recverrno = recvP.wait()
            sendout,senderr = sendP.communicate()
            senderrno = sendP.wait()

            if senderrno != 0:
                raise RuntimeError, "Send command: %s failed with exit code" \
                                    "%d. Error message: \n%s" \
                                    % (str(sendcmd), senderrno, senderr)
            if recverrno != 0:
                raise RuntimeError, "Receive command %s failed with exit " \
                                    "code %d. Error message: \n%s" \
                                    % (str(recvcmd), recverrno, recverr)

            if prevsnapname != None:
                util.debug("Releasing hold on %s" % (prevsnapname), verbose)
                snapshot = zfs.Snapshot(prevsnapname)
                util.debug("Releasing hold on previous snapshot: %s" \
                           % (prevsnapname), verbose)
                snapshot.release(propname)
        except Exception, message:
            log_error(syslog.LOG_ERR,
                      "Error during snapshot send/receive operation: %s" \
                      % (message))
            maintenance(pluginfmri)
            sys.exit(-1)

        # Finally, after success, make a record of the latest backup
        # and release the old snapshot.
        ds.set_user_property(propname, snaplabel)

    util.debug("Sending of \"%s\"snapshot streams completed" \
               % (snaplabel), verbose)
def _update_schedules(self):
    """Recompute the next due time for every snapshot schedule.

    For constant-length schedules the next time is the last snapshot
    time (or the last time of any dominant default schedule it overlaps
    with) plus interval * period.  "months" schedules are recalculated
    with calendar arithmetic since month lengths vary.  Updates
    self._next; raises RuntimeError on zfs listing failures or an
    invalid interval name.
    """
    interval = 0
    idx = 1  # Used to index subsets for schedule overlap calculation
    last = None

    for schedule, interval, period, keep in self._allSchedules:
        # Shortcut if we've already processed this schedule and it's
        # still up to date. Don't skip the default schedules though
        # because overlap affects their scheduling
        if [schedule, interval, period, keep] not in \
            self._defaultSchedules and \
            (self._next[schedule] > self._last[schedule]):
            util.debug("Short circuiting %s recalculation" \
                       % (schedule), \
                       self.verbose)
            continue

        # If we don't have an internal timestamp for the given schedule
        # ask zfs for the last snapshot and get it's creation timestamp.
        if self._last[schedule] == 0:
            try:
                snaps = self._datasets.list_snapshots("%s%s" % \
                                                      (self._prefix,
                                                       schedule))
            except RuntimeError as message:
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                sys.stderr.write(
                    "Failed to list snapshots during schedule update\n")
                # Propogate up to the thread's run() method
                raise RuntimeError(message)
            if len(snaps) > 0:
                util.debug("Last %s snapshot was: %s" % \
                           (schedule, snaps[-1][0]), \
                           self.verbose)
                self._last[schedule] = snaps[-1][1]

        last = self._last[schedule]
        if interval != "months":  # months is non-constant. See below.
            util.debug("Recalculating %s schedule" % (schedule), \
                       self.verbose)
            try:
                totalinterval = intervals[interval] * period
            except KeyError:
                self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                sys.stderr.write(schedule + \
                                 " schedule has invalid interval: " + \
                                 "'%s\'\n" % interval)
                # Propogate up to thread's run() method
                raise RuntimeError
            if [schedule, interval, period, keep] in self._defaultSchedules:
                # This is one of the default schedules so check for an
                # overlap with one of the dominant shchedules.
                for s, i, p, k in self._defaultSchedules[:idx]:
                    last = max(last, self._last[s])
                idx += 1
        else:  # interval == "months"
            if self._next[schedule] > last:
                util.debug("Short circuiting " + \
                           schedule + \
                           " recalculation", \
                           self.verbose)
                continue
            util.debug("Recalculating %s schedule" % (schedule), \
                       self.verbose)
            snap_tm = time.gmtime(self._last[schedule])
            # Increment year if period >= than 1 calender year.
            year = snap_tm.tm_year
            year += int(period / 12)
            period = period % 12

            mon = (snap_tm.tm_mon + period) % 12
            # Result of 0 actually means december.
            if mon == 0:
                mon = 12
            # Account for period that spans calendar year boundary.
            elif snap_tm.tm_mon + period > 12:
                year += 1

            # Clamp the day-of-month when the target month is shorter
            # than the month of the last snapshot.
            d, dlastmon = calendar.monthrange(snap_tm.tm_year,
                                              snap_tm.tm_mon)
            d, dnewmon = calendar.monthrange(year, mon)
            mday = snap_tm.tm_mday
            if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                mday = dnewmon

            tm = (year, mon, mday, \
                  snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                  0, 0, -1)
            newt = calendar.timegm(tm)
            new_tm = time.gmtime(newt)
            totalinterval = newt - self._last[schedule]

        self._next[schedule] = last + totalinterval
def _run_cleanup(self, zpool, schedule, threshold):
    """Destroy snapshots of the given schedule, oldest first, until
    zpool capacity drops to the threshold or none are left.

    Cloned snapshots are excluded.  Each destroyed snapshot name is
    appended to self._destroyedsnaps.  Raises RuntimeError (fatal) if
    snapshots can't be listed; individual destroy failures are only
    warned about.
    """
    clonedsnaps = []
    snapshots = []
    try:
        clonedsnaps = self._datasets.list_cloned_snapshots()
    except RuntimeError as message:
        # Non-fatal: treat the cloned list as empty and carry on.
        sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
                         " while recovering pool capacity\n")
        sys.stderr.write("Error details:\n" + \
                         "--------BEGIN ERROR MESSAGE--------\n" + \
                         str(message) + \
                         "\n--------END ERROR MESSAGE--------\n")

    # Build a list of snapshots in the given schedule, that are not
    # cloned, and sort the result in reverse chronological order.
    try:
        snapshots = [s for s, t in \
                     zpool.list_snapshots("%s%s" \
                                          % (self._prefix, schedule)) \
                     if not s in clonedsnaps]
        snapshots.reverse()
    except RuntimeError as message:
        sys.stderr.write("Error listing snapshots" +
                         " while recovering pool capacity\n")
        self.exitCode = smf.SMF_EXIT_ERR_FATAL
        # Propogate the error up to the thread's run() method.
        raise RuntimeError(message)

    while zpool.get_capacity() > threshold:
        if len(snapshots) == 0:
            syslog.syslog(syslog.LOG_NOTICE,
                          "No more %s snapshots left" \
                          % schedule)
            return

        """This is not an exact science. Deleteing a zero sized 
        snapshot can have unpredictable results. For example a
        pair of snapshots may share exclusive reference to a large
        amount of data (eg. a large core file). The usage of both
        snapshots will initially be seen to be 0 by zfs(1). Deleting
        one of the snapshots will make the data become unique to the
        single remaining snapshot that references it uniquely. The
        remaining snapshot's size will then show up as non zero. So
        deleting 0 sized snapshot is not as pointless as it might seem.
        It also means we have to loop through this, each snapshot set
        at a time and observe the before and after results. Perhaps
        better way exists...."""

        # Start with the oldest first
        snapname = snapshots.pop()
        snapshot = zfs.Snapshot(snapname)
        # It would be nicer, for performance purposes, to delete sets
        # of snapshots recursively but this might destroy more data than
        # absolutely necessary, plus the previous purging of zero sized
        # snapshots can easily break the recursion chain between
        # filesystems.
        # On the positive side there should be fewer snapshots and they
        # will mostly non-zero so we should get more effectiveness as a
        # result of deleting snapshots since they should be nearly always
        # non zero sized.
        util.debug("Destroying %s" % snapname, self.verbose)
        try:
            snapshot.destroy()
        except RuntimeError as message:
            # Would be nice to be able to mark service as degraded here
            # but it's better to try to continue on rather than to give
            # up alltogether (SMF maintenance state)
            sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
                             (snapshot.name))
            sys.stderr.write("Details:\n%s\n" % (str(message)))
        else:
            self._destroyedsnaps.append(snapname)
        # Give zfs some time to recalculate.
        time.sleep(3)
def _prune_snapshots(self, dataset, schedule):
    """Cleans out zero sized snapshots, kind of cautiously"""
    # Per schedule: We want to delete 0 sized
    # snapshots but we need to keep at least one around (the most
    # recent one) for each schedule so that that overlap is
    # maintained from frequent -> hourly -> daily etc.
    # Start off with the smallest interval schedule first and
    # move up. This increases the amount of data retained where
    # several snapshots are taken together like a frequent hourly
    # and daily snapshot taken at 12:00am. If 3 snapshots are all
    # identical and reference the same identical data they will all
    # be initially reported as zero for used size. Deleting the
    # daily first then the hourly would shift make the data referenced
    # by all 3 snapshots unique to the frequent scheduled snapshot.
    # This snapshot would probably be purged within an how ever and the
    # data referenced by it would be gone for good.
    # Doing it the other way however ensures that the data should
    # remain accessible to the user for at least a week as long as
    # the pool doesn't run low on available space before that.
    try:
        snaps = dataset.list_snapshots("%s%s" % (self._prefix, schedule))
        # Clone the list because we want to remove items from it
        # while iterating through it.
        remainingsnaps = snaps[:]
    except RuntimeError as message:
        sys.stderr.write(
            "Failed to list snapshots during snapshot cleanup\n")
        self.exitCode = smf.SMF_EXIT_ERR_FATAL
        raise RuntimeError(message)

    if (self._keepEmpties == False):
        try:  # remove the newest one from the list.
            snaps.pop()
        except IndexError:
            pass
        for snapname in snaps:
            try:
                snapshot = zfs.Snapshot(snapname)
            except Exception as message:
                sys.stderr.write(str(message))
                # Not fatal, just skip to the next snapshot
                continue
            try:
                if snapshot.get_used_size() == 0:
                    util.debug("Destroying zero sized: " + snapname, \
                               self.verbose)
                    try:
                        snapshot.destroy()
                    except RuntimeError as message:
                        sys.stderr.write("Failed to destroy snapshot: " +
                                         snapname + "\n")
                        self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                        # Propogate exception so thread can exit
                        raise RuntimeError(message)
                    remainingsnaps.remove(snapname)
            except RuntimeError as message:
                sys.stderr.write("Can not determine used size of: " + \
                                 snapname + "\n")
                self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                # Propogate the exception to the thead run() method
                raise RuntimeError(message)

    # Deleting individual snapshots instead of recursive sets
    # breaks the recursion chain and leaves child snapshots
    # dangling so we need to take care of cleaning up the
    # snapshots.
    target = len(remainingsnaps) - self._keep[schedule]
    counter = 0
    while counter < target:
        util.debug("Destroy expired snapshot: " + \
                   remainingsnaps[counter], self.verbose)
        try:
            snapshot = zfs.Snapshot(remainingsnaps[counter])
        except Exception as message:
            sys.stderr.write(str(message))
            # Not fatal, just skip to the next snapshot
            counter += 1
            continue
        try:
            snapshot.destroy()
        except RuntimeError as message:
            sys.stderr.write("Failed to destroy snapshot: " +
                             snapshot.name + "\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propogate exception so thread can exit
            raise RuntimeError(message)
        else:
            counter += 1
def run(self):
    """Main loop of the snapshot manager thread.

    Deselects swap and dump volumes from auto-snapshotting, then loops
    forever: refreshes state, performs remedial cleanup when needed,
    takes any due snapshots, and sleeps on the condition variable until
    the next scheduled snapshot time (or 15 minutes when no schedules
    are online). Exits the thread on OSError or RuntimeError, setting
    self.exitCode appropriately.
    """
    # Deselect swap and dump volumes so they don't get snapshotted.
    for vol in self._datasets.list_volumes():
        name = vol.rsplit("/")
        try:
            # NOTE(review): checks path component [1], i.e. the first
            # component below the pool name — assumes swap/dump live
            # directly under the pool (e.g. rpool/swap). Confirm.
            if (name[1] == "swap" or name[1] == "dump"):
                util.debug("Auto excluding %s volume" % vol, self.verbose)
                volume = zfs.Volume(vol)
                volume.set_auto_snap(False)
        except IndexError:
            # Volume name had no "/" component; nothing to exclude.
            pass

    nexttime = None
    waittime = None
    while True:
        try:
            self.refresh()
            # First check and, if necessary, perform any remedial
            # cleanup. This is best done before creating any new
            # snapshots which may otherwise get immediately gobbled
            # up by the remedial cleanup.
            if self._needs_cleanup() == True:
                self._perform_cleanup()
                # Check to see if cleanup actually deleted anything
                # before notifying the user. Avoids the popup
                # appearing continuously.
                if len(self._destroyedsnaps) > 0:
                    self._send_notification()
                    self._send_to_syslog()

            nexttime = self._check_snapshots()
            # Overdue snapshots are already taken automatically
            # inside _check_snapshots() so nexttime should never be
            # < 0. It can be None however, which is fine since it
            # will cause the scheduler thread to sleep indefinitely
            # or until a SIGHUP is caught.
            if nexttime:
                util.debug("Waiting until " + str(nexttime), self.verbose)
            waittime = None
            if nexttime != None:
                waittime = nexttime - int(time.time())
                if (waittime <= 0):
                    # We took too long and missed a snapshot, so break
                    # out and catch up on it the next time through the
                    # loop.
                    continue
            # waittime could be None if no auto-snap schedules are online
            self._conditionLock.acquire()
            if waittime:
                util.debug("Waiting %d seconds" % (waittime), self.verbose)
                self._conditionLock.wait(waittime)
            else:
                # None. Just wait a while to check for cleanups.
                util.debug("No auto-snapshot schedules online.", \
                           self.verbose)
                self._conditionLock.wait(_MINUTE * 15)

        except OSError as message:
            sys.stderr.write("Caught OSError exception in snapshot" +
                             " manager thread\n")
            sys.stderr.write("Error details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             str(message) + \
                             "\n--------END ERROR MESSAGE--------\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Exit this thread
            break
        except RuntimeError as message:
            sys.stderr.write("Caught RuntimeError exception in snapshot" +
                             " manager thread\n")
            sys.stderr.write("Error details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             str(message) + \
                             "\n--------END ERROR MESSAGE--------\n")
            # Exit this thread; exitCode was already set by whichever
            # helper raised the RuntimeError.
            break
def main(argv):
    """Entry point for the rsync time-slider plugin.

    Reads its context from environment variables supplied by
    time-sliderd, then tags the filesystem snapshots of the just-taken
    snapshot set as "pending" rsync backup via a zfs user property.

    Exits with -1 if any of the required environment variables is
    missing. (Body continues beyond this excerpt.)
    """
    # Check that appropriate environment variables have been
    # provided by time-sliderd:
    #
    # The label used for the snapshot set just taken, ie. the
    # component preceding the "@" in the snapshot name.
    snaplabel = os.getenv("AUTOSNAP_LABEL")
    # The SMF fmri of the auto-snapshot instance corresponding to
    # the snapshot set just taken.
    snapfmri = os.getenv("AUTOSNAP_FMRI")
    # The SMF fmri of the time-slider plugin instance associated with
    # this command.
    pluginfmri = os.getenv("PLUGIN_FMRI")

    if pluginfmri == None:
        sys.stderr.write(
            "No time-slider plugin SMF instance FMRI defined. "
            "This plugin does not support command line "
            "execution. Exiting\n"
        )
        sys.exit(-1)
    syslog.openlog(pluginfmri, 0, syslog.LOG_DAEMON)

    # Read the plugin instance's verbose property to decide how chatty
    # debug output should be.
    cmd = [smf.SVCPROPCMD, "-p", verboseprop, pluginfmri]
    outdata, errdata = util.run_command(cmd)
    if outdata.rstrip() == "true":
        verbose = True
    else:
        verbose = False

    if snaplabel == None:
        log_error(syslog.LOG_ERR,
                  "No snapshot label provided. Exiting")
        sys.exit(-1)
    if snapfmri == None:
        log_error(syslog.LOG_ERR,
                  "No auto-snapshot SMF instance FMRI provided. Exiting")
        sys.exit(-1)

    # Schedule name and plugin instance name are the final ":"-separated
    # component of their respective FMRIs.
    schedule = snapfmri.rsplit(":", 1)[1]
    plugininstance = pluginfmri.rsplit(":", 1)[1]

    # The user property/tag used when tagging and holding zfs datasets
    propname = "%s:%s" % (propbasename, plugininstance)

    # Identifying snapshots is a 3 stage process.
    #
    # First: identify all snapshots matching the AUTOSNAP_LABEL
    # value passed in by the time-slider daemon.
    #
    # Second: Filter out snapshots of volumes, since rsync can only
    # back up filesystems.
    #
    # Third: we need to filter the results and ensure that the
    # filesystem corresponding to each snapshot is actually
    # tagged with the property (com.sun:auto-snapshot<:schedule>)
    #
    # This is necessary to avoid confusion whereby a snapshot might
    # have been sent|received from one zpool to another on the same
    # system. The received snapshot will show up in the first pass
    # results but is not actually part of the auto-snapshot set
    # created by time-slider. It also avoids incorrectly placing
    # zfs holds on the imported snapshots.
    datasets = zfs.Datasets()
    candidates = datasets.list_snapshots(snaplabel)
    autosnapsets = datasets.list_auto_snapshot_sets(schedule)
    # Filesystems (not volumes) that belong to the auto-snapshot set.
    autosnapfs = [name for [name, mount] in datasets.list_filesystems()
                  if name in autosnapsets]
    snappeddatasets = []
    # Candidate snapshots whose parent filesystem is in the set.
    snapnames = [name for [name, ctime] in candidates
                 if name.split("@", 1)[0] in autosnapfs]

    # Mark the snapshots with a user property. Doing this instead of
    # placing a physical hold on the snapshot allows time-slider to
    # expire the snapshots naturally or destroy them if a zpool fills
    # up and triggers a remedial cleanup.
    # It also prevents the possibility of leaving snapshots lying around
    # indefinitely on the system if the plugin SMF instance becomes
    # disabled or having to release a pile of held snapshots.
    # We set org.opensolaris:time-slider-plugin:<instance> to "pending"
    # to indicate the snapshot awaits processing by this plugin.
    snapshots = []
    for snap in snapnames:
        snapshot = zfs.Snapshot(snap)
        fs = zfs.Filesystem(snapshot.fsname)
        # Only snapshots of mounted, rsync-tagged filesystems are
        # marked as pending.
        if fs.get_user_property(rsyncsmf.RSYNCFSTAG) == "true":
            if fs.is_mounted() == True:
                snapshot.set_user_property(propname, "pending")
                util.debug("Marking %s as pending rsync" % (snap),
                           verbose)
            else:
                util.debug("Ignoring snapshot of unmounted fileystem: %s"
                           % (snap), verbose)