def backup_blobs_gzip(source, destination, keep=0):
    """Create a gzipped tar archive of the blobs in the source directory.

    The source is typically something like var/blobstorage and the
    destination something like var/blobstoragebackups.  A fresh
    archive of the source is written into the destination directory,
    which is created when it does not exist yet.

    'keep' is passed on to the cleanup step, to simply keep the last
    X backups.  When the tar command fails, no cleanup is done.

    Again, let's test this using the tools from zc.buildout:

    >>> mkdir('blobs')
    >>> write('blobs', 'one.txt', "File One")
    >>> write('blobs', 'two.txt', "File Two")
    >>> write('blobs', 'three.txt', "File Three")
    >>> mkdir('blobs', 'dir')
    >>> mkdir('backups')
    >>> backup_blobs_gzip('blobs', 'backups', keep=0)
    >>> ls('backups')
    -  blobs.0.tar.gz

    Change some stuff.

    >>> write('blobs', 'one.txt', "Changed File One")
    >>> write('blobs', 'four.txt', "File Four")
    >>> remove('blobs', 'two.txt')
    >>> backup_blobs_gzip('blobs', 'backups')
    >>> ls('backups')
    -  blobs.0.tar.gz
    -  blobs.1.tar.gz

    Cleanup:

    >>> remove('blobs')
    >>> remove('backups')

    """
    archive_prefix = os.path.basename(source)
    if not os.path.exists(destination):
        os.makedirs(destination)
    # Rotate the existing archives; afterwards the '.0' name must be
    # free, which we verify before writing to it.
    rotate_gzips(destination, archive_prefix)
    archive_path = os.path.join(destination, archive_prefix + '.0.tar.gz')
    if os.path.exists(archive_path):
        raise Exception("Path already exists: %s" % archive_path)
    # '-C source .' archives the contents of the source directory
    # without the leading directory name.
    cmd = "tar czf %s -C %s ." % (archive_path, source)
    logger.info(cmd)
    output, failed = utils.system(cmd)
    if output:
        print(output)
    if not failed:
        # Now possibly remove old backups.
        cleanup_gzips(destination, keep=keep)
def restore_blobs_gzip(source, destination, date=None):
    """Restore blobs from a gzip archive in source to destination.

    source is the directory that holds the '<name>.N.tar.gz' archives
    and destination is the blob directory that gets replaced.  <name>
    is the last path component of destination.

    With a date (UTC, formatted as yyyy-mm-dd[-hh[-mm[-ss]]]) we go
    through the reversed list of backups and restore the first one
    with a modification time at or after that date.  When the date
    cannot be parsed, is not a valid date, or no backup is recent
    enough, we fall back to the most recent backup:
    '<name>.0.tar.gz'.

    The destination is only removed once we know that the archive we
    want to restore exists.  When it is missing we log an error and
    return, leaving the destination untouched.

    Returns None in all cases.

    Prepare backup for test:

    >>> mkdir('blobs')
    >>> write('blobs', 'one.txt', "File One")
    >>> write('blobs', 'two.txt', "File Two")
    >>> write('blobs', 'three.txt', "File Three")
    >>> mkdir('blobs', 'dir')
    >>> mkdir('backups')
    >>> backup_blobs_gzip('blobs', 'backups', keep=0)
    >>> ls('backups')
    -  blobs.0.tar.gz

    Test restore:

    >>> remove('blobs')
    >>> restore_blobs_gzip('backups', 'blobs')
    >>> ls('blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt

    Cleanup:

    >>> remove('blobs')
    >>> remove('backups')

    """
    if destination.endswith(os.sep):
        # strip that separator
        destination = destination[:-len(os.sep)]
    base_name = os.path.basename(destination)

    # Determine the source (blob backup) that should be restored.
    backup_source = None
    if date is not None:
        # From repozo: specify UTC (not local) time in this format:
        # yyyy-mm-dd[-hh[-mm[-ss]]]
        # Note that this matches the 2011-10-05-12-12-45.fsz that is created.
        try:
            date_args = [int(num) for num in date.split('-')]
            # Constructing the datetime belongs in the try block too:
            # too few parts ('2011') raise a TypeError and impossible
            # values ('2011-13-01') a ValueError.
            target_datetime = datetime(*date_args)
        except (AttributeError, TypeError, ValueError):
            logger.info("Could not parse date argument to restore blobs: %r",
                        date)
            logger.info("Restoring most recent backup instead.")
        else:
            backup_gzips = get_blob_backup_gzips(source)
            # We want to find the first backup after the requested
            # modification time, so we reverse the order.
            backup_gzips.reverse()  # Note: this reverses in place.
            for num, mod_time, gzip_file in backup_gzips:
                backup_time = datetime.utcfromtimestamp(mod_time)
                if backup_time >= target_datetime:
                    backup_source = gzip_file
                    break
            if not backup_source:
                logger.warning(
                    "Could not find backup more recent than %r. Using "
                    "most recent instead.", date)

    if not backup_source:
        # The most recent is the default:
        backup_source = os.path.join(source, base_name + '.0.tar.gz')

    if not os.path.exists(backup_source):
        # Check this before the destructive step below: without an
        # archive there is nothing to restore, and removing the
        # destination would only lose the current blobs.
        logger.error("Blob backup %s does not exist. Not removing %s.",
                     backup_source, destination)
        return

    if os.path.exists(destination):
        logger.info("Removing %s", destination)
        shutil.rmtree(destination)
    os.mkdir(destination)
    logger.info("Extracting %s to %s", backup_source, destination)
    cmd = "tar xzf %s -C %s" % (backup_source, destination)
    logger.info(cmd)
    output, failed = utils.system(cmd)
    if output:
        print(output)
    if failed:
        return
def restore_blobs(source, destination, use_rsync=True,
                  date=None, gzip_blob=False, rsync_options=''):
    """Restore blobs from source to destination.

    With 'use_rsync' at the default True, we use rsync to copy,
    otherwise we use shutil.copytree.  This is mostly there for
    systems that don't have rsync available.  rsync is recommended.

    We could remove the destination first (with
    'shutil.rmtree(destination)'), but an 'rsync -a --delete' works
    faster.

    Note that trailing slashes in source and destination do matter, so
    be careful with that otherwise you may end up with something like
    var/blobstorage/blobstorage

    With 'gzip_blob' true, the work is handed over to
    restore_blobs_gzip and the rsync related arguments are ignored.

    With a date (UTC, formatted as yyyy-mm-dd[-hh[-mm[-ss]]]) we go
    through the reversed list of backup directories and restore the
    first one with a modification time at or after that date.  When
    the date cannot be parsed, is not a valid date, or no backup is
    recent enough, we fall back to the most recent backup:
    '<name>.0/<name>', where <name> is the last path component of
    destination.

    'rsync_options' is inserted as is into the rsync command line.

    Without rsync, an OSError is raised when the backup that should
    be restored does not exist.  This is raised before the
    destination is removed, so the current blobs are kept.

    Returns None.
    """
    if gzip_blob:
        restore_blobs_gzip(source, destination, date)
        return

    if destination.endswith(os.sep):
        # strip that separator
        destination = destination[:-len(os.sep)]
    base_name = os.path.basename(destination)
    dest_dir = os.path.dirname(destination)

    # Determine the source (blob backup) that should be restored.
    backup_source = None
    if date is not None:
        # From repozo: specify UTC (not local) time in this format:
        # yyyy-mm-dd[-hh[-mm[-ss]]]
        # Note that this matches the 2011-10-05-12-12-45.fsz that is created.
        try:
            date_args = [int(num) for num in date.split('-')]
            # Constructing the datetime belongs in the try block too:
            # too few parts ('2011') raise a TypeError and impossible
            # values ('2011-13-01') a ValueError.
            target_datetime = datetime(*date_args)
        except (AttributeError, TypeError, ValueError):
            logger.info("Could not parse date argument to restore blobs: %r",
                        date)
            logger.info("Restoring most recent backup instead.")
        else:
            backup_dirs = get_blob_backup_dirs(source)
            # We want to find the first backup after the requested
            # modification time, so we reverse the order.
            backup_dirs.reverse()  # Note: this reverses in place.
            for num, mod_time, directory in backup_dirs:
                backup_time = datetime.utcfromtimestamp(mod_time)
                if backup_time >= target_datetime:
                    backup_source = os.path.join(directory, base_name)
                    break
            if not backup_source:
                logger.warning(
                    "Could not find backup more recent than %r. Using "
                    "most recent instead.", date)

    if not backup_source:
        # The most recent is the default:
        backup_source = os.path.join(source, base_name + '.0', base_name)

    # You should end up with something like this:
    # rsync -a --delete var/blobstoragebackups/blobstorage.0/blobstorage var/
    if use_rsync:
        cmd = 'rsync -a %(options)s --delete %(source)s %(dest)s' % dict(
            options=rsync_options,
            source=backup_source,
            dest=dest_dir)
        logger.info(cmd)
        output, failed = utils.system(cmd)
        if output:
            print(output)
        if failed:
            return
    else:
        if not os.path.exists(backup_source):
            # copytree would fail on the missing backup anyway, but
            # only after we have thrown away the current blobs.  So
            # fail now, with the same exception type.
            raise OSError(
                "Blob backup %s does not exist. Not removing %s." % (
                    backup_source, destination))
        if os.path.exists(destination):
            logger.info("Removing %s", destination)
            shutil.rmtree(destination)
        logger.info("Copying %s to %s", backup_source, destination)
        shutil.copytree(backup_source, destination)
def backup_blobs(source, destination, full=False, use_rsync=True,
                 keep=0, keep_blob_days=0, gzip_blob=False, rsync_options=''):
    """Copy blobs from source to destination.

    Source is usually something like var/blobstorage and destination
    would be var/blobstoragebackups.  Within that destination we
    create a subdirectory with a fresh blob backup from the source.

    We can make a full backup or a partial backup.  Partial backups
    are done with rsync and hard links to save disk space.  Actually,
    full backups used to avoid the hard links, but that did not
    really have any extra value, so now it does the same thing, just
    in its own directory.

    With 'use_rsync' at the default True, we use rsync to copy,
    otherwise we use shutil.copytree.  This is mostly there for
    systems that don't have rsync available.  rsync is recommended.

    Note that we end up with something like var/blobstorage copied to
    var/blobbackups/blobstorage.0/blobstorage.  We could copy the
    contents of var/blobstorage directly to blobstorage.0, but then
    the disk space saving hard links do not work.

    keep_blob_days only makes sense in combination with full=False.
    We then use this to keep the backups created in the last
    'keep_blob_days' days.  For full backups we use 'keep' to simply
    keep the last X backups.  But for partial backups 'keep' should
    mean we keep the last X full Data.fs backups plus the partial
    backups created by repozo; and there is no similar concept in our
    blobstorage backups.

    With 'gzip_blob' true, the work is handed over to
    backup_blobs_gzip, which only gets the 'keep' option.

    Again, let's test this using the tools from zc.buildout:

    >>> mkdir('blobs')
    >>> write('blobs', 'one.txt', "File One")
    >>> write('blobs', 'two.txt', "File Two")
    >>> write('blobs', 'three.txt', "File Three")
    >>> mkdir('blobs', 'dir')
    >>> mkdir('backups')
    >>> backup_blobs('blobs', 'backups')
    >>> ls('backups')
    d  blobs.0
    >>> ls('backups', 'blobs.0')
    d  blobs
    >>> ls('backups', 'blobs.0', 'blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt

    Change some stuff.

    >>> write('blobs', 'one.txt', "Changed File One")
    >>> write('blobs', 'four.txt', "File Four")
    >>> remove('blobs', 'two.txt')
    >>> backup_blobs('blobs', 'backups')
    >>> ls('backups')
    d  blobs.0
    d  blobs.1
    >>> ls('backups', 'blobs.1', 'blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt
    >>> ls('backups', 'blobs.0', 'blobs')
    d  dir
    -  four.txt
    -  one.txt
    -  three.txt
    >>> cat('backups', 'blobs.1', 'blobs', 'one.txt')
    File One
    >>> cat('backups', 'blobs.0', 'blobs', 'one.txt')
    Changed File One

    Check the file stats to see if they are really hard links:

    >>> import os
    >>> stat_0 = os.stat(os.path.join('backups', 'blobs.0', 'blobs',
    ...                               'three.txt'))
    >>> stat_1 = os.stat(os.path.join('backups', 'blobs.1', 'blobs',
    ...                               'three.txt'))
    >>> stat_0.st_ino == stat_1.st_ino
    True

    Cleanup:

    >>> remove('blobs')
    >>> remove('backups')

    We do exactly the same (if developers remember to copy changes
    done above to below) but now using full backups.

    >>> mkdir('blobs')
    >>> write('blobs', 'one.txt', "File One")
    >>> write('blobs', 'two.txt', "File Two")
    >>> write('blobs', 'three.txt', "File Three")
    >>> mkdir('blobs', 'dir')
    >>> mkdir('backups')
    >>> backup_blobs('blobs', 'backups', full=True)
    >>> ls('backups')
    d  blobs.0
    >>> ls('backups', 'blobs.0')
    d  blobs
    >>> ls('backups', 'blobs.0', 'blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt

    Change some stuff.

    >>> write('blobs', 'one.txt', "Changed File One")
    >>> write('blobs', 'four.txt', "File Four")
    >>> remove('blobs', 'two.txt')
    >>> backup_blobs('blobs', 'backups', full=True)
    >>> ls('backups')
    d  blobs.0
    d  blobs.1
    >>> ls('backups', 'blobs.1', 'blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt
    >>> ls('backups', 'blobs.0', 'blobs')
    d  dir
    -  four.txt
    -  one.txt
    -  three.txt
    >>> cat('backups', 'blobs.1', 'blobs', 'one.txt')
    File One
    >>> cat('backups', 'blobs.0', 'blobs', 'one.txt')
    Changed File One

    Check the file stats.  We did full copies, but these should still
    be hard links.

    >>> import os
    >>> stat_0 = os.stat(os.path.join('backups', 'blobs.0', 'blobs',
    ...                               'three.txt'))
    >>> stat_1 = os.stat(os.path.join('backups', 'blobs.1', 'blobs',
    ...                               'three.txt'))
    >>> stat_0.st_ino == stat_1.st_ino
    True

    Cleanup:

    >>> remove('blobs')
    >>> remove('backups')

    """
    base_name = os.path.basename(source)
    if gzip_blob:
        backup_blobs_gzip(source, destination, keep)
        return

    rotate_directories(destination, base_name)
    previous_backup = os.path.join(destination, base_name + '.1')
    new_backup = os.path.join(destination, base_name + '.0')

    if not use_rsync:
        if not os.path.exists(new_backup):
            # The parent directory must exist for shutil.copytree
            # in python2.4.
            os.makedirs(new_backup)
        copy_target = os.path.join(new_backup, base_name)
        logger.info("Copying %s to %s", source, copy_target)
        shutil.copytree(source, copy_target)
    else:
        if os.path.exists(previous_backup):
            # Make a 'partial' backup by reusing the previous backup.
            # We might not want to do this for full backups, but this
            # is a lot faster and the end result really is the same,
            # so why not.
            if not os.path.isdir(previous_backup):
                # Should have been caught already.
                raise Exception("%s must be a directory" % previous_backup)
            # Hardlink against the previous directory.  Done by hand
            # it would be:
            # rsync -a --delete --link-dest=../blobstorage.1 blobstorage/
            #     backups/blobstorage.0
            template = ('rsync -a %(options)s --delete '
                        '--link-dest=%(link)s %(source)s %(dest)s')
            values = dict(
                options=rsync_options,
                link=os.path.join(os.pardir, base_name + '.1'),
                source=source,
                dest=new_backup)
        else:
            # No previous directory to hardlink against.
            template = 'rsync -a %(options)s %(source)s %(dest)s'
            values = dict(
                options=rsync_options,
                source=source,
                dest=new_backup)
        cmd = template % values
        logger.info(cmd)
        output, failed = utils.system(cmd)
        if output:
            print(output)
        if failed:
            # Do not clean up old backups when the new one failed.
            return

    # Now possibly remove old backups.
    cleanup(destination, full, keep, keep_blob_days)
def restore_blobs_gzip(source, destination, date=None):
    """Restore blobs from a gzip archive in source to destination.

    source is the directory that holds the '<name>.N.tar.gz' archives
    and destination is the blob directory that gets replaced.  <name>
    is the last path component of destination.

    With a date (UTC, formatted as yyyy-mm-dd[-hh[-mm[-ss]]]) we go
    through the reversed list of backups and restore the first one
    with a modification time at or after that date.  When the date
    cannot be parsed, is not a valid date, or no backup is recent
    enough, we fall back to the most recent backup:
    '<name>.0.tar.gz'.

    The destination is only removed once we know that the archive we
    want to restore exists.  When it is missing we log an error and
    return, leaving the destination untouched.

    Returns None in all cases.

    Prepare backup for test:

    >>> mkdir('blobs')
    >>> write('blobs', 'one.txt', "File One")
    >>> write('blobs', 'two.txt', "File Two")
    >>> write('blobs', 'three.txt', "File Three")
    >>> mkdir('blobs', 'dir')
    >>> mkdir('backups')
    >>> backup_blobs_gzip('blobs', 'backups', keep=0)
    >>> ls('backups')
    -  blobs.0.tar.gz

    Test restore:

    >>> remove('blobs')
    >>> restore_blobs_gzip('backups', 'blobs')
    >>> ls('blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt

    Cleanup:

    >>> remove('blobs')
    >>> remove('backups')

    """
    if destination.endswith(os.sep):
        # strip that separator
        destination = destination[:-len(os.sep)]
    base_name = os.path.basename(destination)

    # Determine the source (blob backup) that should be restored.
    backup_source = None
    if date is not None:
        # From repozo: specify UTC (not local) time in this format:
        # yyyy-mm-dd[-hh[-mm[-ss]]]
        # Note that this matches the 2011-10-05-12-12-45.fsz that is created.
        try:
            date_args = [int(num) for num in date.split('-')]
            # Building the datetime is part of validating the date:
            # too few parts ('2011') raise a TypeError and impossible
            # values ('2011-13-01') a ValueError.
            target_datetime = datetime(*date_args)
        except (AttributeError, TypeError, ValueError):
            logger.info("Could not parse date argument to restore blobs: %r",
                        date)
            logger.info("Restoring most recent backup instead.")
        else:
            backup_gzips = get_blob_backup_gzips(source)
            # We want to find the first backup after the requested
            # modification time, so we reverse the order.
            backup_gzips.reverse()  # Note: this reverses in place.
            for num, mod_time, gzip_file in backup_gzips:
                backup_time = datetime.utcfromtimestamp(mod_time)
                if backup_time >= target_datetime:
                    backup_source = gzip_file
                    break
            if not backup_source:
                logger.warning(
                    "Could not find backup more recent than %r. Using "
                    "most recent instead.", date)

    if not backup_source:
        # The most recent is the default:
        backup_source = os.path.join(source, base_name + '.0.tar.gz')

    if not os.path.exists(backup_source):
        # Check this before the destructive step below: without an
        # archive there is nothing to restore, and removing the
        # destination would only lose the current blobs.
        logger.error("Blob backup %s does not exist. Not removing %s.",
                     backup_source, destination)
        return

    if os.path.exists(destination):
        logger.info("Removing %s", destination)
        shutil.rmtree(destination)
    os.mkdir(destination)
    logger.info("Extracting %s to %s", backup_source, destination)
    cmd = "tar xzf %s -C %s" % (backup_source, destination)
    logger.info(cmd)
    output, failed = utils.system(cmd)
    if output:
        print(output)
    if failed:
        return
def restore_blobs(source, destination, use_rsync=True,
                  date=None, gzip_blob=False, rsync_options=''):
    """Restore blobs from source to destination.

    With 'use_rsync' at the default True, we use rsync to copy,
    otherwise we use shutil.copytree.  This is mostly there for
    systems that don't have rsync available.  rsync is recommended.

    We could remove the destination first (with
    'shutil.rmtree(destination)'), but an 'rsync -a --delete' works
    faster.

    Note that trailing slashes in source and destination do matter, so
    be careful with that otherwise you may end up with something like
    var/blobstorage/blobstorage

    With 'gzip_blob' true, the work is handed over to
    restore_blobs_gzip and the rsync related arguments are ignored.

    With a date (UTC, formatted as yyyy-mm-dd[-hh[-mm[-ss]]]) we go
    through the reversed list of backup directories and restore the
    first one with a modification time at or after that date.  When
    the date cannot be parsed, is not a valid date, or no backup is
    recent enough, we fall back to the most recent backup:
    '<name>.0/<name>', where <name> is the last path component of
    destination.

    'rsync_options' is inserted as is into the rsync command line.

    Without rsync, an OSError is raised when the backup that should
    be restored does not exist.  This is raised before the
    destination is removed, so the current blobs are kept.

    Returns None.
    """
    if gzip_blob:
        restore_blobs_gzip(source, destination, date)
        return

    if destination.endswith(os.sep):
        # strip that separator
        destination = destination[:-len(os.sep)]
    base_name = os.path.basename(destination)
    dest_dir = os.path.dirname(destination)

    # Determine the source (blob backup) that should be restored.
    backup_source = None
    if date is not None:
        # From repozo: specify UTC (not local) time in this format:
        # yyyy-mm-dd[-hh[-mm[-ss]]]
        # Note that this matches the 2011-10-05-12-12-45.fsz that is created.
        try:
            date_args = [int(num) for num in date.split('-')]
            # Building the datetime is part of validating the date:
            # too few parts ('2011') raise a TypeError and impossible
            # values ('2011-13-01') a ValueError.
            target_datetime = datetime(*date_args)
        except (AttributeError, TypeError, ValueError):
            logger.info("Could not parse date argument to restore blobs: %r",
                        date)
            logger.info("Restoring most recent backup instead.")
        else:
            backup_dirs = get_blob_backup_dirs(source)
            # We want to find the first backup after the requested
            # modification time, so we reverse the order.
            backup_dirs.reverse()  # Note: this reverses in place.
            for num, mod_time, directory in backup_dirs:
                backup_time = datetime.utcfromtimestamp(mod_time)
                if backup_time >= target_datetime:
                    backup_source = os.path.join(directory, base_name)
                    break
            if not backup_source:
                logger.warning(
                    "Could not find backup more recent than %r. Using "
                    "most recent instead.", date)

    if not backup_source:
        # The most recent is the default:
        backup_source = os.path.join(source, base_name + '.0', base_name)

    # You should end up with something like this:
    # rsync -a --delete var/blobstoragebackups/blobstorage.0/blobstorage var/
    if use_rsync:
        cmd = 'rsync -a %(options)s --delete %(source)s %(dest)s' % dict(
            options=rsync_options,
            source=backup_source,
            dest=dest_dir)
        logger.info(cmd)
        output, failed = utils.system(cmd)
        if output:
            print(output)
        if failed:
            return
    else:
        if not os.path.exists(backup_source):
            # copytree would fail on the missing backup anyway, but
            # only after we have thrown away the current blobs.  So
            # fail now, with the same exception type.
            raise OSError(
                "Blob backup %s does not exist. Not removing %s." % (
                    backup_source, destination))
        if os.path.exists(destination):
            logger.info("Removing %s", destination)
            shutil.rmtree(destination)
        logger.info("Copying %s to %s", backup_source, destination)
        shutil.copytree(backup_source, destination)
def backup_blobs(source, destination, full=False, use_rsync=True,
                 keep=0, keep_blob_days=0, gzip_blob=False, rsync_options=''):
    """Copy blobs from source to destination.

    Source is usually something like var/blobstorage and destination
    would be var/blobstoragebackups.  Within that destination we
    create a subdirectory with a fresh blob backup from the source.

    We can make a full backup or a partial backup.  Partial backups
    are done with rsync and hard links to save disk space.  Actually,
    full backups used to avoid the hard links, but that did not
    really have any extra value, so now it does the same thing, just
    in its own directory.

    With 'use_rsync' at the default True, we use rsync to copy,
    otherwise we use shutil.copytree.  This is mostly there for
    systems that don't have rsync available.  rsync is recommended.

    Note that we end up with something like var/blobstorage copied to
    var/blobbackups/blobstorage.0/blobstorage.  We could copy the
    contents of var/blobstorage directly to blobstorage.0, but then
    the disk space saving hard links do not work.

    keep_blob_days only makes sense in combination with full=False.
    We then use this to keep the backups created in the last
    'keep_blob_days' days.  For full backups we use 'keep' to simply
    keep the last X backups.  But for partial backups 'keep' should
    mean we keep the last X full Data.fs backups plus the partial
    backups created by repozo; and there is no similar concept in our
    blobstorage backups.

    With 'gzip_blob' true, the work is handed over to
    backup_blobs_gzip, which only gets the 'keep' option.

    Again, let's test this using the tools from zc.buildout:

    >>> mkdir('blobs')
    >>> write('blobs', 'one.txt', "File One")
    >>> write('blobs', 'two.txt', "File Two")
    >>> write('blobs', 'three.txt', "File Three")
    >>> mkdir('blobs', 'dir')
    >>> mkdir('backups')
    >>> backup_blobs('blobs', 'backups')
    >>> ls('backups')
    d  blobs.0
    >>> ls('backups', 'blobs.0')
    d  blobs
    >>> ls('backups', 'blobs.0', 'blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt

    Change some stuff.

    >>> write('blobs', 'one.txt', "Changed File One")
    >>> write('blobs', 'four.txt', "File Four")
    >>> remove('blobs', 'two.txt')
    >>> backup_blobs('blobs', 'backups')
    >>> ls('backups')
    d  blobs.0
    d  blobs.1
    >>> ls('backups', 'blobs.1', 'blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt
    >>> ls('backups', 'blobs.0', 'blobs')
    d  dir
    -  four.txt
    -  one.txt
    -  three.txt
    >>> cat('backups', 'blobs.1', 'blobs', 'one.txt')
    File One
    >>> cat('backups', 'blobs.0', 'blobs', 'one.txt')
    Changed File One

    Check the file stats to see if they are really hard links:

    >>> import os
    >>> stat_0 = os.stat(os.path.join('backups', 'blobs.0', 'blobs',
    ...                               'three.txt'))
    >>> stat_1 = os.stat(os.path.join('backups', 'blobs.1', 'blobs',
    ...                               'three.txt'))
    >>> stat_0.st_ino == stat_1.st_ino
    True

    Cleanup:

    >>> remove('blobs')
    >>> remove('backups')

    We do exactly the same (if developers remember to copy changes
    done above to below) but now using full backups.

    >>> mkdir('blobs')
    >>> write('blobs', 'one.txt', "File One")
    >>> write('blobs', 'two.txt', "File Two")
    >>> write('blobs', 'three.txt', "File Three")
    >>> mkdir('blobs', 'dir')
    >>> mkdir('backups')
    >>> backup_blobs('blobs', 'backups', full=True)
    >>> ls('backups')
    d  blobs.0
    >>> ls('backups', 'blobs.0')
    d  blobs
    >>> ls('backups', 'blobs.0', 'blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt

    Change some stuff.

    >>> write('blobs', 'one.txt', "Changed File One")
    >>> write('blobs', 'four.txt', "File Four")
    >>> remove('blobs', 'two.txt')
    >>> backup_blobs('blobs', 'backups', full=True)
    >>> ls('backups')
    d  blobs.0
    d  blobs.1
    >>> ls('backups', 'blobs.1', 'blobs')
    d  dir
    -  one.txt
    -  three.txt
    -  two.txt
    >>> ls('backups', 'blobs.0', 'blobs')
    d  dir
    -  four.txt
    -  one.txt
    -  three.txt
    >>> cat('backups', 'blobs.1', 'blobs', 'one.txt')
    File One
    >>> cat('backups', 'blobs.0', 'blobs', 'one.txt')
    Changed File One

    Check the file stats.  We did full copies, but these should still
    be hard links.

    >>> import os
    >>> stat_0 = os.stat(os.path.join('backups', 'blobs.0', 'blobs',
    ...                               'three.txt'))
    >>> stat_1 = os.stat(os.path.join('backups', 'blobs.1', 'blobs',
    ...                               'three.txt'))
    >>> stat_0.st_ino == stat_1.st_ino
    True

    Cleanup:

    >>> remove('blobs')
    >>> remove('backups')

    """
    base_name = os.path.basename(source)
    if gzip_blob:
        backup_blobs_gzip(source, destination, keep)
        return

    rotate_directories(destination, base_name)
    older = os.path.join(destination, base_name + '.1')
    latest = os.path.join(destination, base_name + '.0')

    if use_rsync:
        params = dict(options=rsync_options, source=source, dest=latest)
        if not os.path.exists(older):
            # No previous directory to hardlink against.
            cmd = 'rsync -a %(options)s %(source)s %(dest)s' % params
        elif not os.path.isdir(older):
            # Should have been caught already.
            raise Exception("%s must be a directory" % older)
        else:
            # Make a 'partial' backup by reusing the previous backup.
            # We might not want to do this for full backups, but this
            # is a lot faster and the end result really is the same,
            # so why not.
            #
            # Hardlink against the previous directory.  Done by hand
            # it would be:
            # rsync -a --delete --link-dest=../blobstorage.1 blobstorage/
            #     backups/blobstorage.0
            params['link'] = os.path.join(os.pardir, base_name + '.1')
            cmd = ('rsync -a %(options)s --delete --link-dest=%(link)s '
                   '%(source)s %(dest)s' % params)
        logger.info(cmd)
        output, failed = utils.system(cmd)
        if output:
            print(output)
        if failed:
            # Do not clean up old backups when the new one failed.
            return
    else:
        if not os.path.exists(latest):
            # The parent directory must exist for shutil.copytree
            # in python2.4.
            os.makedirs(latest)
        latest = os.path.join(latest, base_name)
        logger.info("Copying %s to %s", source, latest)
        shutil.copytree(source, latest)

    # Now possibly remove old backups.
    cleanup(destination, full, keep, keep_blob_days)