def build_replication_cache(context, tags, configurations, region, installed_region):
    """Build a giant cache of replication-relevant snapshots for region.

    For each tag key in ``tags``, collects every snapshot in ``region`` (owned
    by the accounts returned from get_owner_id) that carries that tag key.

    :param context: Lambda context object, used only for timeout checks
    :param tags: list of tag keys to search for (e.g. replication_src_region)
    :param configurations: unused here; kept for interface parity with callers
    :param region: AWS region to scan for snapshots
    :param installed_region: unused here; kept for interface parity
    :returns: dict mapping each tag key to a list of snapshot dicts
    """
    LOG.debug("Building cache of replication-relevant snapshots in %s", region)
    # all replication-related snapshots will have one or the other of these tags
    found_snapshots = {}
    region_owner_ids = get_owner_id(region)
    for tag in tags:
        found_snapshots[tag] = []
        params = {
            'Filters': [{
                'Name': 'tag-key',
                'Values': [tag]
            }],
            'OwnerIds': region_owner_ids,
        }
        paginator = build_snapshot_paginator(params, region)
        for page in paginator:
            # bail out mid-pagination if the Lambda is about to time out
            if timeout_check(context, 'perform_replication'):
                break
            # BUGFIX: was `and`, which let a non-empty page without a
            # 'Snapshots' key fall through and raise KeyError below
            if not page or 'Snapshots' not in page:
                continue
            for snapshot in page['Snapshots']:
                found_snapshots[tag].append(snapshot)

        # also stop scanning further tags when time is short
        if timeout_check(context, 'perform_replication'):
            break

    return found_snapshots
def build_replication_cache(context, tags, configurations, region, installed_region):
    """Build a giant cache of replication-relevant snapshots for region.

    Variant that paginates describe_snapshots directly via a boto3 client,
    sleeping briefly between tag scans to stay under EC2 API rate limits.

    :param context: Lambda context object, used only for timeout checks
    :param tags: list of tag keys to search for (e.g. replication_src_region)
    :param configurations: unused here; kept for interface parity with callers
    :param region: AWS region to scan for snapshots
    :param installed_region: unused here; kept for interface parity
    :returns: dict mapping each tag key to a list of snapshot dicts
    """
    LOG.debug("Building cache of replication-relevant snapshots in %s", region)
    # all replication-related snapshots will have one or the other of these tags
    found_snapshots = {}
    ec2 = boto3.client('ec2', region_name=region)
    region_owner_ids = get_owner_id(region)
    for tag in tags:
        found_snapshots[tag] = []
        paginator = ec2.get_paginator('describe_snapshots')
        operation_parameters = {
            'Filters': [{
                'Name': 'tag-key',
                'Values': [tag]
            }],
            'OwnerIds': region_owner_ids,
        }
        sleep(1)  # help w/ API limits
        for page in paginator.paginate(**operation_parameters):
            # bail out mid-pagination if the Lambda is about to time out
            if timeout_check(context, 'perform_replication'):
                break
            # BUGFIX: was `and`, which let a non-empty page without a
            # 'Snapshots' key fall through and raise KeyError below
            if not page or 'Snapshots' not in page:
                continue
            for snapshot in page['Snapshots']:
                found_snapshots[tag].append(snapshot)

        # also stop scanning further tags when time is short
        if timeout_check(context, 'perform_replication'):
            break

    return found_snapshots
def perform_snapshot(context, region, installed_region='us-east-1'):
    """Check the region and instance, and see if we should take any snapshots"""
    LOG.info('Reviewing snapshots in region %s', region)

    # pull every configuration rule so we can match instances against them
    configurations = dynamo.list_configurations(context, installed_region)
    LOG.debug('Fetched all possible configuration rules from DynamoDB')

    # IDs (instances or volumes) that must never be snapshotted
    ignore_ids = utils.build_ignore_list(configurations)

    # lookup tables: instance data/config plus the newest snapshot per volume
    cache_data = utils.build_cache_maps(context, configurations, region,
                                        installed_region)
    all_instances = cache_data['instance_id_to_data']
    instance_configs = cache_data['instance_id_to_config']
    volume_snap_recent = cache_data['volume_id_to_most_recent_snapshot_date']

    for instance_id in set(all_instances.keys()):
        # guard clauses: running out of time, or explicitly ignored
        if timeout_check(context, 'perform_snapshot'):
            break
        if instance_id in ignore_ids:
            continue

        # settings for this instance: how often, and how long to keep
        snapshot_settings = instance_configs[instance_id]
        retention, frequency = utils.parse_snapshot_settings(snapshot_settings)

        instance_record = all_instances[instance_id]
        ami_id = instance_record['ImageId']
        LOG.info('Reviewing snapshots in region %s on instance %s',
                 region, instance_id)

        for mapping in instance_record.get('BlockDeviceMappings', []):
            # re-check the clock before each volume's extra API calls
            if timeout_check(context, 'perform_snapshot'):
                break

            LOG.debug('Considering device %s', mapping)
            volume_id = mapping['Ebs']['VolumeId']
            if volume_id in ignore_ids:
                continue

            # when was this volume last snapshotted, and what time is it now?
            last_snapshot_date = volume_snap_recent.get(volume_id)
            now = datetime.datetime.now(dateutil.tz.tzutc())

            # skip unless the schedule says a snapshot is due
            if not should_perform_snapshot(frequency, now, volume_id,
                                           last_snapshot_date):
                LOG.debug('NOT Performing snapshot for %s', volume_id)
                continue
            LOG.debug('Performing snapshot for %s, calculating tags', volume_id)

            # tag the new snapshot with retention + now() as a Y-M-D
            delete_on = (now + retention).strftime('%Y-%m-%d')

            volume_record = utils.get_volume(volume_id, region=region)
            expected_tags = utils.calculate_relevant_tags(
                instance_record.get('Tags', None),
                volume_record.get('Tags', None))

            utils.snapshot_and_tag(
                instance_id,
                ami_id,
                volume_id,
                delete_on,
                region,
                additional_tags=expected_tags)
def clean_snapshot(context, region, default_min_snaps=5, installed_region='us-east-1'):
    """Check the region see if we should clean up any snapshots.

    Deletes snapshots tagged DeleteOn with a date in the last week, unless the
    volume would drop below its configured (or default) minimum snapshot count.

    :param context: Lambda context object, used only for timeout checks
    :param region: AWS region to clean snapshots in
    :param default_min_snaps: minimum snapshots to retain when no config found
    :param installed_region: region holding the DynamoDB configuration table
    """
    LOG.info('clean_snapshot in region %s', region)

    # fetch these, in case we need to figure out what applies to an instance
    configurations = dynamo.list_configurations(context, installed_region)
    LOG.debug('Fetched all possible configuration rules from DynamoDB')

    # build a list of any IDs (anywhere) that we should ignore
    ignore_ids = utils.build_ignore_list(configurations)

    # figure out if we're in an account-wide mode where we ignore retention and
    # destroy all snapshots with a delete_on value that we want to delete
    ignore_retention_enabled = utils.ignore_retention_enabled(configurations)

    cache_data = utils.build_cache_maps(context, configurations, region,
                                        installed_region)
    instance_configs = cache_data['instance_id_to_config']
    all_volumes = cache_data['volume_id_to_instance_id']
    volume_snap_count = cache_data['volume_id_to_snapshot_count']

    # figure out what dates we want to nuke
    today = datetime.date.today()
    delete_on_values = []
    for i in range(0, 8):  # seven days ago until today
        del_date = today + timedelta(days=-i)
        delete_on_values.append(del_date.strftime('%Y-%m-%d'))

    # setup counters before we start
    deleted_count = 0

    # setup our filters
    filters = [
        {
            'Name': 'tag-key',
            'Values': ['DeleteOn']
        },
        {
            'Name': 'tag-value',
            'Values': delete_on_values
        },
    ]
    params = {'Filters': filters}

    # paginate the snapshot list
    tag_paginator = utils.build_snapshot_paginator(params, region)
    for page in tag_paginator:
        # stop if we're running out of time
        if timeout_check(context, 'clean_snapshot'):
            break

        # BUGFIX: was `and`, which let a non-empty page without a
        # 'Snapshots' key fall through and raise KeyError below
        if not page or 'Snapshots' not in page:
            continue

        for snap in page['Snapshots']:
            # stop if we're running out of time
            if timeout_check(context, 'clean_snapshot'):
                break

            # ugly comprehension to strip out a tag
            delete_on = [
                r['Value'] for r in snap['Tags'] if r.get('Key') == 'DeleteOn'
            ][0]

            # volume for snapshot
            snapshot_volume = snap['VolumeId']
            minimum_snaps = default_min_snaps

            if snapshot_volume in ignore_ids:
                continue

            try:
                # given volume id, get the instance for it
                volume_instance = all_volumes.get(snapshot_volume, None)

                # minimum required
                if volume_instance is not None:
                    snapshot_settings = instance_configs.get(
                        volume_instance, None)
                    if snapshot_settings is not None:
                        try:
                            minimum_snaps = int(
                                snapshot_settings['snapshot']['minimum'])
                        except ValueError:
                            raise Exception(
                                "Minimum number of snaps configured is not an integer."
                            )

                # current number of snapshots
                if snapshot_volume in volume_snap_count:
                    no_snaps = volume_snap_count[snapshot_volume]
                else:
                    raise Exception(
                        'Could not count snapshots, missing volume')

                # if we have less than the minimum, don't delete this one
                if no_snaps <= minimum_snaps:
                    LOG.warn('Not deleting snapshot %s from %s (%s)',
                             snap['SnapshotId'], region, delete_on)
                    LOG.warn('Only %s snapshots exist, below minimum of %s',
                             no_snaps, minimum_snaps)
                    continue

            # IMPROVED: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; narrowed to Exception
            except Exception:
                # if we couldn't figure out a minimum of snapshots,
                # don't clean this up -- these could be orphaned snapshots
                LOG.warn(
                    'Error analyzing snapshot %s from %s, skipping... (%s)',
                    snap['SnapshotId'], region, delete_on)

                # skip this loop iteration unless ignore_retention_enabled
                if not ignore_retention_enabled:
                    continue

            log_snapcount = volume_snap_count.get(snapshot_volume, 'unknown') \
                if volume_snap_count else None
            LOG.warn('Deleting snapshot %s from %s (%s, count=%s > %s)',
                     snap['SnapshotId'], region, delete_on, log_snapcount,
                     minimum_snaps)
            deleted_count += utils.delete_snapshot(snap['SnapshotId'], region)

    if deleted_count <= 0:
        LOG.warn('No snapshots were cleaned up for the entire region %s',
                 region)
    else:
        LOG.info(
            'Function clean_snapshots_tagged completed, deleted count: %s',
            str(deleted_count))

    LOG.info('Function clean_snapshot completed')
def perform_replication(context, region, installed_region='us-east-1'):
    """Check the region and instance, and see if we should clean or create copies.

    Two passes over the replication cache for this region:
    1) snapshots tagged replication_src_region were copied *into* this region;
       if the original snapshot no longer exists in the source region, the
       local copy is deleted.
    2) snapshots tagged replication_dst_region should be copied *out of* this
       region; if no replica exists in the destination region yet, one is
       created and tagged.

    :param context: Lambda context object, used only for timeout checks
    :param region: AWS region this invocation is operating on
    :param installed_region: region holding the DynamoDB configuration table
    """
    LOG.info('Performing snapshot replication in region %s', region)

    # TL;DR -- always try to clean up first, before making new copies.

    # build a list of ignore IDs, just in case they are relevant here
    configurations = dynamo.list_configurations(context, installed_region)
    ignore_ids = utils.build_ignore_list(configurations)
    LOG.debug('Fetched all configured ignored IDs rules from DynamoDB')

    # 1. collect snapshots from this region
    relevant_tags = ['replication_src_region', 'replication_dst_region']
    found_snapshots = utils.build_replication_cache(context, relevant_tags,
                                                   configurations, region,
                                                   installed_region)

    # 2. evaluate snapshots that were copied to this region, if source not found, delete
    for snapshot in found_snapshots.get('replication_src_region', []):
        snapshot_id = snapshot['SnapshotId']
        snapshot_description = snapshot['Description']

        # stop entirely if the Lambda is close to timing out
        if timeout_check(context, 'perform_replication'):
            break

        if snapshot_id in ignore_ids:
            continue

        # in-flight or failed snapshots can't be safely evaluated yet
        if snapshot['State'] in ['pending', 'error']:
            LOG.warn('Skip cleaning up this snapshot ' + snapshot_id +
                     ' due to ' + snapshot['State'] + ' state: ' +
                     snapshot_description)
            continue

        LOG.info('Working on cleaning up this snapshot ' + snapshot_id +
                 ' (if needed): ' + snapshot_description)

        # what region did this come from?
        tag_pairs = snapshot.get('Tags', [])
        region_tag_pair = [
            x for x in tag_pairs
            if x.get('Key', None) == 'replication_src_region'
        ]
        region_tag_value = region_tag_pair[0].get('Value')

        # what snapshot id did this come from?
        # NOTE(review): assumes a replication_snapshot_id tag is always
        # present on copies; [0] would raise IndexError otherwise -- confirm
        snapshotid_tag_pair = [
            x for x in tag_pairs
            if x.get('Key', None) == 'replication_snapshot_id'
        ]
        snapshotid_tag_value = snapshotid_tag_pair[0].get('Value')

        # ask the source region whether the original snapshot still exists
        ec2_source = boto3.client('ec2', region_name=region_tag_value)
        try:
            found_originals = ec2_source.describe_snapshots(
                SnapshotIds=[snapshotid_tag_value
                             ],  # we think the original snapshot id is this
                Filters=[
                    # where it gets copied to should be us
                    {
                        'Name': 'tag:replication_dst_region',
                        'Values': [region]
                    },
                ])
        except Exception as err:
            # a missing snapshot id surfaces as InvalidSnapshot.NotFound;
            # treat that as "no originals", re-raise anything else
            if 'InvalidSnapshot.NotFound' in str(err):
                found_originals = {'Snapshots': []}
            else:
                raise err

        num_found = len(found_originals.get('Snapshots', []))
        if num_found > 0:
            LOG.info('Not removing this snapshot ' + snapshot_id + ' from ' +
                     region + ' since snapshot_id ' + snapshotid_tag_value +
                     ' was already found in ' + region_tag_value)
            continue

        # ax it!
        LOG.warn('Removing this snapshot ' + snapshot_id + ' from ' + region +
                 ' since snapshot_id ' + snapshotid_tag_value +
                 ' was not found in ' + region_tag_value)
        utils.delete_snapshot(snapshot_id, region)

    # 3. evaluate snapshots that should be copied from this region, if dest not found, copy and tag
    for snapshot in found_snapshots.get('replication_dst_region', []):
        snapshot_id = snapshot['SnapshotId']
        snapshot_description = snapshot['Description']

        # stop entirely if the Lambda is close to timing out
        if timeout_check(context, 'perform_replication'):
            break

        if snapshot_id in ignore_ids:
            continue

        # in-flight or failed snapshots can't be copied yet
        if snapshot['State'] in ['pending', 'error']:
            LOG.warn('Skip copying this snapshot ' + snapshot_id + ' due to ' +
                     snapshot['State'] + ' state: ' + snapshot_description)
            continue

        LOG.info('Working on copying this snapshot ' + snapshot_id +
                 ' (if needed): ' + snapshot_description)

        # what region should this be mapped to?
        tag_pairs = snapshot.get('Tags', [])
        region_tag_pair = [
            x for x in tag_pairs
            if x.get('Key', None) == 'replication_dst_region'
        ]
        region_tag_value = region_tag_pair[0].get('Value')

        # does it already exist in the target region?
        ec2_destination = boto3.client('ec2', region_name=region_tag_value)
        found_replicas = ec2_destination.describe_snapshots(Filters=[
            # came from our region originally
            {
                'Name': 'tag:replication_src_region',
                'Values': [region]
            },
            # came from our snapshot originally
            {
                'Name': 'tag:replication_snapshot_id',
                'Values': [snapshot_id]
            }
        ])
        num_found = len(found_replicas.get('Snapshots', []))
        if num_found > 0:
            LOG.info('Not creating more snapshots, since snapshot_id ' +
                     snapshot_id + ' was already found in ' + region_tag_value)
            continue

        # we need to make one in the target region
        LOG.warn('Creating a new snapshot, since snapshot_id ' + snapshot_id +
                 ' was not already found in ' + region_tag_value)
        utils.copy_snapshot_and_tag(context, region, region_tag_value,
                                    snapshot_id, snapshot_description)
def perform_replication(context, region, installed_region='us-east-1'):
    """Check the region and instance, and see if we should clean or create copies.

    Cache-based variant: first builds per-region snapshot-id caches (1a/1b)
    so the later cleanup (2) and copy (3) passes can test membership locally
    instead of calling describe_snapshots once per snapshot.

    :param context: Lambda context object, used only for timeout checks
    :param region: AWS region this invocation is operating on
    :param installed_region: region holding the DynamoDB configuration table
    """
    LOG.info('Performing snapshot replication in region %s', region)

    # TL;DR -- always try to clean up first, before making new copies.

    # build a list of ignore IDs, just in case they are relevant here
    configurations = dynamo.list_configurations(context, installed_region)
    ignore_ids = utils.build_ignore_list(configurations)
    LOG.debug('Fetched all configured ignored IDs rules from DynamoDB')

    # 1. collect snapshots from this region
    snap_cached_src_regions = []
    snap_cached_dst_regions = []
    src_snap_list = []
    replication_snap_list = []
    relevant_tags = ['replication_src_region', 'replication_dst_region']
    found_snapshots = utils.build_replication_cache(
        context, relevant_tags, configurations, region, installed_region
    )

    # 1a. build snapshot cache from all source regions
    for snapshot_regions in found_snapshots.get('replication_src_region', []):
        # what region did this come from?
        tag_pairs = snapshot_regions.get('Tags', [])
        region_tag_pair = [x for x in tag_pairs
                           if x.get('Key') == 'replication_src_region']
        region_tag_value = region_tag_pair[0].get('Value')

        if region_tag_value not in snap_cached_src_regions:
            LOG.info('Caching snapshots in source region: %s', region_tag_value)
            snap_cached_src_regions.append(region_tag_value)
            ec2_source = boto3.client('ec2', region_name=region_tag_value)
            try:
                response = ec2_source.describe_snapshots(
                    Filters=[{'Name': 'tag:replication_dst_region',
                              'Values': [region]}]
                )
                mysnaps = response['Snapshots']
            except Exception as err:
                if 'InvalidSnapshot.NotFound' in str(err):
                    # BUGFIX: was `{'Snapshots', []}` -- a two-element set
                    # literal; iterating it below would crash on
                    # snap['SnapshotId']. Use an empty list to match the
                    # success path's response['Snapshots'].
                    mysnaps = []
                else:
                    raise err

            for snap in mysnaps:
                src_snap_list.append(snap['SnapshotId'])

            LOG.info('Caching completed for source region: ' +
                     region_tag_value + ': cache size: ' +
                     str(len(src_snap_list)))
            sleep(1)  # help w/ API limits

    # 1b. build snapshot cache for all destination regions
    for snapshot_regions in found_snapshots.get('replication_dst_region', []):
        # which region is destination
        tag_pairs = snapshot_regions.get('Tags', [])
        region_tag_pair = [x for x in tag_pairs
                           if x.get('Key') == 'replication_dst_region']
        region_tag_value = region_tag_pair[0].get('Value')

        if region_tag_value not in snap_cached_dst_regions:
            LOG.info('Caching snapshots in destination region: %s',
                     region_tag_value)
            snap_cached_dst_regions.append(region_tag_value)
            ec2_source = boto3.client('ec2', region_name=region_tag_value)
            try:
                response = ec2_source.describe_snapshots(
                    Filters=[{'Name': 'tag:replication_src_region',
                              'Values': [region]}]
                )
                mysnaps = response['Snapshots']
            except Exception as err:
                if 'InvalidSnapshot.NotFound' in str(err):
                    # BUGFIX: same set-literal bug as in 1a
                    mysnaps = []
                else:
                    raise err

            for snap in mysnaps:
                # BUGFIX: snap['Tags'] raised KeyError on untagged snapshots
                for tags in snap.get('Tags', []):
                    if tags["Key"] == 'replication_snapshot_id':
                        replication_snap_list.append(tags["Value"])

            LOG.info('Caching completed for destination region: ' +
                     region_tag_value + ': cache size: ' +
                     str(len(replication_snap_list)))
            sleep(1)  # help w/ API limits

    # 2. evaluate snapshots that were copied to this region, if source not found, delete
    for snapshot in found_snapshots.get('replication_src_region', []):
        snapshot_id = snapshot['SnapshotId']
        snapshot_description = snapshot['Description']

        if timeout_check(context, 'perform_replication'):
            break

        if snapshot_id in ignore_ids:
            continue

        if snapshot['State'] in ['pending', 'error']:
            LOG.warn('Skip cleaning up this snapshot ' + snapshot_id +
                     ' due to ' + snapshot['State'] + ' state: ' +
                     snapshot_description)
            continue

        LOG.info('Working on cleaning up this snapshot ' + snapshot_id +
                 ' (if needed): ' + snapshot_description)

        # what region did this come from?
        tag_pairs = snapshot.get('Tags', [])
        region_tag_pair = [x for x in tag_pairs
                           if x.get('Key') == 'replication_src_region']
        region_tag_value = region_tag_pair[0].get('Value')

        # what snapshot id did this come from?
        snapshotid_tag_pair = [x for x in tag_pairs
                               if x.get('Key') == 'replication_snapshot_id']
        snapshotid_tag_value = snapshotid_tag_pair[0].get('Value')

        # original still present in the source region's cache -> keep copy
        if snapshotid_tag_value in src_snap_list:
            LOG.info('Not removing this snapshot ' + snapshot_id + ' from ' +
                     region + ' since snapshot_id ' + snapshotid_tag_value +
                     ' was found in ' + region_tag_value)
            continue

        # ax it!
        LOG.warn('Removing this snapshot ' + snapshot_id + ' from ' + region +
                 ' since snapshot_id ' + snapshotid_tag_value +
                 ' was not found in ' + region_tag_value)
        utils.delete_snapshot(snapshot_id, region)
        sleep(2)  # help w/ API limits

    # 3. evaluate snapshots that should be copied from this region, if dest not found, copy and tag
    for snapshot in found_snapshots.get('replication_dst_region', []):
        snapshot_id = snapshot['SnapshotId']
        snapshot_description = snapshot['Description']

        if timeout_check(context, 'perform_replication'):
            break

        if snapshot_id in ignore_ids:
            continue

        if snapshot['State'] in ['pending', 'error']:
            LOG.warn('Skip copying this snapshot ' + snapshot_id + ' due to ' +
                     snapshot['State'] + ' state: ' + snapshot_description)
            continue

        LOG.info('Working on copying this snapshot ' + snapshot_id +
                 ' (if needed): ' + snapshot_description)

        # what region should this be mapped to?
        tag_pairs = snapshot.get('Tags', [])
        region_tag_pair = [x for x in tag_pairs
                           if x.get('Key') == 'replication_dst_region']
        region_tag_value = region_tag_pair[0].get('Value')

        name_tag_pair = [x for x in tag_pairs if x.get('Key') == 'Name']
        # Adds empty dictionary to list in event no Name tag is present,
        # so [0].get('Value') yields None instead of IndexError
        name_tag_pair.append({})
        name_tag_value = name_tag_pair[0].get('Value')

        # does it already exist in the target region?
        if snapshot_id in replication_snap_list:
            LOG.info('Not creating more snapshots, since snapshot_id ' +
                     snapshot_id + ' was already found in ' + region_tag_value)
            continue

        # we need to make one in the target region
        LOG.warn('Creating a new snapshot, since snapshot_id ' + snapshot_id +
                 ' was not already found in ' + region_tag_value)
        utils.copy_snapshot_and_tag(
            context, region, region_tag_value, name_tag_value,
            snapshot_id, snapshot_description)
def build_cache_maps(context, configurations, region, installed_region):
    """Build a giant cache of instances, volumes, snapshots for region.

    :param context: Lambda context object, used only for timeout checks
    :param configurations: configuration rules fetched from DynamoDB
    :param region: AWS region to inspect
    :param installed_region: unused here; kept for interface parity
    :returns: dict of six lookup maps (instance data/config, volume->instance,
              snapshot data, per-volume snapshot count and most recent date)
    """
    LOG.info("Building cache of instance, volume, and snapshots in %s", region)
    LOG.info("This may take a while...")
    cache_data = {
        # calculated here locally
        'instance_id_to_data': {},
        'instance_id_to_config': {},
        'volume_id_to_instance_id': {},

        # calculated w/ multiprocessing module
        'snapshot_id_to_data': {},
        'volume_id_to_snapshot_count': {},
        'volume_id_to_most_recent_snapshot_date': {},
    }

    # build an EC2 client, we're going to need it
    ec2 = boto3.client('ec2', region_name=region)

    if len(configurations) <= 0:
        LOG.info('No configurations found in %s, not building cache', region)
        return cache_data

    # populate them
    LOG.info("Retrieved %s DynamoDB configurations for caching",
             str(len(configurations)))

    # build a list of any IDs (anywhere) that we should ignore
    ignore_ids = build_ignore_list(configurations)

    for config in configurations:
        # stop if we're running out of time
        if ebs_snapper.timeout_check(context, 'build_cache_maps'):
            break

        # if it's missing the match section, ignore it
        if not validate_snapshot_settings(config):
            continue

        # build a boto3 filter to describe instances with
        configuration_matches = config['match']
        filters = convert_configurations_to_boto_filter(configuration_matches)

        # if we ended up with no boto3 filters, we bail so we don't snapshot everything
        if len(filters) <= 0:
            LOG.warn('Could not convert configuration match to a filter: %s',
                     configuration_matches)
            continue

        filters.append({
            'Name': 'instance-state-name',
            'Values': ['running', 'stopped']
        })
        instances = ec2.describe_instances(Filters=filters)
        res_list = instances.get('Reservations', [])
        random.shuffle(res_list)  # attempt to randomize order, for timeouts

        for reservation in res_list:
            inst_list = reservation.get('Instances', [])
            random.shuffle(
                inst_list)  # attempt to randomize order, for timeouts

            for instance_data in inst_list:
                instance_id = instance_data['InstanceId']

                # skip if we're ignoring this
                if instance_id in ignore_ids:
                    continue

                cache_data['instance_id_to_config'][instance_id] = config
                cache_data['instance_id_to_data'][instance_id] = instance_data
                for dev in instance_data.get('BlockDeviceMappings', []):
                    vid = dev['Ebs']['VolumeId']

                    # skip if we're ignoring this
                    if vid in ignore_ids:
                        continue

                    cache_data['volume_id_to_instance_id'][vid] = instance_id

    LOG.info("Retrieved %s instances for caching",
             str(len(cache_data['instance_id_to_data'].keys())))

    # look at each volume, get snapshots and count / most recent, and map to instance
    # BUGFIX: was `.keys()[:]`, which raises TypeError on Python 3 where
    # dict.keys() returns a non-sliceable view; list() copies on both 2 and 3
    process_volumes = list(cache_data['volume_id_to_instance_id'].keys())
    LOG.info("Retrieved %s volumes for caching", str(len(process_volumes)))

    # split the volumes into chunks of 25 for the worker threads
    chunked_work = []
    while len(process_volumes) > 0:
        popped = process_volumes[:25]
        del process_volumes[:25]
        chunked_work.append(popped)

    LOG.debug('Split out volume work into %s lists, pulling snapshots...',
              str(len(chunked_work)))

    if len(chunked_work) > 0:
        f = functools.partial(chunk_volume_work, region)
        pool = ThreadPool(processes=4)
        results = pool.map(f, chunked_work)
        pool.close()
        pool.join()

        # merge each worker's partial maps back into the shared cache
        keys = [
            'volume_id_to_most_recent_snapshot_date',
            'volume_id_to_snapshot_count', 'snapshot_id_to_data'
        ]
        for result_chunk in results:
            for k in keys:
                cache_data[k].update(result_chunk[k])

    LOG.info("Retrieved %s snapshots for caching",
             str(len(cache_data['snapshot_id_to_data'])))

    return cache_data